<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<!-- One MARCXML bibliographic record describing the report Idiap-RR-09-2025.
	     Field meanings below follow MARC 21 where the tag is standard; 9XX tags
	     are locally defined, so their notes are review assumptions. -->
	<record>
		<!-- 980 (local field): value "REPORT".
		     NOTE(review): presumably the document type / collection used by the
		     importing system; confirm against its field mapping. -->
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">REPORT</subfield>
		</datafield>
		<!-- 970 (local field): report key followed by "/IDIAP".
		     NOTE(review): looks like a source-system identifier; confirm. -->
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Rangappa_Idiap-RR-09-2025/IDIAP</subfield>
		</datafield>
		<!-- 245 $a: title of the work. -->
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Enhancing Speaker Diarization using Correlation-Based Clustering Initialization</subfield>
		</datafield>
		<!-- 700 $a: personal names, one datafield per author, written as
		     "Surname, Forename". This record lists every author in 700 and has
		     no 100 (main entry) field. -->
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Rangappa, Pradeep</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Prasad, Amrutha</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Madikeri, Srikanth</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Motlicek, Petr</subfield>
		</datafield>
		<!-- 653 $a: uncontrolled index terms (free keywords), one per datafield.
		     ind1="1" is the MARC 21 "primary" level for the term. -->
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">DISPLACE-2</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">ECAPA-TDNN embedding</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">local speaker segmentation</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">Speaker Diarization</subfield>
		</datafield>
		<!-- 856: electronic location of the full text. ind1="4" = HTTP access,
		     ind2="0" = the link is the resource itself; $u holds the URL.
		     NOTE(review): $i "EXTERNAL" and $x "PUBLIC" appear to be flags read
		     by the importing system (link kind / visibility); confirm. -->
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/reports/2025/Rangappa_Idiap-RR-09-2025.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<!-- 088 $a: report number. -->
		<datafield tag="088" ind1=" " ind2=" ">
			<subfield code="a">Idiap-RR-09-2025</subfield>
		</datafield>
		<!-- 260: publication statement; $c = year of publication, $b = publisher.
		     The subfields are emitted here as $c then $b. -->
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2025</subfield>
			<subfield code="b">Idiap</subfield>
		</datafield>
		<!-- 771 $d: carries the month and year as free text.
		     NOTE(review): ind1="2" and a date-only 771 are not standard MARC 21
		     usage; presumably a convention of the target repository; confirm
		     before changing. -->
		<datafield tag="771" ind1="2" ind2=" ">
			<subfield code="d">August 2025</subfield>
		</datafield>
		<!-- 520 $a: abstract. Kept on a single line because whitespace inside
		     the subfield text is part of the data. -->
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">Speaker diarization becomes challenging in multilingual and code-switched speech due to frequent speaker changes and acoustic variability. While PyAnnote achieves state-of-the-art performance on standard benchmarks, its effectiveness drops on complex datasets like DISPLACE-2. To address this issue, we propose to improve the performance of the global agglomerative clustering by improving the input embeddings. Specifically, we enhance the embeddings by analyzing their pairwise correlations and averaging highly correlated embeddings. This approach improves speaker representation for highly correlated embeddings while reducing speaker confusion and improving clustering accuracy. Evaluated on DISPLACE-2 Track-1 (multilingual speaker diarization), our method shows a 3% relative DER improvement over the baseline, and 8% when combined with segmentation fine-tuning. Notably, the approach reduces DER in rapid turn-taking and language transition regions, improving robustness in code-mixed speech.</subfield>
		</datafield>
	</record>
</collection>