<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">CONF</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Rangappa_ICASSP2025_2025/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Speech Data Selection for Efficient ASR Fine-Tuning using Domain Classifier and Pseudo-Label Filtering</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Rangappa, Pradeep</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Zuluaga-Gomez, Juan</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Madikeri, Srikanth</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Carofilis, Andrés</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Prakash, Jeena</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Burdisso, Sergio</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Kumar, Shashi</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Villatoro-Tello, Esaú</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Iuliia, Nigmatulina</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Motlicek, Petr</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">S, Karthik Pandia D</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Ganapathiraju, Aravind</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/papers/2025/Rangappa_ICASSP2025_2025.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="711" ind1="2" ind2=" ">
			<subfield code="a">2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2025</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2=" ">
			<subfield code="u">https://ieeexplore.ieee.org/document/10888138</subfield>
			<subfield code="z">URL</subfield>
		</datafield>
		<datafield tag="024" ind1="7" ind2=" ">
			<subfield code="a">10.1109/ICASSP49660.2025.10888138</subfield>
			<subfield code="2">doi</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">In real-world speech data processing, the scarcity of annotated data and the abundance of unlabelled speech data present a significant challenge. To address this, we propose an efficient data selection pipeline for fine-tuning ASR models by generating pseudo-labels using WhisperX pipeline and selecting efficient labels for fine-tuning. In our work, we propose a domain classifier system developed with a computationally inexpensive TFIDF and classical machine learning algorithm. Later, we filter data from the classifier output using a novel metric that assesses word ratio and perplexity distribution. The filtered pseudo labels are then used for fine-tuning standard encoder- decoder Whisper models and Zipformer. Our proposed data selection pipeline reduces the dataset size by approximately 1/100th while maintaining performance comparable to the full dataset, outperforming random domain-independent selection strategies.</subfield>
		</datafield>
	</record>
</collection>