% Aigaion2 BibTeX export from Idiap Publications
% Wednesday 19 February 2025 03:36:27 AM
%
% Conference paper (ICASSP 2025) on selecting pseudo-labelled speech data for
% ASR fine-tuning. `projects` and `pdf` are non-standard fields carried over
% from the exporter; standard styles ignore them.
%
% NOTE(review): the author entered as "S, Karthik Pandia D" is kept exactly as
% exported; it looks like an initial was parsed as the family name -- confirm
% the intended family/given split against the published paper.

@inproceedings{Rangappa_ICASSP2025_2025,
  author    = {Rangappa, Pradeep and Zuluaga-Gomez, Juan and Madikeri, Srikanth and Carofilis, Andr{\'{e}}s and Prakash, Jeena and Burdisso, Sergio and Kumar, Shashi and Villatoro-Tello, Esa{\'{u}} and Nigmatulina, Iuliia and Motlicek, Petr and S, Karthik Pandia D and Ganapathiraju, Aravind},
  title     = {Speech Data Selection for Efficient {ASR} Fine-Tuning using Domain Classifier and Pseudo-Label Filtering},
  booktitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} 2025)},
  year      = {2025},
  month     = apr,
  projects  = {UNIPHORE, ELOQUENCE},
  abstract  = {In real-world speech data processing, the scarcity of annotated data and the abundance of unlabelled speech data present a significant challenge. To address this, we propose an efficient data selection pipeline for fine-tuning ASR models by generating pseudo-labels using WhisperX pipeline and selecting efficient labels for fine-tuning. In our work, we propose a domain classifier system developed with a computationally inexpensive TFIDF and classical machine learning algorithm. Later, we filter data from the classifier output using a novel metric that assesses word ratio and perplexity distribution. The filtered pseudo labels are then used for fine-tuning standard encoder-decoder Whisper models and Zipformer. Our proposed data selection pipeline reduces the dataset size by approximately 1/100th while maintaining performance comparable to the full dataset, outperforming random domain-independent selection strategies.},
  pdf       = {https://publications.idiap.ch/attachments/papers/2025/Rangappa_ICASSP2025_2025.pdf},
}