%Aigaion2 BibTeX export from Idiap Publications
%Thursday 04 December 2025 06:19:21 PM

% Conference paper entry. Author names use the "Last, First" form; acronyms in
% title/booktitle are brace-protected so sentence-casing styles keep their capitals.
% projects, mainresearchprogram, abstract and pdf are non-standard fields that
% the standard BibTeX styles ignore.
@INPROCEEDINGS{Rangappa_ICASSP2025_2025,
                      author = {Rangappa, Pradeep and Zuluaga-Gomez, Juan and Madikeri, Srikanth and Carofilis, Andr{\'{e}}s and Prakash, Jeena and Burdisso, Sergio and Kumar, Shashi and Villatoro-Tello, Esa{\'{u}} and Nigmatulina, Iuliia and Motlicek, Petr and S, Karthik Pandia D and Ganapathiraju, Aravind},
                    projects = {UNIPHORE, ELOQUENCE},
         mainresearchprogram = {Human-AI Teaming},
                       month = apr,
                       title = {Speech Data Selection for Efficient {ASR} Fine-Tuning using Domain Classifier and Pseudo-Label Filtering},
                   booktitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} 2025)},
                        year = {2025},
                         url = {https://ieeexplore.ieee.org/document/10888138},
                         doi = {10.1109/ICASSP49660.2025.10888138},
                    abstract = {In real-world speech data processing, the scarcity of annotated data and the abundance of unlabelled speech data present a significant challenge. To address this, we propose an efficient data selection pipeline for fine-tuning ASR models by generating pseudo-labels using WhisperX pipeline and selecting efficient labels for fine-tuning. In our work, we propose a domain classifier system developed with a computationally inexpensive TFIDF and classical machine learning algorithm. Later, we filter data from the classifier output using a novel metric that assesses word ratio and perplexity distribution. The filtered pseudo labels are then used for fine-tuning standard encoder-decoder Whisper models and Zipformer. Our proposed data selection pipeline reduces the dataset size by approximately 1/100th while maintaining performance comparable to the full dataset, outperforming random domain-independent selection strategies.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2025/Rangappa_ICASSP2025_2025.pdf}
}