%Aigaion2 BibTeX export from Idiap Publications
%Wednesday 19 February 2025 03:36:27 AM

% ICASSP 2025 conference paper on speech data selection for ASR fine-tuning.
% Author names are stored in "Last, First" form; acronyms in title/booktitle are
% brace-protected so sentence-casing styles do not lowercase them.
% NOTE(review): "S, Karthik Pandia D" is parsed as surname "S" -- confirm the
% intended surname with the author before changing it.
@INPROCEEDINGS{Rangappa_ICASSP2025_2025,
         author = {Rangappa, Pradeep and Zuluaga-Gomez, Juan and Madikeri, Srikanth and Carofilis, Andr{\'{e}}s and Prakash, Jeena and Burdisso, Sergio and Kumar, Shashi and Villatoro-Tello, Esa{\'{u}} and Nigmatulina, Iuliia and Motlicek, Petr and S, Karthik Pandia D and Ganapathiraju, Aravind},
       projects = {UNIPHORE, ELOQUENCE},
          month = apr,
          title = {Speech Data Selection for Efficient {ASR} Fine-Tuning using Domain Classifier and Pseudo-Label Filtering},
      booktitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} 2025)},
           year = {2025},
       abstract = {In real-world speech data processing, the scarcity of annotated data and the abundance of unlabelled speech data present a significant challenge. To address this, we propose an efficient data selection pipeline for fine-tuning ASR models by generating pseudo-labels using WhisperX pipeline and selecting efficient labels for fine-tuning. In our work, we propose a domain classifier system developed with a computationally inexpensive TFIDF and classical machine learning algorithm. Later, we filter data from the classifier output using a novel metric that assesses word ratio and perplexity distribution. The filtered pseudo labels are then used for fine-tuning standard encoder-decoder Whisper models and Zipformer. Our proposed data selection pipeline reduces the dataset size by approximately 1/100th while maintaining performance comparable to the full dataset, outperforming random domain-independent selection strategies.},
            pdf = {https://publications.idiap.ch/attachments/papers/2025/Rangappa_ICASSP2025_2025.pdf}
}