%Aigaion2 BibTeX export from Idiap Publications
%Friday 05 December 2025 01:33:54 AM

@INPROCEEDINGS{Prasad_ICASSP2024_2024,
                      author = {Prasad, Amrutha and Carofilis, Andr{\'{e}}s and Vanderreydt, Geoffroy and Khalil, Driss and Madikeri, Srikanth and Motlicek, Petr and Sch{\"{u}}pbach, Christof},
                    projects = {Idiap, armasuisse},
         mainresearchprogram = {Sustainable \& Resilient Societies},
  additionalresearchprograms = {AI for Everyone},
                       title = {Fine-tuning Self-Supervised Models For Language Identification Using Orthonormal Constraint},
                    booktitle = {Proceedings of the 49th IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
                        year = {2024},
                        pages = {11921--11925},
                        issn = {2379-190X},
                         doi = {10.1109/ICASSP48485.2024.10446751},
                     abstract = {Self-supervised models trained on data with high linguistic diversity, such as the XLS-R model, can be effectively fine-tuned for the language recognition task. Typically, a back-end classifier followed by a statistics pooling layer is added during training. Commonly used back-end classifiers require a large number of trainable parameters, which is not ideal in limited-data conditions. In this work, we explore back-ends with fewer parameters based on the factorized Time Delay Neural Network (TDNN-F). The TDNN-F architecture is also integrated into Emphasized Channel Attention, Propagation and Aggregation-TDNN (ECAPA-TDNN) models, termed ECAPA-TDNN-F, reducing the number of parameters by 30 to 50\% absolute, with competitive accuracy and no change in minimum cost. The results show that ECAPA-TDNN-F can be extended to tasks where ECAPA-TDNN is suitable. We also test the effectiveness of a linear classifier and a variant, the orthonormal linear classifier, previously used in x-vector-type systems. The models are trained on NIST LRE17 data and evaluated on the NIST LRE17, LRE22, and ATCO2 LID datasets. Both linear classifiers outperform conventional back-ends, with accuracy improvements between 0.9\% and 9.1\%.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2024/Prasad_ICASSP2024_2024.pdf}
}
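
% --- Illustrative sketch (editor's note, not part of the bibliographic record) ---
% A minimal PyTorch sketch of the "orthonormal constraint" named in the title,
% assuming the semi-orthogonal penalty ||W W^T - I||_F^2 familiar from TDNN-F /
% x-vector systems; the paper's exact formulation may differ. All identifiers
% below (OrthonormalLinear, ortho_penalty, beta) and all dimensions are
% illustrative assumptions, not taken from the paper.

import torch
import torch.nn as nn

class OrthonormalLinear(nn.Linear):
    # Linear language-ID head whose weight matrix is pushed toward
    # semi-orthogonality by an auxiliary penalty added to the task loss.
    def ortho_penalty(self) -> torch.Tensor:
        W = self.weight                                   # (n_langs, embed_dim)
        gram = W @ W.t()                                  # (n_langs, n_langs)
        eye = torch.eye(gram.size(0), device=W.device, dtype=W.dtype)
        return ((gram - eye) ** 2).sum()                  # squared Frobenius norm

# Usage on pooled self-supervised embeddings (placeholder dimensions):
head = OrthonormalLinear(in_features=1024, out_features=14)
x = torch.randn(8, 1024)                                  # batch of utterance embeddings
labels = torch.randint(0, 14, (8,))
beta = 0.01                                               # penalty weight (assumption)
loss = nn.functional.cross_entropy(head(x), labels) + beta * head.ortho_penalty()
loss.backward()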