%Aigaion2 BibTeX export from Idiap Publications
%Friday 05 December 2025 11:24:15 AM

@INPROCEEDINGS{Kumar_ICASSP2025_2025,
                      author = {Kumar, Shashi and Madikeri, Srikanth and Zuluaga-Gomez, Juan and Villatoro-Tello, Esa{\'{u}} and Thorbecke, Iuliia and Motlicek, Petr and E, Manjunath K and Ganapathiraju, Aravind},
                    keywords = {self-supervised learning, streaming ASR, transformer transducer, XLSR},
                    projects = {UNIPHORE, ELOQUENCE},
         mainresearchprogram = {Human-AI Teaming},
  additionalresearchprograms = {AI for Everyone},
                       month = apr,
                       title = {XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models},
                   booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
                        year = {2025},
                   publisher = {IEEE},
                    location = {Hyderabad, India},
                        issn = {2379-190X},
                        isbn = {979-8-3503-6874-1},
                         url = {https://ieeexplore.ieee.org/document/10888110},
                          doi = {10.1109/ICASSP49660.2025.10888110},
                    crossref = {Kumar_Idiap-RR-08-2024},
                     abstract = {Self-supervised pretrained models exhibit competitive performance in automatic speech recognition (ASR) upon fine-tuning, even with limited in-domain supervised data. However, popular pretrained models are not suitable for streaming ASR because they are trained with full attention context. In this paper, we introduce the XLSR-Transducer, in which the XLSR-53 model is used as the encoder in a transducer setup. Our experiments on the AMI dataset reveal that the XLSR-Transducer achieves a 4\% absolute WER improvement over Whisper large-v2 and 8\% over a Zipformer transducer model trained from scratch. To enable streaming capabilities, we investigate different attention masking patterns in the self-attention computation of the transformer layers within the XLSR-53 model. We validate the XLSR-Transducer on AMI and on 5 languages from CommonVoice under low-resource scenarios. Finally, with the introduction of attention sinks, we reduce the left context by half while achieving a relative 12\% improvement in WER.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2025/Kumar_ICASSP2025_2025.pdf}
}
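
% A minimal sketch, assuming PyTorch, of the chunked streaming attention mask with
% "attention sink" frames described in the abstract above. The function name and the
% chunk_size / left_chunks / num_sink_frames parameters are illustrative assumptions,
% not the authors' implementation.

import torch

def streaming_attention_mask(num_frames: int,
                             chunk_size: int = 16,
                             left_chunks: int = 2,
                             num_sink_frames: int = 4) -> torch.Tensor:
    """Boolean mask of shape (num_frames, num_frames): mask[i, j] is True
    where query frame i is allowed to attend to key frame j."""
    idx = torch.arange(num_frames)
    q_chunk = idx.unsqueeze(1) // chunk_size  # chunk index of each query frame
    k_chunk = idx.unsqueeze(0) // chunk_size  # chunk index of each key frame

    # Allow the current chunk plus a limited number of left-context chunks.
    within_left_context = (k_chunk <= q_chunk) & (k_chunk >= q_chunk - left_chunks)

    # Attention sinks: the first few frames stay visible to every query,
    # which is what lets the left context be shortened.
    sink = idx.unsqueeze(0) < num_sink_frames

    return within_left_context | sink

# Usage: additive bias for scaled dot-product attention (0 where allowed, -inf elsewhere).
mask = streaming_attention_mask(num_frames=64)
attn_bias = torch.zeros(mask.shape).masked_fill(~mask, float("-inf"))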



crossreferenced publications: 
@TECHREPORT{Kumar_Idiap-RR-08-2024,
                      author = {Kumar, Shashi and Madikeri, Srikanth and Zuluaga-Gomez, Juan and Villatoro-Tello, Esa{\'{u}} and Nigmatulina, Iuliia and Motlicek, Petr and E, Manjunath K and Ganapathiraju, Aravind},
                    projects = {Idiap},
         mainresearchprogram = {Human-AI Teaming},
  additionalresearchprograms = {AI for Everyone},
                        month = aug,
                       title = {XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models},
                        type = {Idiap-RR},
                      number = {Idiap-RR-08-2024},
                        year = {2024},
                 institution = {Idiap},
                         url = {https://arxiv.org/abs/2407.04439},
                     abstract = {Self-supervised pretrained models exhibit competitive performance in automatic speech recognition upon fine-tuning, even with limited in-domain supervised data for training. However, popular pretrained models are not suitable for streaming ASR because they are trained with full attention context. In this paper, we introduce the XLSR-Transducer, in which the XLSR-53 model is used as the encoder in a transducer setup. Our experiments on the AMI dataset reveal that the XLSR-Transducer achieves a 4\% absolute WER improvement over Whisper large-v2 and 8\% over a Zipformer transducer model trained from scratch. To enable streaming capabilities, we investigate different attention masking patterns in the self-attention computation of the transformer layers within the XLSR-53 model. We validate the XLSR-Transducer on AMI and on 5 languages from CommonVoice under low-resource scenarios. Finally, with the introduction of attention sinks, we reduce the left context by half while achieving a relative 12\% improvement in WER.},
                         pdf = {https://publications.idiap.ch/attachments/reports/2024/Kumar_Idiap-RR-08-2024.pdf}
}