%Aigaion2 BibTeX export from Idiap Publications
%Thursday 21 November 2024 04:23:05 PM
@techreport{Kumar_Idiap-RR-08-2024,
  author      = {Kumar, Shashi and Madikeri, Srikanth and Zuluaga-Gomez, Juan and Villatoro-Tello, Esa{\'u} and Nigmatulina, Iuliia and Motlicek, Petr and E, Manjunath K and Ganapathiraju, Aravind},
  title       = {{XLSR}-Transducer: Streaming {ASR} for Self-Supervised Pretrained Models},
  type        = {Idiap-RR},
  number      = {Idiap-RR-08-2024},
  institution = {Idiap},
  month       = aug,
  year        = {2024},
  eprint      = {2407.04439},
  eprinttype  = {arXiv},
  url         = {https://arxiv.org/abs/2407.04439},
  abstract    = {Self-supervised pretrained models exhibit competitive performance in automatic speech recognition on finetuning, even with limited in-domain supervised data for training. However, popular pretrained models are not suitable for streaming ASR because they are trained with full attention context. In this paper, we introduce XLSR-Transducer, where the XLSR-53 model is used as encoder in transducer setup. Our experiments on the AMI dataset reveal that the XLSR-Transducer achieves 4\% absolute WER improvement over Whisper large-v2 and 8\% over a Zipformer transducer model trained from scratch. To enable streaming capabilities, we investigate different attention masking patterns in the self-attention computation of transformer layers within the XLSR-53 model. We validate XLSR-Transducer on AMI and 5 languages from CommonVoice under low-resource scenarios. Finally, with the introduction of attention sinks, we reduce the left context by half while achieving a relative 12\% improvement in WER.},
  pdf         = {https://publications.idiap.ch/attachments/reports/2024/Kumar_Idiap-RR-08-2024.pdf},
  internal-note = {Author "E, Manjunath K" kept as exported (initial-style surname "E" appears intentional); name order unverified against the arXiv listing},
}