%Aigaion2 BibTeX export from Idiap Publications
%Saturday 11 May 2024 05:49:25 PM

@inproceedings{Juan_EMNLP_2023,
          author = {Zuluaga-Gomez, Juan and Huang, Zhaocheng and Niu, Xing and Srinivasan, Sundararajan and Mathur, Prashant and Thompson, Brian and Federico, Marcello},
        projects = {Idiap},
           month = dec,
           title = {End-to-End Single-Channel Speaker-Turn Aware Conversational Speech Translation},
       booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
            year = {2023},
        location = {Singapore},
          eprint = {2311.00697},
   archiveprefix = {arXiv},
             url = {https://arxiv.org/abs/2311.00697},
        abstract = {Conventional speech-to-text translation (ST) systems are trained on single-speaker utterances, and they may not generalize to real-life scenarios where the audio contains conversations by multiple speakers. In this paper, we tackle single-channel multi-speaker conversational ST with an end-to-end and multi-task training model, named Speaker-Turn Aware Conversational Speech Translation, that combines automatic speech recognition, speech translation and speaker turn detection using special tokens in a serialized labeling format. We run experiments on the Fisher-CALLHOME corpus, which we adapted by merging the two single-speaker channels into one multi-speaker channel, thus representing the more realistic and challenging scenario with multi-speaker turns and cross-talk. Experimental results across single- and multi-speaker conditions and against conventional ST systems, show that our model outperforms the reference systems on the multi-speaker condition, while attaining comparable performance on the single-speaker condition. We release scripts for data processing and model training.},
             pdf = {https://publications.idiap.ch/attachments/papers/2023/Juan_EMNLP_2023.pdf}
}