%Aigaion2 BibTeX export from Idiap Publications
%Saturday 07 March 2026 07:12:07 AM

% Conference paper (SESAR Innovation Days 2025).
% All author names are given in the unambiguous "Last, First" form.
% The conference website is stored in the non-standard `eventurl` field so
% that `booktitle` holds only the proceedings name.
@INPROCEEDINGS{Motlicek_SIDS2025_2025,
                      author = {Motlicek, Petr and Kumar, Shashi and Khalil, Driss and Prasad, Amrutha and Sch{\"{u}}pbach, Christof},
                    keywords = {Air traffic control, Automatic Speech Recognition, semi-supervised learning},
                    projects = {Idiap, armasuisse},
         mainresearchprogram = {Human-AI Teaming},
                       month = dec,
                       title = {Leveraging Untranscribed Data for End-to-End Speech and Callsign Recognition in Air-Traffic Communication},
                   booktitle = {SESAR Innovation Days 2025},
                    eventurl = {https://www.sesarju.eu/SIDS2025},
                        year = {2025},
                    location = {Bled, Slovenia},
                organization = {Eurocontrol},
                         url = {https://www.sesarju.eu/sites/default/files/documents/sid/2025/papers/SIDs_2025_paper_103-final.pdf},
                    abstract = {Accurate Automatic Speech Recognition (ASR) and callsign recognition in Air Traffic Control (ATC) are vital for safety, yet conventional two-step systems rely on large amounts of manually transcribed data, which is both costly and limited. This paper introduces a practical alternative using TokenVerse, a unified end-to-end model trained under a dual-task framework and enhanced through semi-supervised learning. Our main contribution shows that the model can jointly learn callsign boundaries and speech recognition, improving performance on both tasks simultaneously. Additionally, by generating pseudo-labels for 500 hours of unlabeled audio, we substantially expand the effective training data.
Experiments across multiple in-domain and out-of-domain ATC datasets demonstrate that the TokenVerse framework achieves state-of-the-art performance in both ASR and callsign detection, surpassing cascaded pipelines built on modern architectures (including Kaldi, XLSR/wav2vec 2.0, Zipformer, and Whisper). This work provides a robust and scalable foundation for deploying and continuously refining high-accuracy ATC systems in real-world settings where labeled data is inherently scarce. The end-to-end architecture is also relatively compact (approximately 317M parameters), making it well suited for real-time, low-latency deployment.}
}