%Aigaion2 BibTeX export from Idiap Publications
%Monday 27 April 2026 03:36:17 PM
@phdthesis{Iuliia_THESIS_2025,
  author   = {Thorbecke, Iuliia},
  title    = {Contextualisation of {Automatic Speech Recognition} and Related Applications},
  school   = {University of Zurich, Faculty of Arts},
  address  = {CH-8044 Z{\"u}rich, Switzerland},
  month    = mar,
  year     = {2026},
  doi      = {10.5167/uzh-293071},
  url      = {https://www.zora.uzh.ch/browse/author?startsWith=University%20of%20Zurich},
  pdf      = {https://publications.idiap.ch/attachments/papers/2026/Iuliia_THESIS_2025.pdf},
  keywords = {Automatic Speech Recognition, named entity detection, Natural language processing, speech processing},
  projects = {Idiap, HAAWAII},
  mainresearchprogram = {Human-AI Teaming},
  additionalresearchprograms = {AI for Everyone},
  abstract = {Automatic Speech Recognition (ASR) systems have made remarkable progress in recent years, largely driven by advances in deep learning and the availability of large-scale training datasets. These improvements have led to significant reductions in Word Error Rate (WER), particularly in general-purpose, in-domain tasks. However, real-world applications often demand more than overall accuracy - they require precise recognition of specific, context-dependent information such as domain-specific terms, critical word sequences, or named entities (NEs). These are the very areas where high-performing ASR systems tend to underperform, especially in the absence of explicit contextual support.
This thesis addresses the problem of contextual ASR, with a specific focus on text-only contextual data and scenarios where the base ASR model cannot be retrained. We aim to improve both general ASR performance and the accurate recognition of key target entities, namely (1) n-grams that are more likely within a specific context and (2) NEs, through context-aware biasing strategies. These strategies are evaluated across different ASR paradigms, including hybrid and End-to-End models, and in varied configurations (e.g., offline/online ASR, semi-supervised learning, streaming, and domain adaptation).
A central contribution of this work is a comprehensive evaluation and analysis of contextual integration methods, offering practical insights into when and how different techniques can be effectively employed. For hybrid (pipeline) ASR, we investigate three context injection methods: lattice-level biasing, decoding graph modification, and grammar FST augmentation with context-specific entities. We demonstrate the effectiveness of these methods in improving recognition not only for frequent terms but also for rare and out-of-vocabulary (OOV) words. Furthermore, we introduce a GPU-efficient algorithm for real-time dynamic contextual biasing. This method adjusts decoding graph arc weights using input keywords and n-grams, without generating lattices, enabling rapid adaptation and closer integration of inference and decoding processes.
In the domain of End-to-End ASR, we propose an efficient approach for integrating word-level n-gram language models (LMs) using the Aho-Corasick (AC) algorithm. This enables the fusion of keyword biasing and n-gram LM adaptation into a single, unified context trie. Compared to standard shallow fusion (SF) with neural network LMs, our method achieves faster decoding with competitive WER and real-time factor (RTF) performance. Evaluations on four languages and datasets confirm improvements in both NE recognition and general ASR accuracy, including for OOV terms. We further show that SF remains effective even when biasing entities are generated automatically or when distractors are present, and we benchmark multiple SF strategies across languages and data domains. In addition, we explore dynamic contextualisation via SF as a complement to static encoder conditioning, and present early results on domain adaptation using text-only and pseudo-audio data.
We extend the investigation to ASR-related tasks, including Named Entity Recognition (NER) from speech and semi-supervised learning (SSL). We develop a two-step biasing method combining ASR and NLP modules: first boosting NER via FST biasing, then correcting erroneous predictions with NLP post-processing. This approach is shown to work effectively in both cascaded ASR-NER pipelines and End-to-End multitask models. In SSL, we propose a method to incorporate contextual knowledge into iterative model training. Pseudo-labeled lattices from untranscribed data are rescored using context-aware techniques, improving key information recognition and outperforming traditional SSL methods in unseen domains.
To address the challenge of training streaming ASR models with minimal supervision, we introduce a framework that uses noisy pseudo-labels from foundational models. Transformer-Transducer architectures trained from scratch on this data show promising results, particularly when combined with shallow fusion using n-gram LMs and extracted NEs.
Finally, a significant portion of this research is conducted using ATC data, where radar surveillance information serves as context. Our contributions in this domain demonstrate the potential of ASR contextualisation to enhance communication reliability and efficiency in high-stakes environments. We consider our work to be a meaningful step toward the development of ASR-assisted ATC systems capable of supporting domain-critical tasks and mitigating the risks associated with miscommunication.},
}