%Aigaion2 BibTeX export from Idiap Publications
%Thursday 04 December 2025 05:31:46 PM

% Conference paper (ICASSP 2024, IEEE). The fields projects, mainresearchprogram,
% additionalresearchprograms and pdf are not standard BibTeX fields; standard
% styles ignore unknown fields, so they are kept verbatim as exported.
% The citation key is kept unchanged so existing citations keep resolving.
@INPROCEEDINGS{VILLATORO-TELLO_ICASSP'24_2023,
                      author = {Villatoro-Tello, Esa{\'{u}} and Madikeri, Srikanth and Sharma, Bidisha and Khalil, Driss and Kumar, Shashi and Nigmatulina, Iuliia and Motlicek, Petr and Ganapathiraju, Aravind},
                    keywords = {Cross-modal Alignment, Intent Classification, knowledge distillation, Spoken Language Understanding, Word-Confusion-Networks},
                    projects = {UNIPHORE},
         mainresearchprogram = {Sustainable & Resilient Societies},
  additionalresearchprograms = {AI for Everyone},
                       month = apr,
                       title = {Probability-Aware Word-Confusion-Network-to-Text Alignment Approach for Intent Classification},
                   booktitle = {Proceedings of the 49th {IEEE} International Conference on Acoustics, Speech, \& Signal Processing ({ICASSP}) 2024},
                        year = {2024},
                       pages = {12617--12621},
                   publisher = {IEEE},
                    location = {Seoul, Republic of Korea},
                         url = {https://ieeexplore.ieee.org/document/10445934},
                         doi = {10.1109/ICASSP48485.2024.10445934},
                    abstract = {Spoken Language Understanding (SLU) technologies have seen a big improvement due to the effective pretraining of speech representations. A common requirement of industry-based solutions is the portability to deploy SLU models in voice-assistant devices. Thus, distilling knowledge from large text-based language models has become an attractive solution for achieving good performance and guaranteeing portability. In this paper, we introduce a novel architecture that uses a cross-modal attention mechanism to extract bin-level contextual embeddings from a word-confusion network (WCN) encoding such that these can be directly compared and aligned with traditional text-based contextual embeddings. This alignment is achieved using a recently proposed tokenwise contrastive loss function. We validated our architecture's effectiveness by fine-tuning our WCN-based pretrained model to perform intent classification on the SLURP dataset. Obtained accuracy (81\%), depicts a 9.4\% relative improvement compared to a recent and equivalent E2E method.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2024/VILLATORO-TELLO_ICASSP24_2023.pdf}
}