%Aigaion2 BibTeX export from Idiap Publications
%Thursday 04 December 2025 10:12:32 PM
@techreport{Sarkar_Idiap-RR-07-2025,
  author        = {Sarkar, Eklavya and Tarigopula, Neha},
  projects      = {Idiap},
  month         = aug,
  title         = {Tokenwise Contrastive Speech and Text Pre-Training for Speech Emotion Recognition},
  type          = {Idiap-RR},
  number        = {Idiap-RR-07-2025},
  year          = {2025},
  institution   = {Idiap},
  abstract      = {Human emotion recognition involves either decomposing audio signals to reflect the emotion or processing the corresponding text to extract the semantic meaning behind it. In this study, we explore the task of multi-modal emotion recognition by enriching acoustic representations with semantic meaning from the corresponding textual transcript. We use a pre-training strategy to learn the multi-modal representations via contrastive learning of token-by-token alignment of Whisper (speech) and BERT (text) representations using the LibriSpeech dataset. The aligned multi-modal features are then used for training an emotion classifier on IEMOCAP and EmoDB datasets. Despite the multi-modal representations outperforming the BERT-only uni-modal baselines, our results indicate a marginal underperformance compared to the Whisper-only uni-modal model, suggesting that leveraging additional textual information during pre-training might not necessarily improve representations for a downstream emotion recognition task.},
  pdf           = {https://publications.idiap.ch/attachments/reports/2024/Sarkar_Idiap-RR-07-2025.pdf},
  internal-note = {review: pdf URL path says "2024" but year is 2025 -- confirm the link resolves to the right report},
}