%Aigaion2 BibTeX export from Idiap Publications
%Saturday 21 December 2024 06:06:20 PM

@INPROCEEDINGS{Vlasenko_MUSE'21_2021,
         author = {Vlasenko, Bogdan and Prasad, RaviShankar and Magimai.-Doss, Mathew},
       projects = {Idiap},
          title = {Fusion of Acoustic and Linguistic Information Using Supervised Autoencoder for Improved Emotion Recognition},
      booktitle = {2nd Multimodal Sentiment Analysis Challenge (MuSe '21), October 24, 2021, Virtual Event, China},
           year = {2021},
            doi = {10.1145/3475957.3484448},
        abstract = {Automatic recognition of human emotion has a wide range of applications and continues to attract increasing attention. Expressions of human emotion can be identified across different modalities of communication, such as speech, text, and facial expressions. The ‘Multimodal Sentiment Analysis in Real-life Media’ (MuSe) 2021 challenge provides an environment for developing new techniques to recognize human emotions or sentiments from multiple modalities (audio, video, and text) over in-the-wild data. The challenge encourages participants to jointly model information across the audio, video, and text modalities to improve emotion recognition. The present paper describes our approach to the MuSe-Sent task of the challenge. The goal of this sub-challenge is to perform turn-level prediction of emotions along the arousal and valence dimensions.
In the paper, we investigate different approaches to optimally fuse linguistic and acoustic information for emotion recognition systems. The proposed systems employ features derived from these modalities and use different deep learning architectures to explore their cross-dependencies. A wide range of acoustic and linguistic features provided by the organizers, as well as the recently established wav2vec 2.0 acoustic embeddings, are used to model the inherent emotions. We compare the discriminative characteristics of hand-crafted and data-driven acoustic features in the context of emotion classification along the arousal and valence dimensions. Ensemble-based classifiers were compared with an advanced supervised autoencoder (SAE) technique with Bayesian-optimizer-based hyperparameter tuning. A comparison of uni- and bi-modal classification techniques showed that joint modeling of acoustic and linguistic cues can improve classification performance over the individual modalities. Experimental results show that the proposed fusion of acoustic and text-based information improves over the baseline system on the test set evaluation.},
            pdf = {https://publications.idiap.ch/attachments/papers/2022/Vlasenko_MUSE21_2021.pdf}
}