%Aigaion2 BibTeX export from Idiap Publications
%Saturday 07 March 2026 05:03:55 AM

% PhD thesis entry (EPFL, January 2015) on speaker diarization of meeting recordings.
% `projects`, `abstract` and `pdf` are not standard BibTeX fields; standard styles
% silently ignore unknown fields, so they act as database annotations only.
@phdthesis{Yella_THESIS-2_2015,
  author   = {Yella, Sree Harsha},
  title    = {Speaker diarization of spontaneous meeting room conversations},
  school   = {EPFL},
  address  = {Lausanne},
  year     = {2015},
  month    = jan,
  projects = {Idiap, IM2},
  abstract = {Speaker diarization is the task of identifying ``who spoke when'' in an audio stream containing
    multiple speakers. This is an unsupervised task as there is no a priori information about the
    speakers. Diagnostical studies on state-of-the-art diarization systems have isolated three main
    issues with the systems: overlapping speech, effects of background noise and
    speech/non-speech detection errors on clustering, and significant performance variance between different
    systems. In this thesis we focus on addressing these issues in diarization.
    We propose new features based on structure of a conversation such as silence and speaker
    change statistics for overlap detection. The features are estimated from a long-term context
    (3-4 seconds) and are used to estimate the probability of overlap at a given instant. These
    probabilities are later incorporated into acoustic feature based overlap detector as prior
    probabilities. Experiments on several meeting corpora reveal that overlap detection is improved
    significantly by the proposed method and this consequently reduces the diarization error.
    To address the issues arising from background noise, errors in speech/non-speech detection
    and capture speaker discriminative information in the signal, we propose two methods. In
    the first method, we propose Information Bottleneck with Side Information (IBSI) based
    diarization to suppress artefacts of background noise and non-speech segments introduced into
    clustering. In the second method, we show that the phoneme transcript of a given recording
    carries useful information for speaker diarization. This observation was used in estimation
    of phoneme background model which is used for diarization in Information Bottleneck (IB)
    framework. Both the methods achieve significant reduction in error on various meeting
    corpora.
    We train different artificial neural network (ANN) architectures to extract speaker discriminant
    features and use these features as input to speaker diarization systems. The ANNs are trained
    to perform related tasks such as speaker comparison, speaker classification and auto
    encoding. The bottleneck layer activations from these networks are used as features for speaker
    diarization. Experiments on different meeting corpora revealed that combination of MFCCs
    and ANN features reduces the diarization error.
    To address the issue of performance variations across different systems, we propose feature
    level combination of HMM/GMM and IB diarization systems. The combination does not
    require any changes to the original systems. The output of IB system is used to generate
    features which when combined with MFCCs in a HMM/GMM system reduce diarization error.},
  pdf      = {https://publications.idiap.ch/attachments/papers/2015/Yella_THESIS-2_2015.pdf},
}