%Aigaion2 BibTeX export from Idiap Publications
%Thursday 17 April 2025 09:28:06 PM

% Thesis (No 4164) issued as Idiap research report Idiap-RR-67-2008.
% Cleaned up from the raw export: repaired the mis-encoded capital E-acute in
% the school name, removed the ",','," escaping artifacts that had replaced
% closing parentheses in `note` and `abstract`, and switched `month` to the
% standard three-letter macro.
@TECHREPORT{Aradilla_Idiap-RR-67-2008,
         author = {Aradilla, Guillermo},
       projects = {Idiap},
          month = sep,
          title = {Acoustic Models for Posterior Features in Speech Recognition},
           type = {Idiap-RR},
         number = {Idiap-RR-67-2008},
           year = {2008},
    institution = {Idiap},
           note = {Thesis No 4164 (2008), {\'{E}}cole polytechnique F{\'{e}}d{\'{e}}rale de Lausanne},
       abstract = {In this thesis, we investigate the use of posterior probabilities of sub-word units directly as input
features for automatic speech recognition (ASR). These posteriors, estimated from data-driven
methods, display some favourable properties such as increased speaker invariance, but unlike conventional
speech features also hold some peculiarities, such that their components are non-negative
and sum up to one. State-of-the-art acoustic models for ASR rely on general-purpose similarity
measures like Euclidean-based distances or likelihoods computed from Gaussian mixture models
(GMMs), hence, they do not explicitly take into account the particular properties of posterior-based
speech features. We explore here the use of the Kullback-Leibler (KL) divergence as similarity
measure in both non-parametric methods using templates and parametric models that rely on an
architecture based on hidden Markov models (HMMs).
Traditionally, template matching (TM)-based ASR uses cepstral features and requires a large
number of templates to capture the natural variability of spoken language. Thus, TM-based approaches
are generally oriented to speaker-dependent and small vocabulary recognition tasks. In
our work, we use posterior features to represent the templates and test utterances. Given the discriminative
nature of posterior features, we show that a limited number of templates can accurately
characterize a word. Experiments on different databases show that using KL divergence as local
similarity measure yields significantly better performance than traditional TM-based approaches.
The entropy of posterior features can also be used to further improve the results.
In the context of HMMs, we propose a novel acoustic model where each state is parameterized
by a reference multinomial distribution and the state score is based on the KL divergence between
the reference distribution and the posterior features. Besides the fact that the KL divergence is
a natural dissimilarity measure between posterior distributions, we further motivate the use of
the KL divergence by showing that the proposed model can be interpreted in terms of maximum
likelihood and information theoretic clustering. Furthermore, the KL-based acoustic model can
be seen as a general case of other known acoustic models for posterior features such as hybrid
HMM/MLP and discrete HMM. The presented approach has been extended to large vocabulary
recognition tasks. When compared to state-of-the-art HMM/GMM, the KL-based acoustic model
yields comparable results while using significantly fewer parameters.},
            pdf = {https://publications.idiap.ch/attachments/reports/2008/Aradilla_Idiap-RR-67-2008.pdf}
}