%Aigaion2 BibTeX export from Idiap Publications
%Thursday 17 April 2025 10:36:01 PM

% Journal version (Speech Communication, vol. 84, 2016) of the Idiap research
% report Cernak_Idiap-RR-07-2016. The report is linked through `xref` rather
% than `crossref` so that report-only fields (number, month, type,
% institution, keywords) are not inherited by this article entry.
% The doi field holds the bare DOI; styles add the resolver prefix themselves.
@ARTICLE{Asaei_SPECOM_2016,
         author = {Cernak, Milos and Asaei, Afsaneh and Bourlard, Herv{\'{e}}},
       projects = {Idiap, PHASER 200021-153507},
          title = {On Structured Sparsity of Phonological Posteriors for Linguistic Parsing},
        journal = {Speech Communication},
         volume = {84},
           year = {2016},
          pages = {36--45},
            url = {http://www.sciencedirect.com/science/article/pii/S0167639316300152},
            doi = {10.1016/j.specom.2016.08.004},
           xref = {Cernak_Idiap-RR-07-2016},
       abstract = {The speech signal conveys information on different time scales from
short (20--40 ms) time scale or segmental, associated to phonological
and phonetic information to long (150--250 ms) time scale or supra
segmental, associated to syllabic and prosodic information. Linguistic
and neurocognitive studies recognize the \emph{phonological} classes
at segmental level as the essential and invariant representations used
in speech temporal organization.

In the context of speech processing, a deep neural network (DNN) is an
effective computational method to infer the probability of individual
phonological classes from a short segment of speech signal. A vector
of all phonological class probabilities is referred to as
\emph{phonological posterior}. There are only very few classes
comprising a short term speech signal; hence, the phonological
posterior is a sparse vector. Although the phonological posteriors are
estimated at segmental level, we claim that they convey
supra-segmental information. Specifically, we demonstrate that
phonological posteriors are indicative of syllabic and prosodic
events.

Building on findings from converging linguistic evidence on the
gestural model of Articulatory Phonology as well as the neural basis
of speech perception, we hypothesize that phonological posteriors
convey properties of linguistic classes at multiple time scales, and
this information is embedded in their support (index) of active
coefficients. To verify this hypothesis, we obtain a binary
representation of phonological posteriors at the segmental level which
is referred to as first-order sparsity structure; the high-order
structures are obtained by the concatenation of first-order binary
vectors. It is then confirmed that the classification of
supra-segmental linguistic events, the problem known as
\emph{linguistic parsing}, can be achieved with high accuracy using a
simple binary pattern matching of first-order or high-order
structures.},
            pdf = {https://publications.idiap.ch/attachments/papers/2016/Asaei_SPECOM_2016.pdf}
}



%Cross-referenced publications:
% Idiap research report Idiap-RR-07-2016 (April 2016); the url field points to
% the arXiv copy of the report. The month is given as the standard BibTeX
% macro so that styles can localise it, and the abstract uses ASCII `--` for
% number ranges so the entry stays safe for 8-bit BibTeX.
@TECHREPORT{Cernak_Idiap-RR-07-2016,
         author = {Cernak, Milos and Asaei, Afsaneh and Bourlard, Herv{\'{e}}},
       keywords = {Binary pattern matching, Deep neural network (DNN), Linguistic parsing, phonological posteriors, Structured sparse representation},
       projects = {Idiap},
          month = apr,
          title = {On Structured Sparsity of Phonological Posteriors for Linguistic Parsing},
           type = {Idiap-RR},
         number = {Idiap-RR-07-2016},
           year = {2016},
    institution = {Idiap},
            url = {http://arxiv.org/abs/1601.05647},
       abstract = {The speech signal conveys information on different time scales from short (20--40 ms) time scale or
segmental, associated to phonological and phonetic information to long (150--250 ms) time scale or supra
segmental, associated to syllabic and prosodic information. Linguistic and neurocognitive studies recognize
the phonological classes at segmental level as the essential and invariant representations used in speech
temporal organization.
In the context of speech processing, a deep neural network (DNN) is an effective computational method
to infer the probability of individual phonological classes from a short segment of speech signal. A vector of
all phonological class probabilities is referred to as phonological posterior. There are only very few classes
comprising a short term speech signal; hence, the phonological posterior is a sparse vector. Although
the phonological posteriors are estimated at segmental level, we claim that they convey supra-segmental
information. Namely, we demonstrate that phonological posteriors are indicative of syllabic and prosodic
events.
Building on findings from converging linguistic evidence on the gestural model of Articulatory Phonology
as well as neural basis of speech perception, we hypothesize that phonological posteriors convey properties
of linguistic classes at multiple time scales, and this information is embedded in their support (index) of
active coefficients. To verify this hypothesis, we obtain a binary representation of phonological posteriors at
segmental level which is referred to as first-order sparsity structure; the high-order structures are obtained
by concatenation of first-order binary vectors. It is then confirmed that classification of supra-segmental
linguistic events, the problem known as linguistic parsing, can be achieved with high accuracy using a simple
binary pattern matching of first-order or high-order structures.},
            pdf = {https://publications.idiap.ch/attachments/reports/2016/Cernak_Idiap-RR-07-2016.pdf}
}