%Aigaion2 BibTeX export from Idiap Publications
%Thursday 02 January 2025 04:29:00 PM

% PhD thesis record. It cross-references the Idiap research-report edition
% below. The explicit `type` field keeps the thesis label: without it, classic
% BibTeX would inherit type = Idiap-RR from the cross-referenced report and
% standard styles would print that instead of "PhD thesis".
@phdthesis{misrPhDThesis06,
  author        = {Misra, Hemant},
  projects      = {Idiap},
  month         = mar,
  title         = {Multi-stream Processing for Noise Robust Speech Recognition},
  type          = {{PhD} thesis},
  year          = {2006},
  school        = {{\'{E}}cole Polytechnique F{\'{e}}d{\'{e}}rale de Lausanne},
  address       = {Lausanne, Switzerland},
  note          = {IDIAP-RR 2006 28},
  crossref      = {misra-rr-06-28},
  abstract      = {In this thesis, the framework of multi-stream combination has been explored to improve the noise robustness of automatic speech recognition (ASR) systems. The central idea of multi-stream ASR is to combine information from several sources to improve the performance of a system. The two important issues of multi-stream systems are which information sources (feature representations) to combine and what importance (weights) be given to each information source. In the framework of hybrid hidden Markov model/artificial neural network (HMM/ANN) and Tandem systems, several weighting strategies are investigated in this thesis to merge the posterior outputs of multi-layered perceptrons (MLPs) trained on different feature representations. The best results were obtained by inverse entropy weighting in which the posterior estimates at the output of the MLPs were weighted by their respective inverse output entropies. In the second part of this thesis, two feature representations have been investigated, namely pitch frequency and spectral entropy features. The pitch frequency feature is used along with perceptual linear prediction (PLP) features in a multi-stream framework. The second feature proposed in this thesis is estimated by applying an entropy function to the normalized spectrum to produce a measure which has been termed spectral entropy. The idea of the spectral entropy feature is extended to multi-band spectral entropy features by dividing the normalized full-band spectrum into sub-bands and estimating the spectral entropy of each sub-band. The proposed multi-band spectral entropy features were observed to be robust in high noise conditions. Subsequently, the idea of embedded training is extended to multi-stream HMM/ANN systems. To evaluate the maximum performance that can be achieved by frame-level weighting, we investigated an ``oracle test''. We also studied the relationship of oracle selection to inverse entropy weighting and proposed an alternative interpretation of the oracle test to analyze the complementarity of streams in multi-stream systems. The techniques investigated in this work gave a significant improvement in performance for clean as well as noisy test conditions.},
  pdf           = {https://publications.idiap.ch/attachments/reports/2006/rr06-28.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-28.ps.gz},
  ipdinar       = {2006},
  ipdmembership = {speech},
  language      = {English},
}

% crossreferenced publications:
% Idiap research-report edition of the same work. It is the crossref target of
% misrPhDThesis06 and has to stay after that entry, because classic BibTeX
% requires a cross-referenced parent to follow the entries that refer to it.
@techreport{misra-rr-06-28,
  author        = {Misra, Hemant},
  projects      = {Idiap},
  title         = {Multi-stream Processing for Noise Robust Speech Recognition},
  type          = {Idiap-RR},
  number        = {Idiap-RR-28-2006},
  year          = {2006},
  institution   = {IDIAP},
  address       = {Martigny, Switzerland},
  note          = {Thesis \# 3508},
  abstract      = {In this thesis, the framework of multi-stream combination has been explored to improve the noise robustness of automatic speech recognition (ASR) systems. The central idea of multi-stream ASR is to combine information from several sources to improve the performance of a system. The two important issues of multi-stream systems are which information sources (feature representations) to combine and what importance (weights) be given to each information source. In the framework of hybrid hidden Markov model/artificial neural network (HMM/ANN) and Tandem systems, several weighting strategies are investigated in this thesis to merge the posterior outputs of multi-layered perceptrons (MLPs) trained on different feature representations. The best results were obtained by inverse entropy weighting in which the posterior estimates at the output of the MLPs were weighted by their respective inverse output entropies. In the second part of this thesis, two feature representations have been investigated, namely pitch frequency and spectral entropy features. The pitch frequency feature is used along with perceptual linear prediction (PLP) features in a multi-stream framework. The second feature proposed in this thesis is estimated by applying an entropy function to the normalized spectrum to produce a measure which has been termed spectral entropy. The idea of the spectral entropy feature is extended to multi-band spectral entropy features by dividing the normalized full-band spectrum into sub-bands and estimating the spectral entropy of each sub-band. The proposed multi-band spectral entropy features were observed to be robust in high noise conditions. Subsequently, the idea of embedded training is extended to multi-stream HMM/ANN systems. To evaluate the maximum performance that can be achieved by frame-level weighting, we investigated an ``oracle test''. We also studied the relationship of oracle selection to inverse entropy weighting and proposed an alternative interpretation of the oracle test to analyze the complementarity of streams in multi-stream systems. The techniques investigated in this work gave a significant improvement in performance for clean as well as noisy test conditions.},
  pdf           = {https://publications.idiap.ch/attachments/reports/2006/rr06-28.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-28.ps.gz},
  ipdinar       = {2006},
  ipdmembership = {speech},
  language      = {English},
}