<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">CONF</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Dubagunta_ICASSP_2019/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Segment-level training of ANNs based on acoustic confidence measures for hybrid HMM/ANN Speech Recognition</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Dubagunta, S. Pavankumar</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Magimai-Doss, Mathew</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">confidence measures</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">local posterior probability</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">segment-level training</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">speech recognition</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/papers/2019/Dubagunta_ICASSP_2019.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="711" ind1="2" ind2=" ">
			<subfield code="a">Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2019</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">We show that confidence measures estimated from local posterior probabilities can serve as objective functions for training ANNs in hybrid HMM based speech recognition systems. This leads to a segment-level training paradigm that overcomes the limitation of frame-level updates ignoring the sequence structure in speech. We propose measures that train at the state and phone segment levels, while still decoding in the conventional framework. Experimental results on multiple corpora show that such trainings not only yield better systems in terms of performance, but also give additional improvements with sequence discriminative training. These techniques generalise across front-ends and model architectures, and efficiently handle the effect of segment duration variations on the ANN training.</subfield>
		</datafield>
	</record>
</collection>