<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">REPORT</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Potard_Idiap-RR-02-2015/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Preliminary Work on Speaker Adaptation for DNN-Based Speech Synthesis</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Potard, Blaise</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Motlicek, Petr</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Imseng, David</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">deep neural networks</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">fmllr</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">neural network</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">speaker adaptation</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">speech synthesis</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">text-to-speech</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/reports/2014/Potard_Idiap-RR-02-2015.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="088" ind1=" " ind2=" ">
			<subfield code="a">Idiap-RR-02-2015</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2015</subfield>
			<subfield code="b">Idiap</subfield>
		</datafield>
		<datafield tag="771" ind1="2" ind2=" ">
			<subfield code="d">January 2015</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">We investigate speaker adaptation in the context of deep neural network (DNN) based speech synthesis.
More specifically, our current work focuses on the exploitation of auxiliary information such as 
gender, speaker identity or age during the DNN training process.
The proposed technique is compared to standard acoustic feature
transformations such as the feature based maximum likelihood linear
regression (FMLLR) based speaker adaptation.
Objective error measurements as well as perceptual experiments, performed on the WSJCAM0 database, suggest that
the proposed method is superior to standard feature transformations.</subfield>
		</datafield>
	</record>
</collection>