<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<!-- Bibliographic record for a conference paper. Tag notes below follow the MARC 21 bibliographic format unless marked as local usage. -->
		<!-- 980 $a: local 9XX field; presumably the document type / collection code used by the ingesting system (TODO confirm). -->
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">CONF</subfield>
		</datafield>
		<!-- 970 $a: local 9XX field; presumably the source-system record identifier with an "/IDIAP" origin suffix (TODO confirm). -->
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Cernak_ICASSP15_2015/IDIAP</subfield>
		</datafield>
		<!-- 245 $a: title. -->
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Phonological Vocoding Using Artificial Neural Networks</subfield>
		</datafield>
		<!-- 700 $a: personal names, one datafield per author. This record has no 100 main entry; every author is a 700 added entry. -->
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Cernak, Milos</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Potard, Blaise</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Garner, Philip N.</subfield>
		</datafield>
		<!-- 856 (ind1=4: HTTP, ind2=0: the resource itself): link to the paper PDF. $u is the URI and $x a nonpublic note. $i is not a current standard 856 subfield; presumably a local flag telling the importer the file is hosted externally (TODO confirm). -->
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/papers/2015/Cernak_ICASSP15_2015.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<!-- 856 (ind2 blank): related-document link; $z is the public note. The URI ends in Cernak_Idiap-RR-04-2015, the identifier of the report record in this collection. -->
		<datafield tag="856" ind1="4" ind2=" ">
			<subfield code="u">http://publications.idiap.ch/index.php/publications/showcite/Cernak_Idiap-RR-04-2015</subfield>
			<subfield code="z">Related documents</subfield>
		</datafield>
		<!-- 711 (ind1=2: name in direct order): meeting name in $a, meeting location in $c. -->
		<datafield tag="711" ind1="2" ind2=" ">
			<subfield code="a">IEEE 40th International Conference on Acoustics, Speech and Signal Processing (ICASSP)</subfield>
			<subfield code="c">Brisbane, Australia</subfield>
		</datafield>
		<!-- 260: publication information; $c is the date and $b the publisher name. -->
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2015</subfield>
			<subfield code="b">IEEE</subfield>
		</datafield>
		<!-- 773: host item entry. $c is not a standard 773 subfield; presumably the page range within the proceedings (local usage, TODO confirm). -->
		<datafield tag="773" ind1=" " ind2=" ">
			<subfield code="c">4844-4848</subfield>
		</datafield>
		<!-- 024 (ind1=7: identifier source is named in $2): the DOI, with $2 naming the scheme. -->
		<datafield tag="024" ind1="7" ind2=" ">
			<subfield code="a">10.1109/ICASSP.2015.7178891</subfield>
			<subfield code="2">doi</subfield>
		</datafield>
		<!-- 520 $a: abstract. The line breaks are part of the text value, so the continuation lines are deliberately not indented. -->
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">We investigate a vocoder based on artificial neural networks using a
phonological speech representation. Speech decomposition is based on
the phonological encoders, realised as neural network classifiers,
that are trained for a particular language. The speech reconstruction
process involves using a Deep Neural Network (DNN) to map phonological
features posteriors to speech parameters -- line spectra and glottal
signal parameters -- followed by LPC resynthesis. This DNN is trained
on a target voice without transcriptions, in a semi-supervised
manner. Both encoder and decoder are based on neural networks and
thus the vocoding is achieved using a simple fast forward pass. An
experiment with French vocoding and a target male voice trained on 21
hour long audio book is presented. An application of the phonological
vocoder to low bit rate speech coding is shown, where transmitted
phonological posteriors are pruned and quantized. The vocoder with
scalar quantization operates at 1 kbps, with potential for lower
bit-rate.</subfield>
		</datafield>
	</record>
	<record>
		<!-- Bibliographic record for a research report. Tag notes below follow the MARC 21 bibliographic format unless marked as local usage. -->
		<!-- 980 $a: local 9XX field; presumably the document type / collection code used by the ingesting system (TODO confirm). -->
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">REPORT</subfield>
		</datafield>
		<!-- 970 $a: local 9XX field; presumably the source-system record identifier with an "/IDIAP" origin suffix (TODO confirm). -->
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Cernak_Idiap-RR-04-2015/IDIAP</subfield>
		</datafield>
		<!-- 245 $a: title. -->
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Phonological vocoding using artificial neural networks</subfield>
		</datafield>
		<!-- 700 $a: personal names, one datafield per author. This record has no 100 main entry; every author is a 700 added entry. -->
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Cernak, Milos</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Potard, Blaise</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Garner, Philip N.</subfield>
		</datafield>
		<!-- 653 (ind1=1: primary-level term): uncontrolled index terms (keywords), one datafield per keyword. -->
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">low bit rate speech coding</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">Parametric vocoding</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">phonology</subfield>
		</datafield>
		<!-- 856 (ind1=4: HTTP, ind2=0: the resource itself): link to the report PDF. $u is the URI and $x a nonpublic note. $i is not a current standard 856 subfield; presumably a local flag telling the importer the file is hosted externally (TODO confirm). -->
		<!-- NOTE(review): the URI path contains "reports/2014/" while the report number (088) and dates (260, 771) say 2015; confirm the link resolves before changing anything. -->
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/reports/2014/Cernak_Idiap-RR-04-2015.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<!-- 088 $a: report number. -->
		<datafield tag="088" ind1=" " ind2=" ">
			<subfield code="a">Idiap-RR-04-2015</subfield>
		</datafield>
		<!-- 260: publication information; $c is the date and $b the publisher name. -->
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2015</subfield>
			<subfield code="b">Idiap</subfield>
		</datafield>
		<!-- NOTE(review): in MARC 21, 771 is the supplement parent entry and ind1="2" is not a defined value; here $d holds a free-text month and year, presumably a local convention for the report issue date. Verify against the ingesting system. -->
		<datafield tag="771" ind1="2" ind2=" ">
			<subfield code="d">February 2015</subfield>
		</datafield>
		<!-- 520 $a: abstract. The line breaks are part of the text value, so the continuation lines are deliberately not indented. -->
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">We investigate a vocoder based on artificial neural networks using a
phonological speech representation. Speech decomposition is based on
the phonological encoders, realised as neural network classifiers,
that are trained for a particular language. The speech reconstruction
process involves using a Deep Neural Network (DNN) to map phonological
features posteriors to speech parameters -- line spectra and glottal
signal parameters -- followed by LPC resynthesis. This DNN is trained
on a target voice without transcriptions, in a semi-supervised
manner. Both encoder and decoder are based on neural networks and
thus the vocoding is achieved using a simple fast forward pass. An
experiment with French vocoding and a target male voice trained on 21
hour long audio book is presented. An application of the phonological
vocoder to low bit rate speech coding is shown, where transmitted
phonological posteriors are pruned and quantized. The vocoder with
scalar quantization operates at 1 kbps, with potential for lower
bit-rate.</subfield>
		</datafield>
	</record>
</collection>