%Aigaion2 BibTeX export from Idiap Publications
%Monday 20 January 2025 07:20:50 PM

% Conference paper (Interspeech 2019, pages 2140--2144).
%
% The link to the later Idiap research report is stored in `xref`, not
% `crossref`: `crossref` copies every field missing here from the target
% entry (type, number, institution, address, keywords), which would leak
% tech-report data into this conference citation. `xref` records the
% relation without inheritance (biblatex) and is ignored by classic BibTeX.
%
% NOTE(review): `doi` below holds only a DOI prefix, not a complete DOI;
% the suffix is not available in this export. Look it up in the publisher's
% archive and complete the field before relying on DOI links.
@inproceedings{Viglino_INTERSPEECH_2019,
  author       = {Viglino, Thibault and Motlicek, Petr and Cernak, Milos},
  title        = {End-to-End Accented Speech Recognition},
  booktitle    = {International Conference on Speech and Language Processing, Interspeech},
  organization = {ISCA},
  location     = {Graz, Austria},
  pages        = {2140--2144},
  month        = sep,
  year         = {2019},
  doi          = {10.21437},
  xref         = {Viglino_Idiap-RR-04-2022},
  projects     = {Idiap, CTI-Shaped},
  abstract     = {Correct pronunciation is known to be the most difficult part to acquire for (native or non-native) language learners. The accented speech is thus more variable, and standard Automatic Speech Recognition (ASR) training approaches that rely on intermediate phone alignment might introduce errors during the ASR training. With end-to-end training we could alleviate this problem. In this work, we explore the use of multi-task training and accent embedding in the context of end-to-end ASR trained with the connectionist temporal classification loss. Comparing to the baseline developed using conventional ASR framework exploiting time-delay neural networks trained on accented English, we show significant relative improvement of about 25\% in word error rate. Additional evaluation on unseen accent data yields relative improvements of 31\% and 2\% for New Zealand English and Indian English, respectively.},
  pdf          = {https://publications.idiap.ch/attachments/papers/2022/Viglino_INTERSPEECH_2019.pdf},
}

% Cross-referenced publications:

% Idiap research report Idiap-RR-04-2022, referenced from the conference
% paper above via its `xref` field. `month` uses the standard three-letter
% macro so styles can localise or abbreviate it.
@techreport{Viglino_Idiap-RR-04-2022,
  author      = {Viglino, Thibault and Motlicek, Petr and Cernak, Milos},
  title       = {End-to-end Accented Speech Recognition},
  type        = {Idiap-RR},
  number      = {Idiap-RR-04-2022},
  institution = {Idiap},
  address     = {Rue Marconi 19, Martigny},
  month       = mar,
  year        = {2022},
  keywords    = {accent embedding, Accented speech, end-to-end, multi-task, speech recognition},
  projects    = {Idiap},
  abstract    = {Correct pronunciation is known to be the most difficult part to acquire for (native or non-native) language learners. The accented speech is thus more variable, and standard Automatic Speech Recognition (ASR) training approaches that rely on intermediate phone alignment might introduce errors during the ASR training. With end-to-end training we could alleviate this problem. In this work, we explore the use of multi-task training and accent embedding in the context of end-to-end ASR trained with the connectionist temporal classification loss. Comparing to the baseline developed using conventional ASR framework exploiting time-delay neural networks trained on accented English, we show significant relative improvement of about 25\% in word error rate. Additional evaluation on unseen accent data yields relative improvements of 31\% and 2\% for New Zealand English and Indian English, respectively.},
  pdf         = {https://publications.idiap.ch/attachments/reports/2019/Viglino_Idiap-RR-04-2022.pdf},
}