%Aigaion2 BibTeX export from Idiap Publications
%Thursday 21 November 2024 01:05:43 PM
%
% Idiap research report Idiap-RR-10-2019 (technical report entry).
% Notes for maintainers:
%  - month uses the predefined three-letter macro (sep) instead of a braced
%    number, so each bibliography style can render the month name itself.
%  - The acronym DNN is brace-protected in the title so that styles applying
%    sentence casing do not lowercase it.
%  - editor, projects and pdf are kept exactly as exported; standard styles
%    ignore fields they do not know for this entry type.
%  - The abstract text is reproduced verbatim from the export.

@techreport{Sarfjoo_Idiap-RR-10-2019,
  author      = {Sarfjoo, Seyyed Saeed and Magimai.-Doss, Mathew and Marcel, S{\'{e}}bastien},
  editor      = {Sarfjoo, Seyyed Saeed},
  title       = {Domain Adaptation and Investigation of Robustness of {DNN}-based Embeddings for Text-Independent Speaker Verification Using Dilated Residual Networks},
  type        = {Idiap-RR},
  number      = {Idiap-RR-10-2019},
  institution = {Idiap},
  address     = {Centre du Parc, Rue Marconi 19, P.O. Box 592, CH - 1920 Martigny},
  month       = sep,
  year        = {2019},
  projects    = {Tesla, UNITS},
  abstract    = {Robustness of extracted embeddings in cross-database scenarios is one of the main challenges in text-independent speaker verification (SV) systems. In this paper, we investigate this robustness via performing structural cross-database experiments with or without additive noise. This noise can be added from the seen set, where the noise type is similar to the noise which is used in data augmentation for training the SV model, or unseen set, where distribution of additive noise in train and evaluation sets are different. For extracting the robust embeddings, we investigate applying the time dilation in the ResNet architecture, so-called dilated residual network (DRN). Dimension and number of segment level layers are tuned in this architecture. The proposed model with time dilation significantly outperformed the ResNet model and is comparable with the state-of-the-art SV systems on Voxceleb1 dataset. In addition, this architecture showed significant robustness in out of domain scenarios. Language mismatch is part of domain mismatch which recently is one of the main focuses of research in SV systems. Similar to image recognition field, we hypothesize that low-level convolutional neural network (CNN) layers are domain-specific features while high-level CNN layers are domain-independent and have more discriminative power.
For adapting these domain-specific units, combination of triplet and intra-class losses are investigated. The adapted model on the evaluation part of the CMN2 dataset, relatively outperformed the DRN and x-vector SV systems without adaptation with 8.0 and 20.5 \%, respectively in equal error-rate.},
  pdf         = {https://publications.idiap.ch/attachments/reports/2019/Sarfjoo_Idiap-RR-10-2019.pdf},
}