%Aigaion2 BibTeX export from Idiap Publications
%Thursday 21 November 2024 04:49:25 PM

% Conference paper: Diffusion Transformer acoustic model for adaptive
% text-to-speech (ISCA Speech Synthesis Workshop 12, August 2023).
% The entry begins on its own line because some parsers treat a top-level
% percent sign as a comment that runs to the end of the line, which would
% hide an entry placed after it on the same line.
% Acronyms and the architecture name are brace-protected so that styles
% applying sentence casing keep their capitalisation.
% The keywords, projects, abstract and pdf fields are kept exactly as exported.
@inproceedings{Chen_SSW12_2023,
  author    = {Chen, Haolin and Garner, Philip N.},
  title     = {{Diffusion Transformer} for Adaptive Text-to-Speech},
  booktitle = {Proc.\ 12th {ISCA} Speech Synthesis Workshop ({SSW} 12)},
  year      = {2023},
  month     = aug,
  doi       = {10.21437/SSW.2023-25},
  keywords  = {adaptive layer norm, adaptive TTS, diffusion transformer, speech synthesis},
  projects  = {Idiap, NAST},
  abstract  = {Given the success of diffusion in synthesizing realistic speech, we investigate how diffusion can be included in adaptive text-to-speech systems. Inspired by the adaptable layer norm modules for Transformer, we adapt a new backbone of diffusion models, Diffusion Transformer, for acoustic modeling. Specifically, the adaptive layer norm in the architecture is used to condition the diffusion network on text representations, which further enables parameter-efficient adaptation. We show the new architecture to be a faster alternative to its convolutional counterpart for general text-to-speech, while demonstrating a clear advantage on naturalness and similarity over the Transformer for few-shot and few-parameter adaptation. In the zero-shot scenario, while the new backbone is a decent alternative, the main benefit of such an architecture is to enable high-quality parameter-efficient adaptation when finetuning is performed.},
  pdf       = {https://publications.idiap.ch/attachments/papers/2023/Chen_SSW12_2023.pdf},
}