%Aigaion2 BibTeX export from Idiap Publications
%Friday 05 December 2025 05:03:34 PM

@PHDTHESIS{Fehr_THESIS_2025,
                      author = {Fehr, Fabio},
                    projects = {Idiap, EVOLANG},
                       month = sep,
                       title = {Nonparametric Variational Information Bottleneck: Attention-based Architectures as Latent Variable Models},
                        year = {2025},
                      school = {EPFL},
                         url = {https://infoscience.epfl.ch/handle/20.500.14299/253760},
                    abstract = {Transformers have achieved remarkable success across modalities including text, graphs, speech, and vision, enabled by the attention mechanism. Yet the inductive biases that shape how attention encodes information and supports generalisation are still not well understood. Latent variable models offer a principled framework for explaining the encoded information, improving generalisation through regularisation, and enabling generative modelling. However, applying latent variable models to attention-based architectures is challenging, as attention functions over sets that are both variable in size and permutation-invariant. This thesis introduces the Nonparametric Variational Information Bottleneck (NVIB), a deep latent variable framework that models attention as posterior inference over a Dirichlet process mixture, aligning naturally with these set-based properties. We show that NVIB enables training a novel Transformer-based variational autoencoder from scratch, sparsifying the number of embeddings while regularising their content. As a generative model, it supports smooth interpolation and sampling within variable-sized latent spaces. When applied across stacked self-attention layers, NVIB induces hierarchical abstraction, improving interpretability, robustness, and linguistic alignment. This framework allows pretrained Transformers to be reinterpreted as nonparametric variational models. NVIB reveals how they encode and separate reliable from unreliable information, enabling a novel and controllable post-training regularisation that improves out-of-distribution generalisation. Finally, NVIB boosts out-of-distribution performance during fine-tuning on speech, text, graph, and vision benchmarks, confirming its effectiveness in inducing generalisable representations across diverse models and tasks. Overall, the thesis offers a variational Bayesian perspective on attention, unifying regularisation, explanation, and generation, and opening new paths for advancing representation learning.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2025/Fehr_THESIS_2025.pdf}
}