%Aigaion2 BibTeX export from Idiap Publications
%Friday 10 April 2026 01:39:15 PM

@PHDTHESIS{KarimiMahabadi_THESIS_2023,
                      author = {Karimi Mahabadi, Rabeeh},
                    keywords = {adapter, bias-reduction, Few-shot learning, fine-tuning, generalization, low-resource setting, Multi-task learning, parameter-efficient fine-tuning, robustness, transfer learning},
                    projects = {Idiap},
                       month = mar,
                       title = {Improving Generalization of Pretrained Language Models},
                        year = {2023},
                      school = {Ecole polytechnique f{\'{e}}d{\'{e}}rale de Lausanne (EPFL)},
                         url = {https://infoscience.epfl.ch/handle/20.500.14299/196590},
                         doi = {10.5075/epfl-thesis-8664},
                    abstract = {Transfer learning, where a language model is first pre-trained on large-scale unlabeled data followed
by fine-tuning on a downstream task, has emerged as a dominating technique in natural language
processing obtaining the state of the art results on a wide range of tasks. The striking effectiveness
of transfer learning has given rise to a variety of methods and practices. In spite of this rapid progress,
transfer learning and building robust models from these pretrained language models (PLMs) which can
generalize to unseen domains is a multifaceted problem, requiring addressing several open questions
such as a) training models robust to dataset biases, which can generalize better in real-world scenarios b)
reducing over-fitting when fine-tuning PLMs on low-resource setting c) fine-tuning strategies allowing
learning from multiple training resources while being able to generalize to new domains d) efficient
and effective fine-tuning methods for PLMs allowing transfer learning with limited data and to new
domains e) few-shot learning with few labeled examples. In this dissertation, we propose multiple
methods to improve generalization of PLMs from different aspects:
Our first contribution is to propose two learning strategies to train neural models, which are more
robust to dataset biases and transfer better to out-of-domain datasets. We specify the biases in terms of
one or more bias-only models, which learn to leverage the dataset biases. During training, the bias-only
models' predictions are used to adjust the loss of the base model to reduce its reliance on biases by
down-weighting the biased examples and focusing training on the hard examples. Results show that
our debiasing methods greatly improve robustness on several natural language understanding (NLU)
benchmarks and improve transfer learning to other textual entailment datasets.
Our second contribution is to propose an effective regularization method to reduce overfitting when
fine-tuning PLMs on a small number of training data. We leverage Variational Information Bottleneck
(VIB) (Alemi et al., 2017) to suppress irrelevant features when fine-tuning on low-resource target tasks
and show that our method effectively reduces overfitting. Moreover, we show that our VIB model
finds sentence representations that are more robust to biases in natural language inference datasets,
and thereby substantially improves generalization to out-of-domain datasets.
Our third contribution is to develop an effective and parameter-efficient way to fine-tune PLMs in a
multi-task learning setup while allowing generalization to new domains. Our method allows sharing
information across tasks to enable positive transfer to low-resource and related tasks while avoiding
negative task interference. We propose HyperFormer++, which employs a compact hypernetwork
(Ha et al., 2017; Oswald et al., 2020) shared across tasks and layers. The hypernetwork then learns
to generate task and layer-specific adapter parameters, conditioned on task and layer id embeddings in
a transformer model. This parameter-efficient multi-task learning framework allows us to achieve the best of both worlds by sharing knowledge across tasks via hypernetworks while enabling the model
to adapt to each individual task through task-specific adapters. Experiments on the well-known GLUE
benchmark show improved performance in multi-task learning while adding only 0.29\% parameters
per task. We additionally demonstrate substantial performance improvements in few-shot domain
generalization across a variety of tasks.
Our fourth contribution is to propose Compacter, a method for fine-tuning large-scale language
models with a better trade-off between task performance and the number of trainable parameters
than prior work. Compacter accomplishes this by building on top of ideas from adapters, low-rank
optimization (Aghajanyan et al., 2021), and parameterized hypercomplex multiplication layers (Zhang
et al., 2021a). Specifically, Compacter inserts task-specific weight matrices into a pretrained model's
weights, which are computed efficiently as a sum of Kronecker products between shared ``slow''
weights and ``fast'' rank-one matrices defined per Compacter layer. By only training 0.047\% of a
pretrained model's parameters, Compacter performs on par with standard fine-tuning on GLUE and
outperforms standard fine-tuning on SuperGLUE and low-resource settings.
Our final contribution is to propose PERFECT, a simple and efficient method for few-shot fine-tuning
of PLMs without relying on any handcrafting, which is highly effective given as few as 32 data points.
This is in contrast to current methods for few-shot fine-tuning of pretrained masked language
models (PLMs), which require carefully engineered prompts and verbalizers for each new task to convert
examples into a cloze-format that the PLM can score. PERFECT makes two key design choices: First,
we show that manually engineered task prompts can be replaced with task-specific adapters that
enable sample-efficient fine-tuning and reduce memory and storage costs by roughly factors of 5 and
100, respectively. Second, instead of using handcrafted verbalizers, we learn new multi-token label
embeddings during fine-tuning which are not tied to the model vocabulary and which allow us to avoid
complex auto-regressive decoding. These embeddings are not only learnable from limited data but also
enable nearly 100x faster training and inference. Experiments on a wide range of few-shot NLP tasks
demonstrate that PERFECT, while being simple and efficient, also outperforms existing state-of-the-art
few-shot learning methods.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2023/KarimiMahabadi_THESIS_2023.pdf}
}