%Aigaion2 BibTeX export from Idiap Publications
%Friday 22 May 2026 08:02:27 PM

% PhD thesis record (EPFL, 2024). The abstract was pasted from the PDF; words split by
% line-end hyphenation have been rejoined and typographic quotes converted to ASCII/TeX
% form so the entry is safe for classic 8-bit BibTeX.
@PHDTHESIS{Courdier_THESIS_2024,
                      author = {Courdier, Evann},
                    keywords = {ambiguous segmentation, discrete diffusion, efficient transformers, future segmentation, patch pausing, real-time segmentation, semantic segmentation},
                    projects = {Idiap},
                       month = feb,
                       title = {Fast and Future: Towards Efficient Forecasting in Video Semantic Segmentation},
                        year = {2024},
                      school = {{\'{E}}cole polytechnique f{\'{e}}d{\'{e}}rale de Lausanne (EPFL)},
                         url = {https://infoscience.epfl.ch/handle/20.500.14299/203213},
                         doi = {10.5075/epfl-thesis-9858},
                    abstract = {Deep learning has revolutionized the field of computer vision, a success largely attributable
to the growing size of models, datasets, and computational power. Simultaneously, a critical
pain point arises as several computer vision applications are deployed on low-power
embedded devices, necessitating real-time processing capabilities. This challenge intensifies
for semantic segmentation, a dense prediction task demanding substantial memory
and computational resources. This thesis explores techniques to streamline real-time
segmentation networks, enhance their efficiency, and deal with potential ambiguity.
First, we introduce a latency-aware segmentation metric, a measure that combines the
mean Intersection over Union with the network processing time, providing a practical
metric for applied settings. Emphasis is placed on the concept of ``anticipation'' in real-time
networks - these systems should be capable of predicting future input segmentation.
Consequently, we then design an anticipatory convolutional network incorporating an
inventive convolution layer. This novel layer reduces computation by reusing features
from previous video frame computations, exploiting their temporal coherence. Next, we
present a method to accelerate transformer-based segmentation networks called `patch-pausing'.
This technique halts the processing of image patches deemed to be already
correctly segmented by assessing the network's confidence in its prediction. Remarkably,
our experimental results indicate that more than half of the patches can be paused early in
the process, with a minimal impact on segmentation accuracy. This study concludes with
the introduction of a discrete diffusion model for segmentation. This model allows for the
sampling of multiple potential segmentations for a given input while accurately following
the training data distribution. Combining this diffusion model within an autoregressive
scheme, we successfully showcase its capacity to generate long-term future predictions of
segmentation.
The implementation and evaluation of these approaches contribute to the ongoing efforts
to improve real-time segmentation networks and facilitate more efficient deployment of
computer vision applications on low-power devices.},
                         pdf = {https://publications.idiap.ch/attachments/papers/2024/Courdier_THESIS_2024.pdf}
}