%Aigaion2 BibTeX export from Idiap Publications
%Thursday 21 November 2024 11:59:48 AM
@inproceedings{Tafasca_CVPR_2024,
  author    = {Tafasca, Samy and Gupta, Anshul and Odobez, Jean-Marc},
  title     = {Sharingan: A Transformer Architecture for Multi-Person Gaze Following},
  booktitle = {Int. Conference Computer Vision and Pattern Recognition ({CVPR}), Seattle},
  year      = {2024},
  month     = jun,
  projects  = {Idiap, AI4Autism},
  abstract  = {Gaze is a powerful form of non-verbal communication that humans develop from an early age. As such, modeling this behavior is an important task that can benefit a broad set of application domains ranging from robotics to sociology. In particular, the gaze following task in computer vision is defined as the prediction of the 2D pixel coordinates where a person in the image is looking. Previous attempts in this area have primarily centered on CNN-based architectures, but they have been constrained by the need to process one person at a time, which proves to be highly inefficient. In this paper, we introduce a novel and effective multi-person transformer-based architecture for gaze prediction. While there exist prior works using transformers for multi-person gaze prediction [38, 39], they use a fixed set of learnable embeddings to decode both the person and its gaze target, which requires a matching step afterward to link the predictions with the annotations. Thus, it is difficult to quantitatively evaluate these methods reliably with the available benchmarks, or integrate them into a larger human behavior understanding system. Instead, we are the first to propose a multi-person transformer-based architecture that maintains the original task formulation and ensures control over the people fed as input. Our main contribution lies in encoding the person-specific information into a single controlled token to be processed alongside image tokens and using its output for prediction based on a novel multiscale decoding mechanism. Our new architecture achieves state-of-the-art results on the GazeFollow, VideoAttentionTarget, and ChildPlay datasets and outperforms comparable multi-person architectures with a notable margin. Our code, checkpoints, and data extractions will be made publicly available soon.},
  pdf       = {https://publications.idiap.ch/attachments/papers/2024/Tafasca_CVPR_2024.pdf},
}