<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">CONF</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">Sarkar_VIHAR_2024/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">On the Utility of Speech and Audio Foundation Models for Marmoset Call Analysis</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Sarkar, Eklavya</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Magimai-Doss, Mathew</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">bandwidth</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">bioacoustics</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">call-type and caller classification</subfield>
		</datafield>
		<datafield tag="653" ind1="1" ind2=" ">
			<subfield code="a">speech and audio</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/papers/2024/Sarkar_VIHAR_2024.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="711" ind1="2" ind2=" ">
			<subfield code="a">4th International Workshop on Vocal Interactivity in-and-between Humans, Animals and Robots</subfield>
		</datafield>
		<datafield tag="440" ind1=" " ind2=" ">
			<subfield code="a">Interspeech 2024 Satellite Event</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2024</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">Marmoset monkeys encode vital information in their calls and serve as a surrogate model for neuro-biologists to understand the evolutionary origins of human vocal communication. Traditionally analyzed with signal processing-based features, recent approaches have utilized self-supervised models pre-trained on human speech for feature extraction, capitalizing on their ability to learn a signal's intrinsic structure independently of its acoustic domain. However, the utility of such foundation models remains unclear for marmoset call analysis in terms of multi-class classification, bandwidth, and pre-training domain. This study assesses feature representations derived from speech and general audio domains, across pre-training bandwidths of 4, 8, and 16 kHz for marmoset call-type and caller classification tasks. Results show that models with higher bandwidth improve performance, and pre-training on speech or general audio yields comparable results, improving over a spectral baseline.</subfield>
		</datafield>
	</record>
</collection>