%Aigaion2 BibTeX export from Idiap Publications
%Saturday 21 December 2024 05:39:20 PM

% Journal article. Its crossref field points to the entry
% Imseng_Idiap-RR-35-2010; classic BibTeX requires that referenced
% entry to appear later in this file than this one.
@ARTICLE{Imseng_TASLP_2010,
         author = {Imseng, David and Friedland, Gerald},
       projects = {Idiap, IM2},
          month = nov,
          title = {Tuning-Robust Initialization Methods for Speaker Diarization},
        journal = {IEEE Transactions on Audio, Speech, and Language Processing},
         volume = {18},
         number = {8},
           year = {2010},
          pages = {2028--2037},
           issn = {1558-7916},
            doi = {10.1109/TASL.2010.2040796},
       crossref = {Imseng_Idiap-RR-35-2010},
       abstract = {This paper investigates a typical speaker diarization system regarding its robustness against initialization parameter variation and presents a method to reduce manual tuning of these values significantly. The behavior of an agglomerative hierarchical clustering system is studied to determine which initialization parameters impact accuracy most. We show that the accuracy of typical systems is indeed very sensitive to the values chosen for the initialization parameters and factors such as the duration of speech in the recording. We then present a solution that reduces the sensitivity of the initialization values and therefore reduces the need for manual tuning significantly while at the same time increasing the accuracy of the system. For short meetings extracted from the previous (2006, 2007, and 2009) National Institute of Standards and Technology (NIST) Rich Transcription (RT) evaluation data, the decrease of the diarization error rate is up to 50\% relative. The approach consists of a novel initialization parameter estimation method for speaker diarization that uses agglomerative clustering with Bayesian information criterion (BIC) and Gaussian mixture models (GMMs) of frame-based cepstral features (MFCCs). The estimation method balances the relationship between the optimal value of the seconds of speech data per Gaussian and the duration of the speech data and is combined with a novel nonuniform initialization method. This approach results in a system that performs better than the current ICSI baseline engine on datasets of the NIST RT evaluations of the years 2006, 2007, and 2009.},
            pdf = {https://publications.idiap.ch/attachments/papers/2010/Imseng_TASLP_2010.pdf}
}



% Cross-referenced publications:
% Idiap research report. This entry is the crossref target of
% Imseng_TASLP_2010, so its citation key must not be renamed on its own.
@TECHREPORT{Imseng_Idiap-RR-35-2010,
         author = {Imseng, David and Friedland, Gerald},
       projects = {Idiap, IM2},
          month = oct,
          title = {Tuning-Robust Initialization Methods for Speaker Diarization},
           type = {Idiap-RR},
         number = {Idiap-RR-35-2010},
           year = {2010},
    institution = {Idiap},
        address = {Centre du Parc, Rue Marconi 19, Case Postale 592, CH-1920 Martigny},
       abstract = {This paper investigates a typical Speaker Diarization system regarding its robustness against initialization parameter variation and presents a method to reduce manual tuning of these values significantly. The behavior of an agglomerative hierarchical clustering system is studied to determine which initialization parameters impact accuracy most. We show that the accuracy of typical systems is indeed very sensitive to the values chosen for the initialization parameters and factors such as the length of the recording. We then present a solution that reduces the sensitivity of the initialization values and therefore reduces the need for manual tuning significantly while at the same time increasing the accuracy of the system. For short meetings extracted from the previous (2006 and 2007) National Institute of Standards and Technology (NIST) Rich Transcription (RT) evaluation data, the decrease of the Diarization Error Rate is up to 50\% relative. The approach consists of a novel initial parameter estimation method for Speaker Diarization that uses agglomerative clustering with Bayesian Information Criterion (BIC) and Gaussian Mixture Models (GMMs) of frame-based cepstral features (MFCCs). The estimation method leverages the relationship between the optimal value of the seconds of speech data per Gaussian and the duration of the speech data and is combined with a novel non-uniform initialization method. This approach results in a system that performs better than the current ICSI baseline engine on datasets of the NIST RT evaluations of the years 2006 and 2007.},
            pdf = {https://publications.idiap.ch/attachments/reports/2009/Imseng_Idiap-RR-35-2010.pdf}
}