<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">CONF</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">keller:nips:2005/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Benchmarking Non-Parametric Statistical Tests</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Keller, Mikaela</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Bengio, Samy</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Wong, Siew Yeung</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/papers/2005/keller-nips-2005.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2=" ">
			<subfield code="u">http://publications.idiap.ch/index.php/publications/showcite/keller:rr05-38</subfield>
			<subfield code="z">Related documents</subfield>
		</datafield>
		<datafield tag="711" ind1="2" ind2=" ">
			<subfield code="a">Advances in Neural Information Processing Systems, NIPS 18. MIT Press</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2005</subfield>
		</datafield>
		<datafield tag="500" ind1=" " ind2=" ">
			<subfield code="a">IDIAP-RR 05-38</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">Although non-parametric tests have already been proposed for that purpose, statistical significance tests for non-standard measures (different from the classification error) are less often used in the literature. This paper is an attempt at empirically verifying how these tests compare with more classical tests, on various conditions. More precisely, using a very large dataset to estimate the whole ``population'', we analyzed the behavior of several statistical test, varying the class unbalance, the compared models, the performance measure, and the sample size. The main result is that providing big enough evaluation sets non-parametric tests are relatively reliable in all conditions.</subfield>
		</datafield>
	</record>
	<record>
		<datafield tag="980" ind1=" " ind2=" ">
			<subfield code="a">REPORT</subfield>
		</datafield>
		<datafield tag="970" ind1=" " ind2=" ">
			<subfield code="a">keller:rr05-38/IDIAP</subfield>
		</datafield>
		<datafield tag="245" ind1=" " ind2=" ">
			<subfield code="a">Benchmarking Non-Parametric Statistical Tests</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Keller, Mikaela</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Bengio, Samy</subfield>
		</datafield>
		<datafield tag="700" ind1=" " ind2=" ">
			<subfield code="a">Wong, Siew Yeung</subfield>
		</datafield>
		<datafield tag="856" ind1="4" ind2="0">
			<subfield code="i">EXTERNAL</subfield>
			<subfield code="u">http://publications.idiap.ch/attachments/reports/2005/keller-idiap-rr-05-38.pdf</subfield>
			<subfield code="x">PUBLIC</subfield>
		</datafield>
		<datafield tag="088" ind1=" " ind2=" ">
			<subfield code="a">Idiap-RR-38-2005</subfield>
		</datafield>
		<datafield tag="260" ind1=" " ind2=" ">
			<subfield code="c">2005</subfield>
			<subfield code="b">IDIAP</subfield>
		</datafield>
		<datafield tag="500" ind1=" " ind2=" ">
			<subfield code="a">To appear in Advances in Neural Information Processing Systems, NIPS 18. MIT Press, 2005.</subfield>
		</datafield>
		<datafield tag="520" ind1=" " ind2=" ">
			<subfield code="a">Although non-parametric tests have already been proposed for that purpose, statistical significance tests for non-standard measures (different from the classification error) are less often used in the literature. This paper is an attempt at empirically verifying how these tests compare with more classical tests, on various conditions. More precisely, using a very large dataset to estimate the whole ``population'', we analyzed the behavior of several statistical test, varying the class unbalance, the compared models, the performance measure, and the sample size. The main result is that providing big enough evaluation sets non-parametric tests are relatively reliable in all conditions.</subfield>
		</datafield>
	</record>
</collection>