1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
import numpy as np
from cogent.util.unit_test import TestCase, main
from cogent.maths.stats.jackknife import JackknifeStats
__author__ = "Anuj Pahwa, Gavin Huttley"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Anuj Pahwa", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Gavin Huttley"
__email__ = "Gavin.Huttley@anu.edu.au"
__status__ = "Production"
def pmcc(data, axis=1):
"""Compute the Product-moment correlation coefficient.
Expression 15.3 from Biometry by Sokal/Rohlf
This code implementation is on the proviso that the data that is provided
is two dimensional: [[Y1], [Y2]] (trying to determine the correlation
coefficient between data sets Y1 and Y2"""
if axis is 0:
data = data.transpose()
axis = 1
other_axis = 0
mean = data.mean(axis=axis)
data_less_mean = np.array([data[0] - mean[0],
data[1] - mean[1]])
sum_squares = np.sum(np.square(data_less_mean), axis=axis)
sum_products = np.sum(np.prod(data_less_mean, axis=other_axis))
pmcc = np.divide(sum_products, np.sqrt(np.prod(sum_squares)))
z_trans = np.arctanh(pmcc)
return z_trans
# test data from Box 15.2; Biometry by Sokal/Rohlf
data = np.array([[159, 179, 100, 45, 384, 230, 100, 320, 80, 220, 320, 210],
[14.40, 15.20, 11.30, 2.50, 22.70, 14.90, 1.41, 15.81, 4.19, \
15.39, 17.25, 9.52]])
# factory function generator for the statistical function of interest
def stat_maker(func, data, axis):
def calc_stat(coords):
subset_data = data.take(coords, axis)
return func(subset_data, axis)
return calc_stat
# function to compute mean of a np array
def mean(data, axis):
return data.mean(axis=axis)
class JackknifeTests(TestCase):
def test_proper_initialise(self):
"""jackknife should initialise correctly"""
# Scalar
pmcc_stat = stat_maker(pmcc, data, 1)
test_knife = JackknifeStats(data.shape[1], pmcc_stat)
self.assertEqual(test_knife.n, data.shape[1])
self.assertEqual(test_knife._jackknifed_stat, None)
# Vector
mean_stat = stat_maker(mean, data, 1)
test_knife = JackknifeStats(data.shape[1], mean_stat)
self.assertEqual(test_knife.n, data.shape[1])
self.assertEqual(test_knife._jackknifed_stat, None)
def test_jackknife_stats(self):
"""jackknife results should match Sokal & Rolf example"""
# Scalar
pmcc_stat = stat_maker(pmcc, data, 1)
test_knife = JackknifeStats(data.shape[1], pmcc_stat)
self.assertAlmostEquals(test_knife.JackknifedStat, 1.2905845)
self.assertAlmostEquals(test_knife.StandardError, 0.2884490)
self.assertTrue(test_knife._jackknifed_stat is not None)
# Vector
mean_stat = stat_maker(mean, data, 1)
test_knife = JackknifeStats(data.shape[1], mean_stat)
expected_jk_stat = data.mean(axis=1)
got_jk_stat = test_knife.JackknifedStat
expected_standard_err = [30.69509346, 1.87179671]
got_standard_err = test_knife.StandardError
for index in [0,1]:
self.assertAlmostEqual(got_jk_stat[index], expected_jk_stat[index])
self.assertAlmostEqual(got_standard_err[index],
expected_standard_err[index])
def test_tables(self):
"""jackknife should work for calculators return scalars or vectors"""
# Scalar
pmcc_stat = stat_maker(pmcc, data, 1)
test_knife = JackknifeStats(data.shape[1], pmcc_stat)
expected_subsample_stats = [1.4151, 1.3946, 1.4314, 1.1889, 1.1323, \
1.3083, 1.3561, 1.3453, 1.2412, 1.3216, \
1.2871, 1.3664]
expected_pseudovalues = [0.1968, 0.4224, 0.0176, 2.6852, 3.3084, \
1.3718, 0.8461, 0.9650, 2.1103, 1.2253, \
1.6049, 0.7333]
test_knife.jackknife()
got_subsample_stats = test_knife._subset_statistics
got_pseudovalues = test_knife._pseudovalues
for index in range(data.shape[1]):
self.assertAlmostEqual(got_subsample_stats[index],
expected_subsample_stats[index], places=4)
self.assertAlmostEqual(got_pseudovalues[index],
expected_pseudovalues[index], places=4)
# Vector
mean_stat = stat_maker(mean, data, 1)
test_knife = JackknifeStats(data.shape[1], mean_stat)
test_knife.jackknife()
expected_pseudovalues = data.transpose()
expected_subsample_stats = [[ 198.9091, 11.8336],
[ 197.0909, 11.7609],
[ 204.2727, 12.1155],
[ 209.2727, 12.9155],
[ 178.4545, 11.0791],
[ 192.4545, 11.7882],
[ 204.2727, 13.0145],
[ 184.2727, 11.7055],
[ 206.0909, 12.7618],
[ 193.3636, 11.7436],
[ 184.2727, 11.5745],
[ 194.2727, 12.2773]]
got_subsample_stats = test_knife._subset_statistics
got_pseudovalues = test_knife._pseudovalues
for index1 in range(data.shape[1]):
for index2 in range(data.shape[0]):
self.assertAlmostEqual(got_subsample_stats[index1][index2],
expected_subsample_stats[index1][index2],
places=4)
self.assertAlmostEqual(got_pseudovalues[index1][index2],
expected_pseudovalues[index1][index2],
places=4)
if __name__ == "__main__":
main()
|