File: sample.py

package info (click to toggle)
python-bitarray 3.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,288 kB
  • sloc: python: 11,456; ansic: 7,657; makefile: 73; sh: 6
file content (136 lines) | stat: -rw-r--r-- 4,568 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import sys
from math import sqrt, erf, exp, pi


class SampleMeanDist:

    # This class describes the mean of a discrete uniform distribution
    # without replacement of k integers in range(n).
    # That is, from the integers in range(n) (population), we take a random
    # sample of k (without replacement) and calculate their mean value x.
    # The distribution of x is what we are interested it.

    def __init__(self, n, k):
        assert n >= 1
        assert 1 <= k < n
        self.n = n
        self.k = k

        # the sample mean is the population mean
        self.mu = 0.5 * (n - 1)  # sum(range(n)) / n = (n - 1) / 2

        # The standard deviation of a sample, is also called standard error.
        # The standard error of the mean is the standard deviation of the
        # population, divided by the square root of the sample size k.
        # The variance of the population is:
        #
        #     (n**2 - 1) / 12     (see below)
        #
        # So for the standard deviation, we get:
        self.sigma = sqrt((n * n - 1) / (12 * k))

        # Finite population correction (FPC)
        # ----------------------------------
        #
        # Let us consider two cases:
        #
        # (a) For very small sample sizes k (compared to the population
        #     size n), the FPC is close to one.  That is, it has no effect
        #     on the standard error.  Our distribution is basically the same
        #     as if we had replacement.
        #
        # (b) For sample sizes k close to the population size n, the FPC (and
        #     hence the standard error) becomes zero.  That is, when we
        #     sample all elements, we always get the same sample mean (the
        #     population mean) with no standard error.
        #
        fpc = sqrt((n - k) / (n - 1))
        self.sigma *= fpc

    def print(self):
        print("n = %d    k = %d" % (self.n, self.k))
        print("mu = %f" % self.mu)
        print("sigma = %f" % self.sigma)

    def pdf(self, x):
        return exp(-0.5 * ((x - self.mu) / self.sigma) ** 2) / (
            sqrt(2.0 * pi) * self.sigma)

    def cdf(self, x):
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    def range_p(self, x1, x2):
        "probability for x (the mean of the sample) being in x1 < x < x2"
        assert 0 <= x1 <= x2 <= self.n
        return self.cdf(x2) - self.cdf(x1)

# ---------------------------------------------------------------------------

import unittest

class SampleMeanDistTests(unittest.TestCase):

    def test_pdf(self):
        n, k = 100, 30
        smd = SampleMeanDist(n, k)
        N = 1000
        dx = n / N
        p = 0.0
        for i in range(N + 1):
            x = i * dx
            p += smd.pdf(x) * dx
        self.assertAlmostEqual(p, 1.0)

    def test_cdf(self):
        smd = SampleMeanDist(100, 30)
        self.assertAlmostEqual(smd.cdf(  0.0), 0.0)
        self.assertAlmostEqual(smd.cdf( 49.5), 0.5)
        self.assertAlmostEqual(smd.cdf(100.0), 1.0)

    def test_range_half(self):
        for k in 10, 20, 50, 100, 150:
            smd = SampleMeanDist(200, k)
            self.assertAlmostEqual(smd.range_p(0.0, 99.5), 0.5)
            self.assertAlmostEqual(smd.range_p(99.5, 200), 0.5)

    def test_verify_pop_mean(self):
        for n in range(1, 100):
            self.assertEqual((n - 1) /2, sum(range(n)) / n)

    def test_verify_pop_variance(self):
        for n in range(1, 100):
            mean = (n - 1) / 2
            sigma2 = sum((j - mean)**2 for j in range(n)) / n
            self.assertEqual((n * n - 1) / 12, sigma2)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        unittest.main()

    # This code was used to create some of the tests for util.random_k()
    # in ../test_random.py
    #
    # python sample.py 100_000 1000 500 0 500 510 520 1000
    # python sample.py 100_000 500 400 200 249.5 251 255 260 300

    from itertools import pairwise

    m, n, k = [int(i) for i in sys.argv[1:4]]
    smd = SampleMeanDist(n, k)
    smd.print()
    ranges = [float(x) for x in sys.argv[4:]]
    print("ranges = %r" % ranges)
    p_tot = 0.0
    for i, (x1, x2) in enumerate(pairwise(ranges)):
        p = smd.range_p(x1, x2)
        p_tot += p
        mu = m * p
        sigma = sqrt(m * p * (1.0 - p))
        if mu < 10 * sigma:
            continue
        fmt = "self.assertTrue(abs(C[{:d}] - {:6_d}) <= {:6_d})  # p = {:f}"
        print(fmt.format(i, round(mu), round(10 * sigma), p))

    if abs(p_tot - 1.0) > 1e-15:
        print(p_tot)