File: digits.py

"""
Significant digits
------------------

Calculate the number significant digits in a statistic using boot
strapping/jackknife.

The 'gum' method from the GUM supplement[1] splits the sequence into n
independent sequences, computes the statistic for each sequence, and
returns the mean and variance of the statistic as the estimated value
with uncertainty.  Each block should have at least 10000 samples, or
100/(1-ci) samples if the desired credible interval ci > 0.99.

The 'jack' method uses

notes:

- the gum method gives a slight underestimate of the sd, while
  jackknife gives an overestimate.
- Jackknife performs better than gum on small samples
- in current implementation, gum is much faster
- in picking the number of cuts for jackknife, lower is usually better
  (but not too low...)

[1] JCGM, 2008. Evaluation of measurement data - Supplement 1 to the "Guide to
the expression of uncertainty in measurement" - Propagation of distributions
using a Monte Carlo method (No. JCGM 101:2008). Geneva, Switzerland.
"""

"""
THINGS TO DO:
- actually useful display of information in output
- an effective, fast, online version to use as a stoping crit
- use as a method to guess how many more samples:
    - provides estimate of sd of sampling distribution of statistic
    - sd of stat for n samples = sd for k samples / sqrt(n/k - 1)
    - not actually well tested...
- provide better user accessible control hooks
- do better deciding between jack and gum
- these don't deal with lists of CIs (as the actual ci functions do)
- very slow, needs a restructure
"""
# TODO: needs a refactor before going into production

import numpy as np
import numpy.ma as ma

from .stats import credible_interval


def credible_interval_sd(data, ci, fn=None, method=None, cuts=None):
    """
    Estimate the standard deviation of a credible interval statistic
    computed from *data* at credible interval *ci*, using the 'gum' or
    'jack' *method* with the data split into *cuts* blocks.
    """
    if fn is None:
        # Ugly hack: credible_interval handles lists of CIs, so wrap it
        # to extract just the first interval.
        fn = lambda *a, **kw: credible_interval(*a, **kw)[0]

    if method is None:
        # TODO: do something clever to choose gum or jack
        method = "gum"

    # Determine the number of cuts.
    if cuts is None or cuts <= 0:
        # Use groups of 10000 unless looking for ci > 0.99, in which case
        # use groups of 100/(1-ci).
        M = max(int(100 / (1 - ci)), 10000)
        cuts = len(data) // M
    if cuts < 3:
        cuts = 3

    if method == "gum":
        return gum_sd(data, fn, ci, cuts)
    elif method == "jack":
        return jack_sd(data, fn, ci, cuts)
    else:
        raise ValueError("Unknown sd method: " + method)
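

# A minimal usage sketch (illustrative, not part of the module API).  It
# assumes bumps.stats.credible_interval accepts (data, ci) and that the
# default fn above extracts a single (lo, hi) interval from its result.
def _example_credible_interval_sd():
    rng = np.random.default_rng(0)
    samples = rng.standard_normal(100_000)
    # Standard deviation of the endpoints of the 68% credible interval.
    sd = credible_interval_sd(samples, ci=0.68)
    print("sd of interval endpoints:", sd)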


# TODO: does not handle either CI function natively;
# the statistic must return a 1d ndarray.


def gum_sd(data, f, ci, cuts=10):
    """
    Adaptation of the method in section 7.9.4 of GUM Supplement 1 [1].
    """
    # Reshape data into blocks, skipping the first partial block.
    M = len(data) // cuts
    data = data[-M * cuts :].reshape((cuts, M))
    # Compute the variance of the statistic over the sub-blocks.
    stat = np.apply_along_axis(f, 1, data, ci)
    var = np.var(stat, axis=0, ddof=1)
    # Estimate the standard deviation of the statistic for the full data set.
    return np.sqrt(var / cuts)
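

# Sketch checking gum_sd against a known answer (illustrative): with the
# sample mean swapped in as the statistic, the estimate should approach
# the analytic standard error 1/sqrt(N) for N standard normal draws.
def _example_gum_sd_mean():
    rng = np.random.default_rng(1)
    x = rng.standard_normal(50_000)
    # The statistic must accept (block, ci) and return a 1d ndarray.
    stat = lambda block, ci: np.atleast_1d(np.mean(block))
    print("gum_sd:", gum_sd(x, stat, ci=0.68, cuts=10))
    print("analytic:", 1 / np.sqrt(len(x)))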


def jack_sd(data, f, ci, cuts=10):
    """
    Averages the jackknife sd over smaller blocks to counteract the
    skewed distribution of the jackknife estimate.
    """
    # Reshape data into blocks, skipping the first partial block.
    M = len(data) // cuts
    data = data[-M * cuts :].reshape((cuts, M))
    # Compute the standard deviation of the statistic over the sub-blocks.
    std = np.apply_along_axis(fast_jack, 1, data, f, ci)
    # Estimate the standard deviation of the statistic for the full data set.
    return np.mean(std, axis=0) / np.sqrt(cuts)
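

# A similar sketch for jack_sd (illustrative).  FastJackknife.std assumes
# the statistic returns a two-element interval, so a plain percentile
# interval stands in for the credible-interval function here.
def _example_jack_sd():
    rng = np.random.default_rng(2)
    x = rng.standard_normal(10_000)
    interval = lambda block, ci: np.percentile(block, [50 * (1 - ci), 50 * (1 + ci)])
    print("jack_sd:", jack_sd(x, interval, ci=0.68, cuts=10))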


# TODO: does not deal well with a list of CIs
def fast_jack(data, f, *a, **kw):
    """Return the jackknife sd of the statistic *f* over *data*."""
    jack = FastJackknife(data=data, fn=f)
    jack.set_args(*a, **kw)
    return jack.std()


# TODO: this is coded badly and so is very slow
class FastJackknife(object):
    def __init__(self, **kw):
        self.fn = None
        self._x = None
        self._args = ()
        self._kwargs = {}
        for k, v in kw.items():
            try:
                setattr(self, k, v)
            except AttributeError:
                pass

    @property
    def data(self):
        return self._x

    @data.setter
    def data(self, value):
        self._x = np.sort(value)
        self.n = len(value)

    @property
    def fn(self):
        if self._fn is not None:
            return lambda x: self._fn(x, *self._args, **self._kwargs)
        else:
            return None

    @fn.setter
    def fn(self, f):
        self._fn = f

    def set_args(self, *a, **kw):
        self._args = a
        self._kwargs = kw

    def _f_drop(self, i):
        # Evaluate the statistic with point *i* removed.
        mask = np.zeros_like(self.data)
        mask[i] = 1
        mx = ma.array(self.data, mask=mask)
        return np.array(self.fn(mx.compressed()))

    def pooled(self):
        assert self.data is not None and self.fn is not None
        return self.fn(self.data)

    def std(self):
        true_stat = self.fn(self.data)
        drop_stats = np.empty((self.n, len(true_stat)))

        # This part is specific to credible intervals: the statistic is a
        # pair of endpoints, so locate them in the sorted data and fill
        # the leave-one-out values over the three index ranges they bound.
        idx_lo, idx_hi = np.searchsorted(self.data, true_stat)
        self._fill_interval(drop_stats, (0, idx_lo), true_stat)
        self._fill_interval(drop_stats, (idx_lo, idx_hi), true_stat)
        self._fill_interval(drop_stats, (idx_hi, self.n - 1), true_stat)

        return np.sqrt((self.n - 1) * np.var(drop_stats, axis=0, ddof=1))

    def _fill_interval(self, drop_stats, interval, ev):
        # Fill drop_stats over the index range [lo, hi] without evaluating
        # the statistic at every index.  The idea: for an order-statistic
        # interval, dropping any single point between the same pair of
        # endpoints shifts the statistic in the same way, so when the
        # values at both ends of the range (and the expected value *ev*)
        # agree, the whole range can be filled with one value.
        lo, hi = interval
        v_hi = self._f_drop(hi)
        v_lo = self._f_drop(lo)
        a = np.array([v_lo, v_hi, ev]).T
        g = lambda ar: (ar[0] == ar).all()
        if np.apply_along_axis(g, 1, a).all():
            for i in range(lo, hi + 1):
                drop_stats[i] = v_hi
        elif hi == lo:
            drop_stats[hi] = v_hi
        elif hi == lo + 1:
            drop_stats[hi] = v_hi
            drop_stats[lo] = v_lo
        else:
            # Otherwise split the range into pieces and recurse.
            splits = max(int(np.ceil(np.log10(hi - lo))), 2)
            dx = (hi - lo) // splits
            intervals = [[lo + dx * i, lo + dx * (i + 1)] for i in range(splits)]
            intervals[-1][1] = hi
            for inter in intervals:
                self._fill_interval(drop_stats, inter, np.mean((v_lo, v_hi), axis=0))
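

# Direct use of the class (illustrative), again with a percentile interval
# standing in for the credible-interval statistic.
def _example_fast_jackknife():
    rng = np.random.default_rng(3)
    x = rng.standard_normal(2_000)
    interval = lambda block, ci: np.percentile(block, [50 * (1 - ci), 50 * (1 + ci)])
    jack = FastJackknife(data=x, fn=interval)
    jack.set_args(0.68)
    print("pooled statistic:", jack.pooled())
    print("jackknife sd:", jack.std())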


def n_dig(sx, sd):
    """
    Convert a standard deviation to a number of significant digits.

    *sx* is the standard deviation of the statistic
    *sd* is the standard deviation of the sample

    Adapted from section 7.9.2 of GUM Supplement 1 [1].
    """
    log_sx = np.log10(4 * sx)
    return np.floor(np.log10(sd)) - log_sx + 1  # return as float, or round now?
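

# Worked example (illustrative): with sample sd = 1.2 and statistic sd
# sx = 0.004, n_dig gives floor(log10(1.2)) - log10(4 * 0.004) + 1 ~= 2.8,
# i.e. roughly three significant digits are meaningful in the statistic.


if __name__ == "__main__":
    # Exercise the illustrative sketches defined above.
    _example_gum_sd_mean()
    _example_jack_sd()
    _example_fast_jackknife()
    print("n_dig(0.004, 1.2) =", n_dig(0.004, 1.2))
    # _example_credible_interval_sd() additionally needs bumps.stats.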