File: statistics.py

package info (click to toggle)
python-petl 1.7.17-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,224 kB
  • sloc: python: 22,617; makefile: 109; xml: 9
file content (98 lines) | stat: -rw-r--r-- 2,515 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from __future__ import absolute_import, print_function, division


from collections import namedtuple


from petl.util.base import values, Table


def limits(table, field):
    """
    Return the smallest and largest values found under `field` as a
    ``(min, max)`` pair. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> minv, maxv = etl.limits(table, 'bar')
        >>> minv
        1
        >>> maxv
        3

    The `field` argument can be a field name or index (starting from zero).

    If there are no values at all, ``(None, None)`` is returned.

    """

    remaining = iter(values(table, field))

    # A private sentinel distinguishes "no values" from a legitimate first
    # value of None.
    nothing = object()
    first = next(remaining, nothing)
    if first is nothing:
        return None, None

    # Seed both extremes with the first value, then scan the rest once.
    lowest = highest = first
    for candidate in remaining:
        if candidate < lowest:
            lowest = candidate
        if candidate > highest:
            highest = candidate
    return lowest, highest


# Attach limits() to Table as an attribute (presumably so that it can be
# called as a method on table objects -- confirm against Table's definition).
Table.limits = limits


# Result record built by stats(): a named tuple whose type name is 'stats'.
# Fields, in positional order: count, errors, sum, min, max, mean,
# pvariance, pstdev.
_stats = namedtuple('stats', ('count', 'errors', 'sum', 'min', 'max', 'mean',
                              'pvariance', 'pstdev'))


def stats(table, field):
    """
    Compute summary statistics over the values of a given field in a single
    pass. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 1, 2],
        ...          ['B', '2', '3.4'],
        ...          [u'B', u'3', u'7.8', True],
        ...          ['D', 'xyz', 9.0],
        ...          ['E', None]]
        >>> etl.stats(table, 'bar')
        stats(count=3, errors=2, sum=6.0, min=1.0, max=3.0, mean=2.0, pvariance=0.6666666666666666, pstdev=0.816496580927726)

    The `field` argument can be a field name or index (starting from zero).

    Each value is passed through ``float()``; values for which that raises
    ``ValueError`` or ``TypeError`` are tallied under ``errors`` and
    otherwise ignored. If no value converts successfully, ``min`` and
    ``max`` are ``None`` and the remaining numeric fields are zero.

    """

    count = 0
    errors = 0
    total = 0
    lowest = None
    highest = None
    mean = 0
    variance = 0

    for raw in values(table, field):
        try:
            x = float(raw)
        except (ValueError, TypeError):
            # Not numeric: record the failure and move on to the next value.
            errors += 1
            continue

        count += 1
        if lowest is None or x < lowest:
            lowest = x
        if highest is None or x > highest:
            highest = x
        total += x
        # Fold the new observation into the running mean and variance.
        mean, variance = onlinestats(x, count, mean=mean, variance=variance)

    # The standard deviation is the square root of the running variance.
    stdev = variance**.5
    return _stats(count, errors, total, lowest, highest, mean, variance,
                  stdev)


# Attach stats() to Table as an attribute (presumably so that it can be
# called as a method on table objects -- confirm against Table's definition).
Table.stats = stats


def onlinestats(xi, n, mean=0, variance=0):
    """
    Update a running mean and variance with one new observation.

    `xi` is the new observation and `n` is the number of observations seen
    so far *including* `xi`. `mean` and `variance` are the values computed
    over the previous ``n - 1`` observations.

    Returns a ``(mean, variance)`` pair covering all `n` observations. The
    variance is divided by `n` (not ``n - 1``).

    """
    seen_before = n - 1
    updated_mean = ((seen_before*mean) + xi)/n
    # The variance update uses the deviation of xi from both the previous
    # mean and the freshly updated mean.
    spread = (xi - mean)*(xi - updated_mean)
    updated_variance = ((seen_before*variance) + spread)/n
    return updated_mean, updated_variance