File: counter.pyx

Package: python-preshed 4.0.0-3
"""Count occurrences of uint64-valued keys."""
from __future__ import division
cimport cython
from cython.operator cimport dereference as deref
from libc.math cimport log, exp, sqrt
from libcpp.memory cimport make_unique
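
# NOTE: Pool (from cymem), MapStruct, map_init/map_get/map_set/map_iter,
# key_t and count_t are not imported in this file; they are assumed to be
# cimported in the accompanying counter.pxd, whose declarations this
# module shares.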


cdef class PreshCounter:
    def __init__(self, initial_size=8):
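        # The table size must be a non-zero power of two; power-of-two
        # sizes allow slot lookup with a bit mask instead of a modulo.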
        assert initial_size != 0
        assert initial_size & (initial_size - 1) == 0
        self.c_map = make_unique[MapStruct]()
        map_init(self.c_map.get(), initial_size)
        self.smoother = None
        self.total = 0

    @property
    def length(self):
        return deref(self.c_map).cells.size()

    def __len__(self):
        return deref(self.c_map).cells.size()

    def __iter__(self):
        cdef int i = 0
        cdef key_t key
        cdef void* value
        while map_iter(self.c_map.get(), &i, &key, &value):
            yield key, <size_t>value

    def __getitem__(self, key_t key):
        return <count_t>map_get(self.c_map.get(), key)

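    # Counts are stored directly in the map's void* value slot by casting
    # between count_t and void*, so updating a count never allocates.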
    cpdef int inc(self, key_t key, count_t inc) except -1:
        cdef count_t c = <count_t>map_get(self.c_map.get(), key)
        c += inc
        map_set(self.c_map.get(), key, <void*>c)
        self.total += inc
        return c

    def prob(self, key_t key):
        cdef GaleSmoother smoother
        cdef void* value = map_get(self.c_map.get(), key)
        if self.smoother is not None:
            smoother = self.smoother
            r_star = smoother(<count_t>value)
            return r_star / smoother.total
        elif value == NULL:
            return 0
        else:
            return <count_t>value / self.total

    def smooth(self):
        self.smoother = GaleSmoother(self)

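# A minimal usage sketch (illustrative keys; not part of the original file):
#
#     counter = PreshCounter()
#     for key in (7, 7, 7, 12, 12, 94):
#         counter.inc(key, 1)
#     assert counter[7] == 3
#     assert counter.prob(7) == 0.5  # 3 of 6 observations, unsmoothed
#     counter.smooth()               # prob() now uses Gale-smoothed estimates
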
cdef class GaleSmoother:
    cdef Pool mem
    cdef count_t* Nr
    cdef double gradient
    cdef double intercept
    cdef readonly count_t cutoff
    cdef count_t Nr0
    cdef readonly double total

    def __init__(self, PreshCounter counts):
        count_counts = PreshCounter()
        cdef double total = 0
        for _, count in counts:
            count_counts.inc(count, 1)
            total += count
        # Smoothing needs the counts-of-counts for r=1 and r=2 to extrapolate
        # Nr0; real data virtually always contains items seen once and twice.
        assert count_counts[1] != 0 and count_counts[2] != 0, \
            "Cannot smooth: need items with count 1 and count 2"
        # Extrapolate Nr0 from Nr1 and Nr2, continuing the linear trend:
        # Nr0 = Nr1 + (Nr1 - Nr2) = 2*Nr1 - Nr2.
        self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
        self.mem = Pool()

        cdef double[2] mb

        cdef int n_counts = 0
        for _ in count_counts:
            n_counts += 1
        sorted_r = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
        self.Nr = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
        for i, (count, count_count) in enumerate(sorted(count_counts)):
            sorted_r[i] = count
            self.Nr[i] = count_count

        _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)
        
        self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
                                           n_counts)
        self.gradient = mb[0]
        self.intercept = mb[1]
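        # Smoothed total mass: each distinct count r contributes r* times the
        # number of keys observed r times, plus the extrapolated unseen bucket
        # (r = 0, occurring Nr0 times). prob() divides by this total.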
        self.total = self(0) * self.Nr0
        for count, count_count in count_counts:
            self.total += self(count) * count_count

    def __call__(self, count_t r):
        if r == 0:
            # Expected count for an unseen event: (0 + 1) * N_1 / N_0.
            # Nr[0] holds N_1, since sorted_r[0] == 1 is guaranteed by the
            # assert in __init__.
            return self.Nr[0] / self.Nr0
        elif r < self.cutoff:
            return turing_estimate_of_r(<double>r, <double>self.Nr[r-1], <double>self.Nr[r])
        else:
            return gale_estimate_of_r(<double>r, self.gradient, self.intercept)

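    # Raw count-of-counts lookup; r == 0 returns the extrapolated Nr0.
    # Indexing Nr[r-1] assumes the counts 1..r are contiguous in sorted_r.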
    def count_count(self, count_t r):
        if r == 0:
            return self.Nr0
        else:
            return self.Nr[r-1]


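# Unsmoothed Good-Turing estimate: r* = (r + 1) * N_{r+1} / N_r.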
@cython.cdivision(True)
cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
    return ((r + 1) * Nr1) / Nr


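# Log-linear ("Simple Good-Turing") estimate: with the fitted model
# S(r) = exp(gradient * log(r) + intercept), r* = (r + 1) * S(r + 1) / S(r).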
@cython.cdivision(True)
cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
    cdef double e_nr  = exp(gradient * log(r) + intercept)
    cdef double e_nr1 = exp(gradient * log(r+1) + intercept)
    return (r + 1) * (e_nr1 / e_nr)


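# Ordinary least-squares fit of log(Z_r) on log(r):
# gradient = SS_xy / SS_xx, intercept = mean(y) - gradient * mean(x).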
@cython.cdivision(True)
cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
                               int length) except *:
    cdef double x_mean = 0.0
    cdef double y_mean = 0.0

    cdef Pool mem = Pool()
    x = <double*>mem.alloc(length, sizeof(double))
    y = <double*>mem.alloc(length, sizeof(double))

    cdef int i
    for i in range(length):
        r = sorted_r[i]
        x[i] = log(<double>r)
        y[i] = log(<double>_get_zr(i, sorted_r, Nr[i], length))
        x_mean += x[i]
        y_mean += y[i]
    
    x_mean /= length
    y_mean /= length

    cdef double ss_xy = 0.0
    cdef double ss_xx = 0.0

    for i in range(length):
        x_dist = x[i] - x_mean
        y_dist = y[i] - y_mean
        # SS_xy: the sum of the products of the distances from the means
        ss_xy += x_dist * y_dist
        # SS_xx: the sum of the squared x distances
        ss_xx += x_dist * x_dist
    # Gradient (slope)
    output[0] = ss_xy / ss_xx
    # Intercept
    output[1] = y_mean - output[0] * x_mean


@cython.cdivision(True)
cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
    # Gale & Sampson's Z_r transform: spread N_r over the gap between the
    # neighbouring non-zero counts, Z_r = 2 * N_r / (t - q). For the first
    # count q is taken as 0; for the last, t is taken as 2*r - q.
    cdef double r_i = sorted_r[j-1] if j >= 1 else 0
    cdef double r_j = sorted_r[j]
    cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else 2 * r_j - r_i
    return 2 * Nr_j / (r_k - r_i)


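# 1.96 standard deviations of the Turing estimate, using Gale & Sampson's
# approximation Var(r*) ~ (r + 1)^2 * (N_{r+1} / N_r^2) * (1 + N_{r+1} / N_r);
# despite the name, this is a 95% confidence half-width, not a variance.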
@cython.cdivision(True)
cdef double _variance(double r, double Nr, double Nr1) nogil:
    return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr)))


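# Find the count r at which to stop using the Turing estimate and switch to
# the log-linear estimate: the first r where the two agree to within 1.96
# standard deviations, or where the observed counts stop being contiguous
# (the Turing estimate needs N_{r+1} for consecutive r).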
@cython.cdivision(True)
cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
                                  int length) except -1:
    cdef int i
    cdef count_t r
    for i in range(length-1):
        r = sorted_r[i]
        if sorted_r[i+1] != r+1:
            return r
        g_r = gale_estimate_of_r(r, m, b)
        t_r = turing_estimate_of_r(<double>r, <double>Nr[i], <double>Nr[i+1])
        if abs(t_r - g_r) <= _variance(<double>r, <double>Nr[i], <double>Nr[i+1]):
            return r
    else:
        # The loop only completes when every adjacent pair of counts is
        # contiguous, so (with counts starting at 1) the last r examined
        # is sorted_r[length-2] == length - 1.
        return length - 1