File: hyperloglog.tcl

package info (click to toggle)
redis 5%3A7.0.15-1~deb12u5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 18,872 kB
  • sloc: ansic: 172,600; tcl: 40,259; sh: 4,319; perl: 4,139; makefile: 1,667; ruby: 772; python: 663; cpp: 364
file content (265 lines) | stat: -rw-r--r-- 8,767 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
start_server {tags {"hll"}} {
    test {HyperLogLog self test passes} {
        catch {r pfselftest} e
        set e
    } {OK} {needs:pfdebug}

    test {PFADD without arguments creates an HLL value} {
        r pfadd hll
        r exists hll
    } {1}

    test {Approximated cardinality after creation is zero} {
        r pfcount hll
    } {0}

    test {PFADD returns 1 when at least 1 reg was modified} {
        r pfadd hll a b c
    } {1}

    test {PFADD returns 0 when no reg was modified} {
        r pfadd hll a b c
    } {0}

    test {PFADD works with empty string (regression)} {
        r pfadd hll ""
    }

    # Note that the self test stresses much better the
    # cardinality estimation error. We are testing just the
    # command implementation itself here.
    test {PFCOUNT returns approximated cardinality of set} {
        r del hll
        set res {}
        r pfadd hll 1 2 3 4 5
        lappend res [r pfcount hll]
        # Call it again to test cached value invalidation.
        r pfadd hll 6 7 8 8 9 10
        lappend res [r pfcount hll]
        set res
    } {5 10}

    test {HyperLogLogs are promote from sparse to dense} {
        r del hll
        r config set hll-sparse-max-bytes 3000
        set n 0
        while {$n < 100000} {
            set elements {}
            for {set j 0} {$j < 100} {incr j} {lappend elements [expr rand()]}
            incr n 100
            r pfadd hll {*}$elements
            set card [r pfcount hll]
            set err [expr {abs($card-$n)}]
            assert {$err < (double($card)/100)*5}
            if {$n < 1000} {
                assert {[r pfdebug encoding hll] eq {sparse}}
            } elseif {$n > 10000} {
                assert {[r pfdebug encoding hll] eq {dense}}
            }
        }
    } {} {needs:pfdebug}

    test {HyperLogLog sparse encoding stress test} {
        for {set x 0} {$x < 1000} {incr x} {
            r del hll1
            r del hll2
            set numele [randomInt 100]
            set elements {}
            for {set j 0} {$j < $numele} {incr j} {
                lappend elements [expr rand()]
            }
            # Force dense representation of hll2
            r pfadd hll2
            r pfdebug todense hll2
            r pfadd hll1 {*}$elements
            r pfadd hll2 {*}$elements
            assert {[r pfdebug encoding hll1] eq {sparse}}
            assert {[r pfdebug encoding hll2] eq {dense}}
            # Cardinality estimated should match exactly.
            assert {[r pfcount hll1] eq [r pfcount hll2]}
        }
    } {} {needs:pfdebug}

    test {Corrupted sparse HyperLogLogs are detected: Additional at tail} {
        r del hll
        r pfadd hll a b c
        r append hll "hello"
        set e {}
        catch {r pfcount hll} e
        set e
    } {*INVALIDOBJ*}

    test {Corrupted sparse HyperLogLogs are detected: Broken magic} {
        r del hll
        r pfadd hll a b c
        r setrange hll 0 "0123"
        set e {}
        catch {r pfcount hll} e
        set e
    } {*WRONGTYPE*}

    test {Corrupted sparse HyperLogLogs are detected: Invalid encoding} {
        r del hll
        r pfadd hll a b c
        r setrange hll 4 "x"
        set e {}
        catch {r pfcount hll} e
        set e
    } {*WRONGTYPE*}

    test {Corrupted sparse HyperLogLogs doesn't cause overflow and out-of-bounds with XZERO opcode} {
        r del hll
        
        # Create a sparse-encoded HyperLogLog header
        set pl [string cat "HYLL" [binary format c12 {1 0 0 0 0 0 0 0 0 0 0 0}]]

        # Create an XZERO opcode with the maximum run length of 16384(2^14)
        set runlen [expr 16384 - 1]
        set chunk [binary format cc [expr {0b01000000 | ($runlen >> 8)}] [expr {$runlen & 0xff}]]
        # Fill the HLL with more than 131072(2^17) XZERO opcodes to make the total
        # run length exceed 4GB, will cause an integer overflow.
        set repeat [expr 131072 + 1000]
        for {set i 0} {$i < $repeat} {incr i} {
            append pl $chunk
        }

        # Create a VAL opcode with a value that will cause out-of-bounds.
        append pl [binary format c 0b11111111]
        r set hll $pl

        # This should not overflow and out-of-bounds.
        assert_error {*INVALIDOBJ*} {r pfcount hll hll}
        assert_error {*INVALIDOBJ*} {r pfdebug getreg hll}
        r ping
    }

    test {Corrupted sparse HyperLogLogs doesn't cause overflow and out-of-bounds with ZERO opcode} {
        r del hll
        
        # Create a sparse-encoded HyperLogLog header
        set pl [string cat "HYLL" [binary format c12 {1 0 0 0 0 0 0 0 0 0 0 0}]]

        # # Create an ZERO opcode with the maximum run length of 64(2^6)
        set chunk [binary format c [expr {0b00000000 | 0x3f}]]
        # Fill the HLL with more than 33554432(2^17) ZERO opcodes to make the total
        # run length exceed 4GB, will cause an integer overflow.
        set repeat [expr 33554432 + 1000]
        for {set i 0} {$i < $repeat} {incr i} {
            append pl $chunk
        }

        # Create a VAL opcode with a value that will cause out-of-bounds.
        append pl [binary format c 0b11111111]
        r set hll $pl

        # This should not overflow and out-of-bounds.
        assert_error {*INVALIDOBJ*} {r pfcount hll hll}
        assert_error {*INVALIDOBJ*} {r pfdebug getreg hll}
        r ping
    }

    test {Corrupted dense HyperLogLogs are detected: Wrong length} {
        r del hll
        r pfadd hll a b c
        r setrange hll 4 "\x00"
        set e {}
        catch {r pfcount hll} e
        set e
    } {*WRONGTYPE*}

    test {Fuzzing dense/sparse encoding: Redis should always detect errors} {
        for {set j 0} {$j < 1000} {incr j} {
            r del hll
            set items {}
            set numitems [randomInt 3000]
            for {set i 0} {$i < $numitems} {incr i} {
                lappend items [expr {rand()}]
            }
            r pfadd hll {*}$items

            # Corrupt it in some random way.
            for {set i 0} {$i < 5} {incr i} {
                set len [r strlen hll]
                set pos [randomInt $len]
                set byte [randstring 1 1 binary]
                r setrange hll $pos $byte
                # Don't modify more bytes 50% of times
                if {rand() < 0.5} break
            }

            # Use the hyperloglog to check if it crashes
            # Redis in some way.
            catch {
                r pfcount hll
            }
        }
    }

    test {PFADD, PFCOUNT, PFMERGE type checking works} {
        r set foo{t} bar
        catch {r pfadd foo{t} 1} e
        assert_match {*WRONGTYPE*} $e
        catch {r pfcount foo{t}} e
        assert_match {*WRONGTYPE*} $e
        catch {r pfmerge bar{t} foo{t}} e
        assert_match {*WRONGTYPE*} $e
        catch {r pfmerge foo{t} bar{t}} e
        assert_match {*WRONGTYPE*} $e
    }

    test {PFMERGE results on the cardinality of union of sets} {
        r del hll{t} hll1{t} hll2{t} hll3{t}
        r pfadd hll1{t} a b c
        r pfadd hll2{t} b c d
        r pfadd hll3{t} c d e
        r pfmerge hll{t} hll1{t} hll2{t} hll3{t}
        r pfcount hll{t}
    } {5}

    test {PFCOUNT multiple-keys merge returns cardinality of union #1} {
        r del hll1{t} hll2{t} hll3{t}
        for {set x 1} {$x < 10000} {incr x} {
            r pfadd hll1{t} "foo-$x"
            r pfadd hll2{t} "bar-$x"
            r pfadd hll3{t} "zap-$x"

            set card [r pfcount hll1{t} hll2{t} hll3{t}]
            set realcard [expr {$x*3}]
            set err [expr {abs($card-$realcard)}]
            assert {$err < (double($card)/100)*5}
        }
    }

    test {PFCOUNT multiple-keys merge returns cardinality of union #2} {
        r del hll1{t} hll2{t} hll3{t}
        set elements {}
        for {set x 1} {$x < 10000} {incr x} {
            for {set j 1} {$j <= 3} {incr j} {
                set rint [randomInt 20000]
                r pfadd hll$j{t} $rint
                lappend elements $rint
            }
        }
        set realcard [llength [lsort -unique $elements]]
        set card [r pfcount hll1{t} hll2{t} hll3{t}]
        set err [expr {abs($card-$realcard)}]
        assert {$err < (double($card)/100)*5}
    }

    test {PFDEBUG GETREG returns the HyperLogLog raw registers} {
        r del hll
        r pfadd hll 1 2 3
        llength [r pfdebug getreg hll]
    } {16384} {needs:pfdebug}

    test {PFADD / PFCOUNT cache invalidation works} {
        r del hll
        r pfadd hll a b c
        r pfcount hll
        assert {[r getrange hll 15 15] eq "\x00"}
        r pfadd hll a b c
        assert {[r getrange hll 15 15] eq "\x00"}
        r pfadd hll 1 2 3
        assert {[r getrange hll 15 15] eq "\x80"}
    }
}