File: genfilter.py

package info (click to toggle)
libkkc 0.3.5-12
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,908 kB
  • sloc: ansic: 7,099; makefile: 917; cpp: 435; python: 231; sh: 124
file content (121 lines) | stat: -rw-r--r-- 3,457 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python3

# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
# Copyright (C) 2011-2014 Red Hat, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import mmap
import math
import struct

# Rate plugged into the bit-array sizing formula in
# FilterGenerator.generate().  Presumably the target false-positive
# probability of the generated filter -- TODO confirm, since that formula
# uses base-10 logarithms rather than natural ones.
ERROR_RATE = 0.25

def murmur_hash3_32(b0, b1, seed):
    """Return the 32-bit MurmurHash3 of the two 32-bit words b0 and b1.

    The words are mixed in order (b0 first, then b1) into a state that
    starts at `seed`.  All intermediate values are truncated to 32 bits,
    so the result is always in the range 0..0xFFFFFFFF.
    """
    mask = 0xFFFFFFFF

    def rotl(value, shift):
        # 32-bit left rotation of an already-truncated value.
        return ((value << shift) | (value >> (32 - shift))) & mask

    state = seed
    for word in (b0, b1):
        # Scramble the input word on its own ...
        word = (word * 0xcc9e2d51) & mask
        word = rotl(word, 15)
        word = (word * 0x1b873593) & mask

        # ... then fold it into the running state.
        state = (state ^ word) & mask
        state = rotl(state, 13)
        state = (state * 5 + 0xe6546b64) & mask

    # The input is exactly two whole words, so there is no tail to mix.

    # Final avalanche.  The 8 is the input length in bytes (2 x 4).
    # `state` is already within 32 bits here, so XOR-ing it with a right
    # shift of itself cannot overflow and needs no extra truncation.
    state = (state ^ 8) & mask
    state ^= state >> 16
    state = (state * 0x85ebca6b) & mask
    state ^= state >> 13
    state = (state * 0xc2b2ae35) & mask
    state ^= state >> 16
    return state

class FilterGenerator(object):
    """Build a Bloom-filter-style bit array from fixed-size binary records.

    Each record contributes one key: its first 8 bytes, read as two
    unsigned 32-bit integers in native byte order.  Every key is hashed
    with murmur_hash3_32() under the seeds 0..3 and the four resulting
    bits are set in the output array.
    """

    def __init__(self, infile, outfile, record_size):
        """Remember the files and the record layout.

        infile      -- open file whose fileno() refers to the record data;
                       only its descriptor is used (fstat + mmap).
        outfile     -- open binary file the bit array is written to.
        record_size -- size of one record in bytes.  generate() reads
                       8 bytes from the start of every record, so values
                       below 8 make it fail with struct.error.
        """
        self.infile = infile
        self.outfile = outfile
        self.record_size = record_size

    def generate(self):
        """Compute the bit array for all records and write it to outfile.

        The array holds m bits, where m is derived from the record count
        and ERROR_RATE and then rounded up to a multiple of 8, and is
        written as m/8 raw bytes.  An input with no complete record
        produces a single zero byte.
        """
        size = os.fstat(self.infile.fileno()).st_size
        n = size // self.record_size
        m = int(math.ceil(-n*math.log10(ERROR_RATE) /
                          math.pow(math.log10(2), 2)))
        # Round up to the next multiple of 8 so the array is a whole
        # number of bytes; this always adds at least one spare bit, so
        # the array is never empty.
        m = (m//8 + 1)*8
        outmem = bytearray(m//8)

        # mmap refuses to map an empty file, and without a complete
        # record there is nothing to hash anyway.
        if n > 0:
            # Maps a 32-bit hash value onto a bit position.
            scale = m / float(0xFFFFFFFF)
            unpack_words = struct.Struct("=LL").unpack_from
            # The context manager unmaps the file even if a record
            # fails to unpack.
            with mmap.mmap(self.infile.fileno(),
                           size,
                           access=mmap.ACCESS_READ) as inmem:
                for i in range(n):
                    b0, b1 = unpack_words(inmem, i*self.record_size)
                    for k in range(4):
                        h = murmur_hash3_32(b0, b1, k)
                        # A hash of 0xFFFFFFFF scales to exactly m, one
                        # past the last bit; clamp so it lands on the
                        # final bit instead of raising IndexError.
                        # NOTE(review): a reader of this filter must
                        # map hashes to bits the same way -- confirm.
                        bit = min(int(h * scale), m - 1)
                        outmem[bit//8] |= (1 << (bit%8))

        # Convert bytearray to bytes, for Python 3 compatibility.
        self.outfile.write(bytes(outmem))

# Command-line entry point: genfilter.py INFILE OUTFILE RECORD_SIZE
if __name__ == '__main__':
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='filter')
    # The input is only fstat()ed and mmap()ed as raw bytes, never
    # decoded, so open it in binary mode rather than text mode.
    parser.add_argument('infile', type=argparse.FileType('rb'),
                        help='input file')
    parser.add_argument('outfile', type=argparse.FileType('wb'),
                        help='output file')
    parser.add_argument('record_size', type=int,
                        help='record size')
    args = parser.parse_args()
    # Close both files deterministically so the filter is flushed to
    # disk here instead of whenever the interpreter finalizes them.
    with args.infile, args.outfile:
        generator = FilterGenerator(args.infile,
                                    args.outfile,
                                    args.record_size)
        generator.generate()