1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
|
#!/usr/bin/python3
# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
# Copyright (C) 2011-2014 Red Hat, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import mmap
import math
import struct
# False-positive rate fed into the bit-array sizing formula in
# FilterGenerator.generate().
ERROR_RATE = 0.25
def murmur_hash3_32(b0, b1, seed):
    """Return the 32-bit MurmurHash3 of an 8-byte key given as two words.

    b0 and b1 are the two 32-bit words of the key, mixed in that order;
    seed is the initial hash state.  All arithmetic is reduced modulo
    2**32, so the result is always in the range 0..0xFFFFFFFF.
    """
    mask = 0xFFFFFFFF

    def rotl(value, shift):
        # 32-bit left rotation; value must already fit in 32 bits.
        return ((value << shift) | (value >> (32 - shift))) & mask

    acc = seed
    # Body: mix each key word into the running state.
    for word in (b0, b1):
        word = (word * 0xcc9e2d51) & mask
        word = (rotl(word, 15) * 0x1b873593) & mask
        acc = rotl((acc ^ word) & mask, 13)
        acc = (acc * 5 + 0xe6546b64) & mask

    # The key is exactly two words long, so there is no tail to process.

    # Finalization: fold in the key length (8 bytes), then avalanche.
    acc = (acc ^ 8) & mask
    acc ^= acc >> 16
    acc = (acc * 0x85ebca6b) & mask
    acc ^= acc >> 13
    acc = (acc * 0xc2b2ae35) & mask
    acc ^= acc >> 16
    return acc
class FilterGenerator(object):
    """Build a Bloom filter from a file of fixed-size binary records.

    The first 8 bytes of every record are read as two native-endian
    unsigned 32-bit integers.  Each such key is hashed with
    murmur_hash3_32() under the seeds 0 to 3, and the four resulting
    bits are set in a bit array that is written to the output file.
    """

    def __init__(self, infile, outfile, record_size):
        """Remember the files and record layout; no I/O happens here.

        infile -- open file object for the records; only its fileno()
                  is used
        outfile -- file object opened for binary writing
        record_size -- size of one record in bytes (at least 8)
        """
        self.infile = infile
        self.outfile = outfile
        self.record_size = record_size

    def generate(self):
        """Hash every record of infile and write the filter to outfile.

        Raises ValueError if record_size is too small to hold the
        8-byte key.  An empty input yields the minimal one-byte filter
        with no bits set.
        """
        key = struct.Struct("=LL")
        # Validate up front: a zero size would divide by zero below and a
        # size under 8 bytes would make the key read run past the record.
        if self.record_size < key.size:
            raise ValueError(
                "record_size must be at least {0} bytes, got {1}".format(
                    key.size, self.record_size))

        fd = self.infile.fileno()
        size = os.fstat(fd).st_size
        n = size // self.record_size

        # NOTE(review): the textbook Bloom filter size is
        # -n*ln(p)/(ln 2)**2; using log10 throughout makes the bit array
        # about ln(10) (~2.3) times larger.  Kept unchanged so that the
        # size of generated filters does not change.
        m = int(math.ceil(-n*math.log10(ERROR_RATE) /
                          math.pow(math.log10(2), 2)))
        # Round up to a whole number of bytes (always at least one).
        m = (m//8 + 1)*8
        outmem = bytearray(m//8)

        # Skip the mapping when there are no records: an empty file
        # cannot be mmapped at all.
        if n > 0:
            # Maps a 32-bit hash onto a bit index in the array.
            scale = m / float(0xFFFFFFFF)
            with mmap.mmap(fd, size, access=mmap.ACCESS_READ) as inmem:
                for i in range(n):
                    b0, b1 = key.unpack_from(inmem, i*self.record_size)
                    for k in range(4):
                        h = murmur_hash3_32(b0, b1, k)
                        # A hash of exactly 0xFFFFFFFF scales to m, one
                        # past the last bit; clamp it into the array.
                        bit = min(int(h * scale), m - 1)
                        outmem[bit//8] |= 1 << (bit % 8)

        # Convert bytearray to bytes, for Python 3 compatibility.
        self.outfile.write(bytes(outmem))
if __name__ == '__main__':
    # Command-line entry point: filter INFILE OUTFILE RECORD_SIZE
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='filter')
    # The input holds packed binary records, so open it in binary mode.
    parser.add_argument('infile', type=argparse.FileType('rb'),
                        help='input file')
    parser.add_argument('outfile', type=argparse.FileType('wb'),
                        help='output file')
    parser.add_argument('record_size', type=int,
                        help='record size')
    args = parser.parse_args()

    # Close both files on the way out, even if generation fails, so the
    # output is flushed deterministically.
    with args.infile, args.outfile:
        generator = FilterGenerator(args.infile,
                                    args.outfile,
                                    args.record_size)
        generator.generate()
|