File: test_murmurhash.py

package info (click to toggle)
scikit-learn 0.20.2%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,036 kB
  • sloc: python: 108,171; ansic: 8,722; cpp: 5,651; makefile: 192; sh: 40
file content (80 lines) | stat: -rw-r--r-- 2,853 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import numpy as np
from sklearn.externals.six import b, u
from sklearn.utils.murmurhash import murmurhash3_32
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
from sklearn.utils.testing import assert_equal, assert_true


def test_mmhash3_int():
    assert_equal(murmurhash3_32(3), 847579505)
    assert_equal(murmurhash3_32(3, seed=0), 847579505)
    assert_equal(murmurhash3_32(3, seed=42), -1823081949)

    assert_equal(murmurhash3_32(3, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=False), -1823081949)

    assert_equal(murmurhash3_32(3, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=True), 2471885347)


def test_mmhash3_int_array():
    rng = np.random.RandomState(42)
    keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = keys.reshape((3, 2, 1))

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(int(k), seed)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(k, seed, positive=True)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True),
                           expected)


def test_mmhash3_bytes():
    assert_equal(murmurhash3_32(b('foo'), 0), -156908512)
    assert_equal(murmurhash3_32(b('foo'), 42), -1322301282)

    assert_equal(murmurhash3_32(b('foo'), 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(b('foo'), 42, positive=True), 2972666014)


def test_mmhash3_unicode():
    assert_equal(murmurhash3_32(u('foo'), 0), -156908512)
    assert_equal(murmurhash3_32(u('foo'), 42), -1322301282)

    assert_equal(murmurhash3_32(u('foo'), 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(u('foo'), 42, positive=True), 2972666014)


def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert_true(h not in previous_hashes,
                    "Found collision on growing empty string")


def test_uniform_distribution():
    n_bins, n_samples = 10, 100000
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.full(n_bins, 1. / n_bins)

    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)