File: murmurhash.pyx

package info (click to toggle)
scikit-learn 0.11.0-2%2Bdeb7u1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 13,900 kB
  • sloc: python: 34,740; ansic: 8,860; cpp: 8,849; pascal: 230; makefile: 211; sh: 14
file content (124 lines) | stat: -rw-r--r-- 4,369 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Cython wrapper for MurmurHash3 non-cryptographic hash function

MurmurHash is an extensively tested and very fast hash function that has
good distribution properties suitable for machine learning use cases
such as feature hashing and random projections.

The original C++ code by Austin Appleby is released into the public domain
and can be found here:

  https://code.google.com/p/smhasher/

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD Style.

cimport cython
cimport numpy as np
import numpy as np


cpdef np.uint32_t murmurhash3_int_u32(int key, unsigned int seed):
    """Return the unsigned 32bit murmurhash3 of the C int ``key`` at ``seed``.

    The raw bytes of ``key`` are passed to MurmurHash3_x86_32 and the
    resulting hash is returned as an unsigned 32bit integer.
    """
    cdef np.uint32_t digest
    # Hash the in-memory representation of the integer argument.
    MurmurHash3_x86_32(&key, sizeof(key), seed, &digest)
    return digest


cpdef np.int32_t murmurhash3_int_s32(int key, unsigned int seed):
    """Return the signed 32bit murmurhash3 of the C int ``key`` at ``seed``.

    The raw bytes of ``key`` are passed to MurmurHash3_x86_32 and the
    resulting hash is returned as a signed 32bit integer.
    """
    cdef np.int32_t digest
    # Hash the in-memory representation of the integer argument.
    MurmurHash3_x86_32(&key, sizeof(key), seed, &digest)
    return digest


cpdef np.uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed):
    """Return the unsigned 32bit murmurhash3 of the bytes ``key`` at ``seed``.

    The whole content of ``key`` (``len(key)`` bytes) is passed to
    MurmurHash3_x86_32 and the hash is returned as an unsigned 32bit
    integer.
    """
    cdef np.uint32_t digest
    # Take the buffer pointer first, then its length, as two explicit steps.
    cdef char* raw = <char*> key
    cdef Py_ssize_t n_bytes = len(key)
    MurmurHash3_x86_32(raw, n_bytes, seed, &digest)
    return digest


cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed):
    """Return the signed 32bit murmurhash3 of the bytes ``key`` at ``seed``.

    The whole content of ``key`` (``len(key)`` bytes) is passed to
    MurmurHash3_x86_32 and the hash is returned as a signed 32bit
    integer.
    """
    cdef np.int32_t digest
    # Take the buffer pointer first, then its length, as two explicit steps.
    cdef char* raw = <char*> key
    cdef Py_ssize_t n_bytes = len(key)
    MurmurHash3_x86_32(raw, n_bytes, seed, &digest)
    return digest


@cython.boundscheck(False)
cpdef np.ndarray[np.uint32_t, ndim=1] murmurhash3_bytes_array_u32(
    np.ndarray[np.int32_t] key, unsigned int seed):
    """Hash every element of an int32 array with murmurhash3 at ``seed``.

    Each entry of ``key`` is hashed with ``murmurhash3_int_u32``; the
    hashes are returned in a newly allocated uint32 array with one entry
    per key.
    """
    # TODO make it possible to pass preallocated output array
    cdef Py_ssize_t idx
    cdef np.ndarray[np.uint32_t, ndim=1] hashes = np.zeros(
        key.size, np.uint32)
    for idx in range(key.shape[0]):
        hashes[idx] = murmurhash3_int_u32(key[idx], seed)
    return hashes


@cython.boundscheck(False)
cpdef np.ndarray[np.int32_t, ndim=1] murmurhash3_bytes_array_s32(
    np.ndarray[np.int32_t] key, unsigned int seed):
    """Hash every element of an int32 array with murmurhash3 at ``seed``.

    Each entry of ``key`` is hashed with ``murmurhash3_int_s32``; the
    hashes are returned in a newly allocated int32 array with one entry
    per key.
    """
    # TODO make it possible to pass preallocated output array
    cdef Py_ssize_t idx
    cdef np.ndarray[np.int32_t, ndim=1] hashes = np.zeros(
        key.size, np.int32)
    for idx in range(key.shape[0]):
        hashes[idx] = murmurhash3_int_s32(key[idx], seed)
    return hashes


def murmurhash3_32(key, seed=0, positive=False):
    """Hash ``key`` with the 32bit murmurhash3 function at ``seed``.

    The underlying implementation is MurmurHash3_x86_32, generating low
    latency 32bit hashes suitable for implementing lookup tables, Bloom
    filters, count-min sketches or feature hashing.

    Parameters
    ----------
    key: int32, bytes, unicode or ndarray with dtype int32
        The object to hash. Unicode keys are encoded to UTF-8 before
        being hashed. Arrays are hashed element by element and the
        result is reshaped to the shape of the input.

    seed: int, optional default is 0
        Integer seed for the hashing algorithm.

    positive: boolean, optional default is False
        True: the result is cast to an unsigned int
          from 0 to 2 ** 32 - 1
        False: the result is cast to a signed int
          from -(2 ** 31) to 2 ** 31 - 1

    Returns
    -------
    The hash of ``key``: a single integer for bytes, unicode and integer
    keys, or an ndarray of hashes (dtype uint32 if ``positive`` is true,
    int32 otherwise) for ndarray keys.

    Raises
    ------
    TypeError
        If ``key`` is an ndarray whose dtype is not int32, or if the type
        of ``key`` is not one of the supported types.

    """
    # Byte strings are hashed as they are.
    if isinstance(key, bytes):
        if positive:
            return murmurhash3_bytes_u32(key, seed)
        return murmurhash3_bytes_s32(key, seed)

    # Text is hashed through its UTF-8 encoding.
    if isinstance(key, unicode):
        if positive:
            return murmurhash3_bytes_u32(key.encode('utf-8'), seed)
        return murmurhash3_bytes_s32(key.encode('utf-8'), seed)

    # Scalar integers are cast to a 32bit int before hashing.
    if isinstance(key, (int, np.int32)):
        if positive:
            return murmurhash3_int_u32(<np.int32_t>key, seed)
        return murmurhash3_int_s32(<np.int32_t>key, seed)

    # Arrays are flattened, hashed element-wise, then given back their shape.
    if isinstance(key, np.ndarray):
        if key.dtype != np.int32:
            raise TypeError(
                "key.dtype should be int32, got %s" % key.dtype)
        if positive:
            return murmurhash3_bytes_array_u32(
                key.ravel(), seed).reshape(key.shape)
        return murmurhash3_bytes_array_s32(
            key.ravel(), seed).reshape(key.shape)

    # Anything else must be converted to bytes by the caller.
    raise TypeError(
        "key %r with type %s is not supported. "
        "Explicit conversion to bytes is required" % (key, type(key)))