File: _hashing.pyx

package info (click to toggle)
scikit-learn 0.18-5
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 71,040 kB
  • ctags: 91,142
  • sloc: python: 97,257; ansic: 8,360; cpp: 5,649; makefile: 242; sh: 238
file content (85 lines) | stat: -rw-r--r-- 2,511 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Author: Lars Buitinck
# License: BSD 3 clause

import array
from cpython cimport array
cimport cython
from libc.stdlib cimport abs
cimport numpy as np
import numpy as np

from ..externals.six import string_types

from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32

# Initialise the NumPy C API; required in modules that `cimport numpy`
# before any C-level NumPy functionality is used.
np.import_array()


@cython.boundscheck(False)
@cython.cdivision(True)
def transform(raw_X, Py_ssize_t n_features, dtype):
    """Guts of FeatureHasher.transform.

    Parameters
    ----------
    raw_X : iterable of iterables of (feature, value) pairs
        One inner iterable per sample. ``feature`` must be a unicode or
        bytes string. If ``value`` is a string, the pair is hashed as the
        feature ``"<feature>=<value>"`` with value 1; otherwise ``value`` is
        converted to a C double. Pairs whose value equals 0 are skipped.
    n_features : integer
        Number of columns to hash into. Must be positive.
    dtype : NumPy dtype
        dtype of the returned ``values`` array.

    Returns
    -------
    indices : ndarray of int32
        Column index of every stored entry.
    indptr : ndarray of int32
        Row pointer array; it has one more element than there are samples.
    values : ndarray of ``dtype``
        Signed value of every stored entry.

    The three arrays are suitable for constructing a
    scipy.sparse.csr_matrix.

    Raises
    ------
    TypeError
        If a feature name is neither a unicode nor a bytes string.
    """
    assert n_features > 0

    cdef np.int32_t h
    # 64-bit holder for |h|: the absolute value of -2**31 does not fit in
    # an int32, and C abs() on it is undefined behaviour.
    cdef np.int64_t abs_h
    cdef double value

    cdef array.array indices
    cdef array.array indptr
    indices = array.array("i")
    indptr = array.array("i", [0])

    # Since Python array does not understand Numpy dtypes, we grow the indices
    # and values arrays ourselves. Use a Py_ssize_t capacity for safety, and
    # keep the entry counter just as wide so that the ``size == capacity``
    # growth check cannot be missed because of 32-bit wraparound.
    cdef Py_ssize_t capacity = 8192     # arbitrary
    cdef Py_ssize_t size = 0
    cdef np.ndarray values = np.empty(capacity, dtype=dtype)

    for x in raw_X:
        for f, v in x:
            if isinstance(v, string_types):
                # Categorical feature: hash "name=value" with weight 1.
                f = "%s%s%s" % (f, '=', v)
                value = 1
            else:
                value = v

            if value == 0:
                continue

            if isinstance(f, unicode):
                f = f.encode("utf-8")
            # Need explicit type check because Murmurhash does not propagate
            # all exceptions. Add "except *" there?
            elif not isinstance(f, bytes):
                raise TypeError("feature names must be strings")

            h = murmurhash3_bytes_s32(f, 0)

            # Take the absolute value in 64-bit arithmetic so that
            # h == -2**31 maps to a valid, non-negative column index
            # (with cdivision enabled, a negative dividend would yield a
            # negative remainder).
            abs_h = h
            if abs_h < 0:
                abs_h = -abs_h

            array.resize_smart(indices, len(indices) + 1)
            indices[len(indices) - 1] = abs_h % n_features
            # The sign of the hash decides the sign of the stored value.
            value *= (h >= 0) * 2 - 1
            values[size] = value
            size += 1

            if size == capacity:
                capacity *= 2
                # can't use resize member because there might be multiple
                # references to the arrays due to Cython's error checking
                values = np.resize(values, capacity)

        # Close the current row: record how many entries exist so far.
        array.resize_smart(indptr, len(indptr) + 1)
        indptr[len(indptr) - 1] = size

    if len(indices):
        indices_a = np.frombuffer(indices, dtype=np.int32)
    else:       # workaround for NumPy < 1.7.0
        indices_a = np.empty(0, dtype=np.int32)
    return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size])