1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
|
from __future__ import unicode_literals
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from nose.tools import assert_raises, assert_true
from numpy.testing import assert_array_equal, assert_equal
def test_feature_hasher_dicts():
h = FeatureHasher(n_features=16)
assert_equal("dict", h.input_type)
raw_X = [{"foo": "bar", "dada": 42, "tzara": 37},
{"foo": "baz", "gaga": u"string1"}]
X1 = FeatureHasher(n_features=16).transform(raw_X)
gen = (iter(d.items()) for d in raw_X)
X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
assert_array_equal(X1.toarray(), X2.toarray())
def test_feature_hasher_strings():
# mix byte and Unicode strings; note that "foo" is a duplicate in row 0
raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
["bar".encode("ascii"), "baz", "quux"]]
for lg_n_features in (7, 9, 11, 16, 22):
n_features = 2 ** lg_n_features
it = (x for x in raw_X) # iterable
h = FeatureHasher(n_features, non_negative=True, input_type="string")
X = h.transform(it)
assert_equal(X.shape[0], len(raw_X))
assert_equal(X.shape[1], n_features)
assert_true(np.all(X.data > 0))
assert_equal(X[0].sum(), 4)
assert_equal(X[1].sum(), 3)
assert_equal(X.nnz, 6)
def test_feature_hasher_pairs():
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
{"baz": 3, "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert_equal([1, 2], x1_nz)
assert_equal([1, 3, 4], x2_nz)
def test_feature_hasher_pairs_with_string_values():
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
{"baz": u"abc", "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert_equal([1, 1], x1_nz)
assert_equal([1, 1, 4], x2_nz)
raw_X = (iter(d.items()) for d in [{"bax": "abc"},
{"bax": "abc"}])
x1, x2 = h.transform(raw_X).toarray()
x1_nz = np.abs(x1[x1 != 0])
x2_nz = np.abs(x2[x2 != 0])
assert_equal([1], x1_nz)
assert_equal([1], x2_nz)
assert_equal(x1, x2)
def test_hash_empty_input():
n_features = 16
raw_X = [[], (), iter(range(0))]
h = FeatureHasher(n_features=n_features, input_type="string")
X = h.transform(raw_X)
assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
def test_hasher_invalid_input():
assert_raises(ValueError, FeatureHasher, input_type="gobbledygook")
assert_raises(ValueError, FeatureHasher, n_features=-1)
assert_raises(ValueError, FeatureHasher, n_features=0)
assert_raises(TypeError, FeatureHasher, n_features='ham')
h = FeatureHasher(n_features=np.uint16(2 ** 6))
assert_raises(ValueError, h.transform, [])
assert_raises(Exception, h.transform, [[5.5]])
assert_raises(Exception, h.transform, [[None]])
def test_hasher_set_params():
# Test delayed input validation in fit (useful for grid search).
hasher = FeatureHasher()
hasher.set_params(n_features=np.inf)
assert_raises(TypeError, hasher.fit)
def test_hasher_zeros():
# Assert that no zeros are materialized in the output.
X = FeatureHasher().transform([{'foo': 0}])
assert_equal(X.data.shape, (0,))
|