1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
|
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD-style.
from random import Random
import numpy as np
import scipy.sparse as sp
from nose.tools import assert_equal
from nose.tools import assert_true
from nose.tools import assert_false
from numpy.testing import assert_array_equal
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
def test_dictvectorizer():
D = [{"foo": 1, "bar": 3},
{"bar": 4, "baz": 2},
{"bar": 1, "quux": 1, "quuux": 2}]
for sparse in (True, False):
for dtype in (int, np.float32, np.int16):
v = DictVectorizer(sparse=sparse, dtype=dtype)
X = v.fit_transform(D)
assert_equal(sp.issparse(X), sparse)
assert_equal(X.shape, (3, 5))
assert_equal(X.sum(), 14)
assert_equal(v.inverse_transform(X), D)
if sparse:
# COO matrices can't be compared for equality
assert_array_equal(X.A, v.transform(D).A)
else:
assert_array_equal(X, v.transform(D))
def test_feature_selection():
# make two feature dicts with two useful features and a bunch of useless
# ones, in terms of chi2
d1 = dict([("useless%d" % i, 10) for i in xrange(20)],
useful1=1, useful2=20)
d2 = dict([("useless%d" % i, 10) for i in xrange(20)],
useful1=20, useful2=1)
for indices in (True, False):
v = DictVectorizer().fit([d1, d2])
X = v.transform([d1, d2])
sel = SelectKBest(chi2, k=2).fit(X, [0, 1])
v.restrict(sel.get_support(indices=indices), indices=indices)
assert_equal(v.get_feature_names(), ["useful1", "useful2"])
def test_one_of_k():
D_in = [{"version": "1", "ham": 2},
{"version": "2", "spam": .3},
{"version=3": True, "spam": -1}]
v = DictVectorizer()
X = v.fit_transform(D_in)
assert_equal(X.shape, (3, 5))
D_out = v.inverse_transform(X)
assert_equal(D_out[0], {"version=1": 1, "ham": 2})
names = v.get_feature_names()
assert_true("version=2" in names)
assert_false("version" in names)
def test_unseen_features():
D = [{"camelot": 0, "spamalot": 1}]
v = DictVectorizer(sparse=False).fit(D)
X = v.transform({"push the pram a lot": 2})
assert_array_equal(X, np.zeros((1, 2)))
def test_deterministic_vocabulary():
# Generate equal dictionaries with different memory layouts
items = [("%03d" % i, i) for i in range(1000)]
rng = Random(42)
d_sorted = dict(items)
rng.shuffle(items)
d_shuffled = dict(items)
# check that the memory layout does not impact the resulting vocabulary
v_1 = DictVectorizer().fit([d_sorted])
v_2 = DictVectorizer().fit([d_shuffled])
assert_equal(v_1.vocabulary_, v_2.vocabulary_)
|