"""
===========================================
FeatureHasher and DictVectorizer Comparison
===========================================

In this example we illustrate text vectorization, which is the process of
representing non-numerical input data (such as dictionaries or text documents)
as vectors of real numbers.

We first compare :func:`~sklearn.feature_extraction.FeatureHasher` and
:func:`~sklearn.feature_extraction.DictVectorizer` by using both methods to
vectorize text documents that are preprocessed (tokenized) with the help of a
custom Python function.

Later we introduce and analyze the text-specific vectorizers
:func:`~sklearn.feature_extraction.text.HashingVectorizer`,
:func:`~sklearn.feature_extraction.text.CountVectorizer` and
:func:`~sklearn.feature_extraction.text.TfidfVectorizer` that handle both the
tokenization and the assembling of the feature matrix within a single class.

The objective of the example is to demonstrate the usage of the text
vectorization API and to compare the processing time of these methods. See the
example scripts
:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
and :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` for actual
learning on text documents.
"""
# Author: Lars Buitinck
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Arturo Amor <david-arturo.amor-quiroz@inria.fr>
# License: BSD 3 clause
# %%
# Load Data
# ---------
#
# We load data from :ref:`20newsgroups_dataset`, which comprises around
# 18000 newsgroups posts on 20 topics split in two subsets: one for training and
# one for testing. For the sake of simplicity and reducing the computational
# cost, we select a subset of 7 topics and use the training set only.
from sklearn.datasets import fetch_20newsgroups
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]
print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(
    subset="train", categories=categories, return_X_y=True
)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")
# %%
# Define preprocessing functions
# ------------------------------
#
# A token may be a word, part of a word, or anything delimited by spaces or
# symbols in a string. Here we define a function that extracts the tokens using
# a simple regular expression (regex) that matches Unicode word characters. This
# includes most characters that can be part of a word in any language, as well
# as numbers and the underscore:
import re
def tokenize(doc):
    """Extract tokens from doc.

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or
    TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


list(tokenize("This is a simple example, isn't it?"))
# %%
# We define an additional function that counts the (frequency of) occurrence of
# each token in a given document. It returns a frequency dictionary to be used
# by the vectorizers.
from collections import defaultdict
def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their occurrences."""
    freq = defaultdict(int)
    for tok in tokenize(doc):
        freq[tok] += 1
    return freq


token_freqs("That is one example, but this is another one")
# %%
# Observe in particular that the repeated token `"is"` is counted twice.
#
# Breaking a text document into word tokens, while potentially losing the
# order information between the words in a sentence, is often called a `Bag of
# Words representation <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.
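#
# For instance, the following two sentences contain the same words in a
# different order, yet they map to the exact same bag of words representation
# (a quick sanity check using the `token_freqs` function defined above):

token_freqs("one example is that") == token_freqs("that is one example")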
# %%
# DictVectorizer
# --------------
#
# First we benchmark the :func:`~sklearn.feature_extraction.DictVectorizer`,
# then we compare it to :func:`~sklearn.feature_extraction.FeatureHasher` as
# both of them receive dictionaries as input.
from time import time
from sklearn.feature_extraction import DictVectorizer
dict_count_vectorizers = defaultdict(list)
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# The actual mapping from text token to column index is explicitly stored in
# the `.vocabulary_` attribute which is a potentially very large Python
# dictionary:
type(vectorizer.vocabulary_)
# %%
len(vectorizer.vocabulary_)
# %%
vectorizer.vocabulary_["example"]
# %%
# FeatureHasher
# -------------
#
# Dictionaries take up a large amount of storage space and grow in size as the
# training set grows. Instead of growing the vectors along with a dictionary,
# feature hashing builds a vector of pre-defined length by applying a hash
# function `h` to the features (e.g., tokens), then using the hash values
# directly as feature indices and updating the resulting vector at those
# indices. When the feature space is not large enough, hashing functions tend to
# map distinct values to the same hash code (hash collisions). As a result, it
# is impossible to determine what object generated any particular hash code.
#
# Because of the above, it is impossible to recover the original tokens from
# the feature matrix, and the best approach to estimate the number of unique
# terms in the original dictionary is to count the number of active columns in
# the encoded feature matrix. For that purpose we define the following
# function:
import numpy as np


def n_nonzero_columns(X):
    """Number of columns with at least one non-zero value in a CSR matrix.

    This is useful to count the number of feature columns that are effectively
    active when using the FeatureHasher.
    """
    return len(np.unique(X.nonzero()[1]))
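# %%
# As a toy illustration of the hashing trick described above, here is a
# minimal sketch based on Python's built-in `hash` (not the signed hash
# function actually used by :func:`~sklearn.feature_extraction.FeatureHasher`):
# each token is mapped to a column index modulo a small, fixed number of
# features. With only 4 features and 7 distinct tokens, some tokens are
# guaranteed to collide and their counts simply add up at the same index.


def toy_hashing_vectorizer(freqs, n_features=4):
    """Accumulate token counts at hashed column indices (toy sketch only)."""
    vec = [0] * n_features
    for tok, count in freqs.items():
        vec[hash(tok) % n_features] += count
    return vec


toy_hashing_vectorizer(token_freqs("That is one example, but this is another one"))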
# %%
# The default number of features for the
# :func:`~sklearn.feature_extraction.FeatureHasher` is 2**20. Here we set
# `n_features = 2**18` to illustrate hash collisions.
#
# **FeatureHasher on frequency dictionaries**
from sklearn.feature_extraction import FeatureHasher
t0 = time()
hasher = FeatureHasher(n_features=2**18)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# The number of unique tokens when using the
# :func:`~sklearn.feature_extraction.FeatureHasher` is lower than the number
# of unique terms obtained with the
# :func:`~sklearn.feature_extraction.DictVectorizer`. This is due to hash
# collisions.
#
# The number of collisions can be reduced by increasing the feature space.
# Notice that the speed of the vectorizer does not change significantly when
# setting a larger number of features, though it leads to higher-dimensional
# coefficients in downstream models and therefore more memory to store them,
# even if a majority of those dimensions is inactive.
t0 = time()
hasher = FeatureHasher(n_features=2**22)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# We confirm that the number of unique tokens gets closer to the number of
# unique terms found by the :func:`~sklearn.feature_extraction.DictVectorizer`.
#
# **FeatureHasher on raw tokens**
#
# Alternatively, one can set `input_type="string"` in the
# :func:`~sklearn.feature_extraction.FeatureHasher` to directly vectorize the
# strings yielded by the customized `tokenize` function. This is equivalent to
# passing a dictionary with an implied frequency of 1 for each feature name.
t0 = time()
hasher = FeatureHasher(n_features=2**18, input_type="string")
X = hasher.transform(tokenize(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non raw tokens"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# We now plot the vectorization speed of the above methods.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 6))
y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")
# %%
# In both cases :func:`~sklearn.feature_extraction.FeatureHasher` is
# approximately twice as fast as
# :func:`~sklearn.feature_extraction.DictVectorizer`. This is handy when dealing
# with large amounts of data, with the downside of losing the invertibility of
# the transformation, which in turn makes the interpretation of a model a more
# complex task.
#
# The :func:`~sklearn.feature_extraction.FeatureHasher` with
# `input_type="string"` is slightly faster than the variant that works on
# frequency dictionaries because it does not count repeated tokens: each token
# is implicitly counted once, even if it was repeated. Depending on the
# downstream machine learning task, this can be a limitation or not.
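#
# To make the invertibility remark above concrete, here is a small sketch: a
# fitted :func:`~sklearn.feature_extraction.DictVectorizer` can map rows of
# the feature matrix back to the original token counts with
# `inverse_transform`, whereas a hashed feature matrix only exposes anonymous
# column indices.

dict_vectorizer = DictVectorizer()
X_toy = dict_vectorizer.fit_transform(
    [token_freqs("That is one example, but this is another one")]
)
dict_vectorizer.inverse_transform(X_toy)[0]

# %%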
#
# Comparison with special purpose text vectorizers
# ------------------------------------------------
#
# :func:`~sklearn.feature_extraction.text.CountVectorizer` accepts raw data as
# it internally implements tokenization and occurrence counting. It is similar
# to the :func:`~sklearn.feature_extraction.DictVectorizer` used along with
# the customized function `token_freqs` as done in the previous section,
# except that :func:`~sklearn.feature_extraction.text.CountVectorizer` is more
# flexible. In particular it accepts various regex patterns through the
# `token_pattern` parameter.
from sklearn.feature_extraction.text import CountVectorizer
t0 = time()
vectorizer = CountVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# We see that using the :func:`~sklearn.feature_extraction.text.CountVectorizer`
# implementation is approximately twice as fast as using the
# :func:`~sklearn.feature_extraction.DictVectorizer` along with the simple
# function we defined for mapping the tokens. The reason is that
# :func:`~sklearn.feature_extraction.text.CountVectorizer` is optimized by
# reusing a compiled regular expression for the full training set instead of
# creating one per document as done in our naive tokenize function.
#
# Now we run a similar experiment with the
# :func:`~sklearn.feature_extraction.text.HashingVectorizer`, which is
# equivalent to combining the “hashing trick” implemented by the
# :func:`~sklearn.feature_extraction.FeatureHasher` class and the text
# preprocessing and tokenization of the
# :func:`~sklearn.feature_extraction.text.CountVectorizer`.
from sklearn.feature_extraction.text import HashingVectorizer
t0 = time()
vectorizer = HashingVectorizer(n_features=2**18)
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
# %%
# We can observe that this is the fastest text tokenization strategy so far,
# assuming that the downstream machine learning task can tolerate a few
# collisions.
#
# TfidfVectorizer
# ---------------
#
# In a large text corpus, some words appear with higher frequency (e.g. “the”,
# “a”, “is” in English) and do not carry meaningful information about the actual
# contents of a document. If we were to feed the word count data directly to a
# classifier, those very common terms would shadow the frequencies of rarer yet
# more informative terms. In order to re-weight the count features into floating
# point values suitable for usage by a classifier it is very common to use the
# tf–idf transform as implemented by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`. TF stands for
# "term-frequency" while "tf–idf" means term-frequency times inverse
# document-frequency.
#
# We now benchmark the :func:`~sklearn.feature_extraction.text.TfidfVectorizer`,
# which is equivalent to combining the tokenization and occurrence counting of
# the :func:`~sklearn.feature_extraction.text.CountVectorizer` along with the
# normalizing and weighting from a
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
from sklearn.feature_extraction.text import TfidfVectorizer
t0 = time()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# Summary
# -------
# Let's conclude this notebook by summarizing all the recorded processing speeds
# in a single plot:
fig, ax = plt.subplots(figsize=(12, 6))
y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")
# %%
# Notice from the plot that
# :func:`~sklearn.feature_extraction.text.TfidfVectorizer` is slightly slower
# than :func:`~sklearn.feature_extraction.text.CountVectorizer` because of the
# extra operation induced by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
#
# Also notice that, by setting the number of features `n_features = 2**18`,
# the :func:`~sklearn.feature_extraction.text.HashingVectorizer` performs
# better than the :func:`~sklearn.feature_extraction.text.CountVectorizer` at
# the expense of the invertibility of the transformation due to hash
# collisions.
#
# We highlight that :func:`~sklearn.feature_extraction.text.CountVectorizer` and
# :func:`~sklearn.feature_extraction.text.HashingVectorizer` perform better than
# their equivalent :func:`~sklearn.feature_extraction.DictVectorizer` and
# :func:`~sklearn.feature_extraction.FeatureHasher` on manually tokenized
# documents since the internal tokenization step of the former vectorizers
# compiles a regular expression once and then reuses it for all the documents.