1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
|
"""This module implements a loader and dumper for the svmlight format.

This format is a text-based format, with one sample per line. It does
not store zero-valued features and is hence suitable for sparse datasets.

The first element of each line can be used to store a target variable to
predict.

This format is used as the default format for both svmlight and the
libsvm command line programs.
"""
# Authors: Mathieu Blondel <mathieu@mblondel.org>
# Lars Buitinck <L.J.Buitinck@uva.nl>
# Olivier Grisel <olivier.grisel@ensta.org>
# License: Simple BSD.
import numpy as np
import scipy.sparse as sp
from ._svmlight_format import _load_svmlight_file
def load_svmlight_file(f, n_features=None, dtype=np.float64,
                       multilabel=False, zero_based="auto"):
    """Load a dataset in the svmlight / libsvm format into a sparse CSR matrix.

    The svmlight format is text-based, with one sample per line. Zero-valued
    features are not stored, which makes the format suitable for sparse
    datasets. The first element of each line can be used to store a target
    variable to predict. It is the default format of both the svmlight and
    the libsvm command line programs.

    Parsing a text-based source can be expensive. When working repeatedly
    on the same dataset, it is recommended to wrap this loader with
    joblib.Memory.cache to store a memmapped backup of the CSR results of
    the first call and benefit from the near instantaneous loading of
    memmapped structures for the subsequent calls.

    On large datasets a dedicated loader such as
    https://github.com/mblondel/svmlight-loader may be preferable.

    Parameters
    ----------
    f: str or file-like open in binary mode.
        (Path to) a file to load.

    n_features: int or None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have an example of
        every feature, hence the inferred shape might vary from one
        slice to another.

    dtype: numpy data type, default=np.float64
        Forwarded unchanged to the underlying parser; presumably the data
        type of the loaded feature values (confirm against the parser).

    multilabel: boolean, optional
        Samples may have several labels each (see
        http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)

    zero_based: boolean or "auto", optional
        Whether column indices in f are zero-based (True) or one-based
        (False). If set to "auto", a heuristic check is applied to determine
        this from the file contents. Both kinds of files occur "in the wild",
        but they are unfortunately not self-identifying. Using "auto" or True
        should always be safe.

    Returns
    -------
    (X, y)

    where X is a scipy.sparse matrix of shape (n_samples, n_features),
    y is a ndarray of shape (n_samples,), or, in the multilabel case,
    a list of tuples of length n_samples.

    See also
    --------
    load_svmlight_files: similar function for loading multiple files in this
    format, enforcing the same number of features/columns on all of them.
    """
    # A single input file produces exactly one (matrix, targets) pair.
    X, y = load_svmlight_files([f], n_features=n_features, dtype=dtype,
                               multilabel=multilabel, zero_based=zero_based)
    return X, y
def _open_and_load(f, dtype, multilabel, zero_based):
    """Parse ``f`` with ``_load_svmlight_file``, opening it first if needed.

    ``f`` is used directly when it exposes a ``read`` attribute; otherwise
    it is treated as a path, opened in binary mode for the duration of the
    parse and closed afterwards. The remaining arguments are forwarded to
    the parser unchanged, and the parser's result is returned as is.
    """
    if not hasattr(f, "read"):
        with open(f, "rb") as stream:
            return _load_svmlight_file(stream, dtype, multilabel, zero_based)
    return _load_svmlight_file(f, dtype, multilabel, zero_based)
def load_svmlight_files(files, n_features=None, dtype=np.float64,
                        multilabel=False, zero_based="auto"):
    """Load dataset from multiple files in SVMlight format.

    This function is equivalent to mapping load_svmlight_file over a list of
    files, except that the results are concatenated into a single, flat list
    and the samples vectors are constrained to all have the same number of
    features.

    Parameters
    ----------
    files : iterable over {str, file-like}
        (Paths to) files to load. File-likes must be open in binary mode.

    n_features: int or None
        The number of features to use. If None, it will be inferred from the
        maximum column index occurring in any of the files.

    dtype: numpy data type, default=np.float64
        Forwarded unchanged to the underlying parser; presumably the data
        type of the loaded feature values (confirm against the parser).

    multilabel: boolean, optional
        Samples may have several labels each (see
        http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)

    zero_based: boolean or "auto", optional
        Whether column indices in files are zero-based (True) or one-based
        (False). If set to "auto", a heuristic check is applied to determine
        this from the files' contents: indices are treated as one-based only
        if every file stores at least one feature and no file uses column
        index 0. Both kinds of files occur "in the wild", but they are
        unfortunately not self-identifying. Using "auto" or True should
        always be safe.

    Returns
    -------
    [X1, y1, ..., Xn, yn]

    where each (Xi, yi) pair is the result from load_svmlight_file(files[i]).
    An empty ``files`` iterable yields an empty list.

    Rationale
    ---------
    When fitting a model to a matrix X_train and evaluating it against a
    matrix X_test, it is essential that X_train and X_test have the same
    number of features (X_train.shape[1] == X_test.shape[1]). This may not
    be the case if you load the files individually with load_svmlight_file.

    See also
    --------
    load_svmlight_file
    """
    # The parser receives a plain boolean; note that bool("auto") is True.
    r = [_open_and_load(f, dtype, multilabel, bool(zero_based))
         for f in files]

    # A file without any stored feature has an empty index array, on which
    # np.min() raises; such a file offers no evidence of one-based indexing,
    # so under "auto" it prevents the shift.
    if zero_based is False or (
            zero_based == "auto"
            and all(len(indices) and np.min(indices) > 0
                    for _, indices, _, _ in r)):
        for _, indices, _, _ in r:
            # In-place shift of one-based column indices to zero-based.
            indices -= 1

    if n_features is None and r:
        # Empty index arrays contribute column 0 instead of raising in max().
        n_features = max(indices.max() if len(indices) else 0
                         for _, indices, _, _ in r) + 1

    result = []
    for data, indices, indptr, y in r:
        # indptr holds one entry per row plus a trailing sentinel.
        shape = (indptr.shape[0] - 1, n_features)
        result += sp.csr_matrix((data, indices, indptr), shape), y

    return result
def _dump_svmlight(X, y, f, zero_based):
if X.shape[0] != y.shape[0]:
raise ValueError("X.shape[0] and y.shape[0] should be the same, "
"got: %r and %r instead." % (X.shape[0], y.shape[0]))
is_sp = int(hasattr(X, "tocsr"))
one_based = not zero_based
for i in xrange(X.shape[0]):
s = u" ".join([u"%d:%f" % (j + one_based, X[i, j])
for j in X[i].nonzero()[is_sp]])
f.write((u"%f %s\n" % (y[i], s)).encode('ascii'))
def dump_svmlight_file(X, y, f, zero_based=True):
    """Dump the dataset in svmlight / libsvm file format.

    The svmlight format is text-based, with one sample per line. Zero-valued
    features are not stored, which makes the format suitable for sparse
    datasets. The first element of each line can be used to store a target
    variable to predict.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    f : str or file-like in binary mode
        If a string, it specifies the path that will contain the data; the
        file is opened in binary write mode and closed once writing is done.
        If f is a file-like (it has a ``write`` attribute), the data is
        written to it and it is left open.

    zero_based : boolean, optional
        Whether column indices should be written zero-based (True) or
        one-based (False).

    Raises
    ------
    ValueError
        If X and y do not have the same number of samples.
    """
    if not hasattr(f, "write"):
        # Treat f as a path and manage the file handle here.
        with open(f, "wb") as out:
            _dump_svmlight(X, y, out, zero_based)
        return
    _dump_svmlight(X, y, f, zero_based)
|