1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
|
# Author: Hamzeh Alsalhi <ha258@cornell.edu>
#
# License: BSD 3 clause
from __future__ import division
import numpy as np
import scipy.sparse as sp
import array
from sklearn.utils import check_random_state
from ._random import sample_without_replacement
from .deprecation import deprecated
__all__ = ['sample_without_replacement', 'choice']
# This is a backport of np.random.choice from numpy 1.7
# The function can be removed when we bump the requirements to >=1.7
@deprecated("sklearn.utils.random.choice was deprecated in version 0.19 "
"and will be removed in 0.21. Use np.random.choice or "
"np.random.RandomState.choice instead.")
def choice(a, size=None, replace=True, p=None, random_state=None):
"""
choice(a, size=None, replace=True, p=None)
Generates a random sample from a given 1-D array
.. versionadded:: 1.7.0
Parameters
-----------
a : 1-D array-like or int
If an ndarray, a random sample is generated from its elements.
If an int, the random sample is generated as if a was np.arange(n)
size : int or tuple of ints, optional
Output shape. Default is None, in which case a single value is
returned.
replace : boolean, optional
Whether the sample is with or without replacement.
p : 1-D array-like, optional
The probabilities associated with each entry in a.
If not given the sample assumes a uniform distribution over all
entries in a.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
--------
samples : 1-D ndarray, shape (size,)
The generated random samples
Raises
-------
ValueError
If a is an int and less than zero, if a or p are not 1-dimensional,
if a is an array-like of size 0, if p is not a vector of
probabilities, if a and p have different lengths, or if
replace=False and the sample size is greater than the population
size
See Also
---------
randint, shuffle, permutation
Examples
---------
Generate a uniform random sample from np.arange(5) of size 3:
>>> np.random.choice(5, 3) # doctest: +SKIP
array([0, 3, 4])
>>> #This is equivalent to np.random.randint(0,5,3)
Generate a non-uniform random sample from np.arange(5) of size 3:
>>> np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]) # doctest: +SKIP
array([3, 3, 0])
Generate a uniform random sample from np.arange(5) of size 3 without
replacement:
>>> np.random.choice(5, 3, replace=False) # doctest: +SKIP
array([3,1,0])
>>> #This is equivalent to np.random.shuffle(np.arange(5))[:3]
Generate a non-uniform random sample from np.arange(5) of size
3 without replacement:
>>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0])
... # doctest: +SKIP
array([2, 3, 0])
Any of the above can be repeated with an arbitrary array-like
instead of just integers. For instance:
>>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
>>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3])
... # doctest: +SKIP
array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'],
dtype='|S11')
"""
if random_state is not None:
random_state = check_random_state(random_state)
return random_state.choice(a, size, replace, p)
else:
return np.random.choice(a, size, replace, p)
def random_choice_csc(n_samples, classes, class_probability=None,
random_state=None):
"""Generate a sparse random matrix given column class distributions
Parameters
----------
n_samples : int,
Number of samples to draw in each column.
classes : list of size n_outputs of arrays of size (n_classes,)
List of classes for each column.
class_probability : list of size n_outputs of arrays of size (n_classes,)
Optional (default=None). Class distribution of each column. If None the
uniform distribution is assumed.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
random_matrix : sparse csc matrix of size (n_samples, n_outputs)
"""
data = array.array('i')
indices = array.array('i')
indptr = array.array('i', [0])
for j in range(len(classes)):
classes[j] = np.asarray(classes[j])
if classes[j].dtype.kind != 'i':
raise ValueError("class dtype %s is not supported" %
classes[j].dtype)
classes[j] = classes[j].astype(np.int64, copy=False)
# use uniform distribution if no class_probability is given
if class_probability is None:
class_prob_j = np.empty(shape=classes[j].shape[0])
class_prob_j.fill(1 / classes[j].shape[0])
else:
class_prob_j = np.asarray(class_probability[j])
if not np.isclose(np.sum(class_prob_j), 1.0):
raise ValueError("Probability array at index {0} does not sum to "
"one".format(j))
if class_prob_j.shape[0] != classes[j].shape[0]:
raise ValueError("classes[{0}] (length {1}) and "
"class_probability[{0}] (length {2}) have "
"different length.".format(j,
classes[j].shape[0],
class_prob_j.shape[0]))
# If 0 is not present in the classes insert it with a probability 0.0
if 0 not in classes[j]:
classes[j] = np.insert(classes[j], 0, 0)
class_prob_j = np.insert(class_prob_j, 0, 0.0)
# If there are nonzero classes choose randomly using class_probability
rng = check_random_state(random_state)
if classes[j].shape[0] > 1:
p_nonzero = 1 - class_prob_j[classes[j] == 0]
nnz = int(n_samples * p_nonzero)
ind_sample = sample_without_replacement(n_population=n_samples,
n_samples=nnz,
random_state=random_state)
indices.extend(ind_sample)
# Normalize probabilities for the nonzero elements
classes_j_nonzero = classes[j] != 0
class_probability_nz = class_prob_j[classes_j_nonzero]
class_probability_nz_norm = (class_probability_nz /
np.sum(class_probability_nz))
classes_ind = np.searchsorted(class_probability_nz_norm.cumsum(),
rng.rand(nnz))
data.extend(classes[j][classes_j_nonzero][classes_ind])
indptr.append(len(indices))
return sp.csc_matrix((data, indices, indptr),
(n_samples, len(classes)),
dtype=int)
|