1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312
|
# cython: boundscheck=False
# cython: wraparound=False
#
# Author: Arnaud Joly
#
# License: BSD 3 clause
"""
Random utility function
=======================
This module complements missing features of ``numpy.random``.
The module contains:
* Several algorithms to sample integers without replacement.
"""
from __future__ import division
cimport cython
import numpy as np
cimport numpy as np
np.import_array()
from sklearn.utils import check_random_state
cpdef _sample_without_replacement_check_input(np.int_t n_population,
np.int_t n_samples):
""" Check that input are consistent for sample_without_replacement"""
if n_population < 0:
raise ValueError('n_population should be greater than 0, got %s.'
% n_population)
if n_samples > n_population:
raise ValueError('n_population should be greater or equal than '
'n_samples, got n_samples > n_population (%s > %s)'
% (n_samples, n_population))
cpdef _sample_without_replacement_with_tracking_selection(
np.int_t n_population,
np.int_t n_samples,
random_state=None):
r"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity:
- Worst-case: unbounded
- Average-case:
O(O(np.random.randint) * \sum_{i=1}^n_samples 1 /
(1 - i / n_population)))
<= O(O(np.random.randint) *
n_population * ln((n_population - 2)
/(n_population - 1 - n_samples)))
<= O(O(np.random.randint) *
n_population * 1 / (1 - n_samples / n_population))
Space complexity of O(n_samples) in a python set.
Parameters
----------
n_population : int,
The size of the set to sample from.
n_samples : int,
The number of integer to sample.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : array of size (n_samples, )
The sampled subsets of integer.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef np.int_t i
cdef np.int_t j
cdef np.ndarray[np.int_t, ndim=1] out = np.empty((n_samples, ),
dtype=np.int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# The following line of code are heavily inspired from python core,
# more precisely of random.sample.
cdef set selected = set()
for i in range(n_samples):
j = rng_randint(n_population)
while j in selected:
j = rng_randint(n_population)
selected.add(j)
out[i] = j
return out
cpdef _sample_without_replacement_with_pool(np.int_t n_population,
np.int_t n_samples,
random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity: O(n_population + O(np.random.randint) * n_samples)
Space complexity of O(n_population + n_samples).
Parameters
----------
n_population : int,
The size of the set to sample from.
n_samples : int,
The number of integer to sample.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : array of size (n_samples, )
The sampled subsets of integer.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef np.int_t i
cdef np.int_t j
cdef np.ndarray[np.int_t, ndim=1] out = np.empty((n_samples, ),
dtype=np.int)
cdef np.ndarray[np.int_t, ndim=1] pool = np.empty((n_population, ),
dtype=np.int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# Initialize the pool
for i in xrange(n_population):
pool[i] = i
# The following line of code are heavily inspired from python core,
# more precisely of random.sample.
for i in xrange(n_samples):
j = rng_randint(n_population - i) # invariant: non-selected at [0,n-i)
out[i] = pool[j]
pool[j] = pool[n_population - i - 1] # move non-selected item into
# vacancy
return out
cpdef _sample_without_replacement_with_reservoir_sampling(
np.int_t n_population,
np.int_t n_samples,
random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity of
O((n_population - n_samples) * O(np.random.randint) + n_samples)
Space complexity of O(n_samples)
Parameters
----------
n_population : int,
The size of the set to sample from.
n_samples : int,
The number of integer to sample.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : array of size (n_samples, )
The sampled subsets of integer. The order of the items is not
necessarily random. Use a random permutation of the array if the order
of the items has to be randomized.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef np.int_t i
cdef np.int_t j
cdef np.ndarray[np.int_t, ndim=1] out = np.empty((n_samples, ),
dtype=np.int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# This cython implementation is based on the one of Robert Kern:
# http://mail.scipy.org/pipermail/numpy-discussion/2010-December/
# 054289.html
#
for i in range(n_samples):
out[i] = i
for i from n_samples <= i < n_population:
j = rng_randint(0, i + 1)
if j < n_samples:
out[j] = i
return out
cpdef sample_without_replacement(np.int_t n_population,
np.int_t n_samples,
method="auto",
random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Parameters
----------
n_population : int,
The size of the set to sample from.
n_samples : int,
The number of integer to sample.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
method : "auto", "tracking_selection", "reservoir_sampling" or "pool"
If method == "auto", the ratio of n_samples / n_population is used
to determine which algorithm to use:
If ratio is between 0 and 0.01, tracking selection is used.
If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
If ratio is greater than 0.99, reservoir sampling is used.
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method =="tracking_selection", a set based implementation is used
which is suitable for `n_samples` <<< `n_population`.
If method == "reservoir_sampling", a reservoir sampling algorithm is
used which is suitable for high memory constraint or when
O(`n_samples`) ~ O(`n_population`).
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method == "pool", a pool based algorithm is particularly fast, even
faster than the tracking selection method. Hovewer, a vector containing
the entire population has to be initialized.
If n_samples ~ n_population, the reservoir sampling method is faster.
Returns
-------
out : array of size (n_samples, )
The sampled subsets of integer. The subset of selected integer might
not be randomized, see the method argument.
"""
_sample_without_replacement_check_input(n_population, n_samples)
all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
ratio = n_samples / n_population if n_population != 0.0 else 1.0
# Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99
if method == "auto" and ratio > 0.01 and ratio < 0.99:
rng = check_random_state(random_state)
return rng.permutation(n_population)[:n_samples]
if method == "auto" or method == "tracking_selection":
# TODO the pool based method can also be used.
# however, it requires special benchmark to take into account
# the memory requirement of the array vs the set.
# The value 0.2 has been determined through benchmarking.
if ratio < 0.2:
return _sample_without_replacement_with_tracking_selection(
n_population, n_samples, random_state)
else:
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "reservoir_sampling":
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "pool":
return _sample_without_replacement_with_pool(n_population, n_samples,
random_state)
else:
raise ValueError('Expected a method name in %s, got %s. '
% (all_methods, method))
|