1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
|
"""Class to perform over-sampling using ADASYN."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
import numbers
import warnings
import numpy as np
from scipy import sparse
from sklearn.utils import _safe_indexing, check_random_state
from ..utils import Substitution, check_neighbors_object
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import HasMethods, Interval
from .base import BaseOverSampler
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class ADASYN(BaseOverSampler):
"""Oversample using Adaptive Synthetic (ADASYN) algorithm.
This method is similar to SMOTE but it generates different number of
samples depending on an estimate of the local distribution of the class
to be oversampled.
Read more in the :ref:`User Guide <smote_adasyn>`.
Parameters
----------
{sampling_strategy}
{random_state}
n_neighbors : int or estimator object, default=5
The nearest neighbors used to define the neighborhood of samples to use
to generate the synthetic samples. You can pass:
- an `int` corresponding to the number of neighbors to use. A
`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
case.
- an instance of a compatible nearest neighbors algorithm that should
implement both methods `kneighbors` and `kneighbors_graph`. For
instance, it could correspond to a
:class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
any compatible class.
{n_jobs}
.. deprecated:: 0.10
`n_jobs` has been deprecated in 0.10 and will be removed in 0.12.
It was previously used to set `n_jobs` of nearest neighbors
algorithm. From now on, you can pass an estimator where `n_jobs` is
already set instead.
Attributes
----------
sampling_strategy_ : dict
Dictionary containing the information to sample the dataset. The keys
corresponds to the class labels from which to sample and the values
are the number of samples to sample.
nn_ : estimator object
Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`.
n_features_in_ : int
Number of features in the input dataset.
.. versionadded:: 0.9
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during `fit`. Defined only when `X` has feature
names that are all strings.
.. versionadded:: 0.10
See Also
--------
SMOTE : Over-sample using SMOTE.
SMOTENC : Over-sample using SMOTE for continuous and categorical features.
SMOTEN : Over-sample using the SMOTE variant specifically for categorical
features only.
SVMSMOTE : Over-sample using SVM-SMOTE variant.
BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.
Notes
-----
The implementation is based on [1]_.
Supports multi-class resampling. A one-vs.-rest scheme is used.
References
----------
.. [1] He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. "ADASYN:
Adaptive synthetic sampling approach for imbalanced learning," In IEEE
International Joint Conference on Neural Networks (IEEE World Congress
on Computational Intelligence), pp. 1322-1328, 2008.
Examples
--------
>>> from collections import Counter
>>> from sklearn.datasets import make_classification
>>> from imblearn.over_sampling import ADASYN
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=20, n_clusters_per_class=1, n_samples=1000,
... random_state=10)
>>> print('Original dataset shape %s' % Counter(y))
Original dataset shape Counter({{1: 900, 0: 100}})
>>> ada = ADASYN(random_state=42)
>>> X_res, y_res = ada.fit_resample(X, y)
>>> print('Resampled dataset shape %s' % Counter(y_res))
Resampled dataset shape Counter({{0: 904, 1: 900}})
"""
_parameter_constraints: dict = {
**BaseOverSampler._parameter_constraints,
"n_neighbors": [
Interval(numbers.Integral, 1, None, closed="left"),
HasMethods(["kneighbors", "kneighbors_graph"]),
],
"n_jobs": [numbers.Integral, None],
}
def __init__(
self,
*,
sampling_strategy="auto",
random_state=None,
n_neighbors=5,
n_jobs=None,
):
super().__init__(sampling_strategy=sampling_strategy)
self.random_state = random_state
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
def _validate_estimator(self):
"""Create the necessary objects for ADASYN"""
self.nn_ = check_neighbors_object(
"n_neighbors", self.n_neighbors, additional_neighbor=1
)
def _fit_resample(self, X, y):
# FIXME: to be removed in 0.12
if self.n_jobs is not None:
warnings.warn(
"The parameter `n_jobs` has been deprecated in 0.10 and will be "
"removed in 0.12. You can pass an nearest neighbors estimator where "
"`n_jobs` is already set instead.",
FutureWarning,
)
self._validate_estimator()
random_state = check_random_state(self.random_state)
X_resampled = [X.copy()]
y_resampled = [y.copy()]
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = _safe_indexing(X, target_class_indices)
self.nn_.fit(X)
nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
# The ratio is computed using a one-vs-rest manner. Using majority
# in multi-class would lead to slightly different results at the
# cost of introducing a new parameter.
n_neighbors = self.nn_.n_neighbors - 1
ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors
if not np.sum(ratio_nn):
raise RuntimeError(
"Not any neigbours belong to the majority"
" class. This case will induce a NaN case"
" with a division by zero. ADASYN is not"
" suited for this specific dataset."
" Use SMOTE instead."
)
ratio_nn /= np.sum(ratio_nn)
n_samples_generate = np.rint(ratio_nn * n_samples).astype(int)
# rounding may cause new amount for n_samples
n_samples = np.sum(n_samples_generate)
if not n_samples:
raise ValueError(
"No samples will be generated with the provided ratio settings."
)
# the nearest neighbors need to be fitted only on the current class
# to find the class NN to generate new samples
self.nn_.fit(X_class)
nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
enumerated_class_indices = np.arange(len(target_class_indices))
rows = np.repeat(enumerated_class_indices, n_samples_generate)
cols = random_state.choice(n_neighbors, size=n_samples)
diffs = X_class[nns[rows, cols]] - X_class[rows]
steps = random_state.uniform(size=(n_samples, 1))
if sparse.issparse(X):
sparse_func = type(X).__name__
steps = getattr(sparse, sparse_func)(steps)
X_new = X_class[rows] + steps.multiply(diffs)
else:
X_new = X_class[rows] + steps * diffs
X_new = X_new.astype(X.dtype)
y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype)
X_resampled.append(X_new)
y_resampled.append(y_new)
if sparse.issparse(X):
X_resampled = sparse.vstack(X_resampled, format=X.format)
else:
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)
return X_resampled, y_resampled
def _more_tags(self):
return {
"X_types": ["2darray"],
}
|