File: base.py

package info (click to toggle)
scikit-learn 0.20.2%2Bdfsg-6
links: PTS, VCS
area: main
in suites: buster
size: 51,036 kB
sloc: python: 108,171; ansic: 8,722; cpp: 5,651; makefile: 192; sh: 40
file content (90 lines) | stat: -rw-r--r-- 2,977 bytes
"""Helpers for preprocessing"""

import numpy as np
from scipy import sparse

from ..utils import check_array
from ..utils.validation import FLOAT_DTYPES
from ..externals import six


def _transform_selected(X, transform, dtype, selected="all", copy=True,
                        retain_order=False):
    """Apply a transform function to portion of selected features.

    Returns an array Xt, where the non-selected features appear on the right
    side (largest column indices) of Xt.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Dense array or sparse matrix.

    transform : callable
        A callable transform(X) -> X_transformed

    dtype : number type
        Desired dtype of output.

    copy : boolean, default=True
        Copy X even if it could be avoided.

    selected : "all" or array of indices or mask
        Specify which features to apply the transform to.

    retain_order : boolean, default=False
        If True, the non-selected features will not be displaced to the right
        side of the transformed array. The number of features in Xt must
        match the number of features in X. Furthermore, X and Xt cannot be
        sparse.

    Returns
    -------
    Xt : array or sparse matrix, shape=(n_samples, n_features_new)
    """
    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)

    if sparse.issparse(X) and retain_order:
        raise ValueError("The retain_order option can only be set to True "
                         "for dense matrices.")

    if isinstance(selected, six.string_types) and selected == "all":
        return transform(X)

    if len(selected) == 0:
        return X

    n_features = X.shape[1]
    ind = np.arange(n_features)
    sel = np.zeros(n_features, dtype=bool)
    sel[np.asarray(selected)] = True
    not_sel = np.logical_not(sel)
    n_selected = np.sum(sel)

    if n_selected == 0:
        # No features selected.
        return X
    elif n_selected == n_features:
        # All features selected.
        return transform(X)
    else:
        X_sel = transform(X[:, ind[sel]])
        # The columns of X which are not transformed need
        # to be casted to the desire dtype before concatenation.
        # Otherwise, the stacking will cast to the higher-precision dtype.
        X_not_sel = X[:, ind[not_sel]].astype(dtype)

    if retain_order:
        if X_sel.shape[1] + X_not_sel.shape[1] != n_features:
            raise ValueError("The retain_order option can only be set to True "
                             "if the dimensions of the input array match the "
                             "dimensions of the transformed array.")

        # Fancy indexing not supported for sparse matrices
        X[:, ind[sel]] = X_sel
        return X

    if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
        return sparse.hstack((X_sel, X_not_sel))
    else:
        return np.hstack((X_sel, X_not_sel))