1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
# Author: Andrew nystrom <awnystrom@gmail.com>
from scipy.sparse import csr_matrix
cimport numpy as cnp
cnp.import_array()
ctypedef cnp.int32_t INDEX_T
ctypedef fused DATA_T:
cnp.float32_t
cnp.float64_t
cnp.int32_t
cnp.int64_t
cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
INDEX_T interaction_only) nogil:
"""Compute the index of the column for a degree 2 expansion
d is the dimensionality of the input data, i and j are the indices
for the columns involved in the expansion.
"""
if interaction_only:
return d * i - (i**2 + 3 * i) / 2 - 1 + j
else:
return d * i - (i**2 + i) / 2 + j
cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
INDEX_T interaction_only) nogil:
"""Compute the index of the column for a degree 3 expansion
d is the dimensionality of the input data, i, j and k are the indices
for the columns involved in the expansion.
"""
if interaction_only:
return ((3 * d**2 * i - 3 * d * i**2 + i**3
+ 11 * i - 3 * j**2 - 9 * j) / 6
+ i**2 - 2 * d * i + d * j - d + k)
else:
return ((3 * d**2 * i - 3 * d * i**2 + i ** 3 - i
- 3 * j**2 - 3 * j) / 6
+ d * j + k)
def _csr_polynomial_expansion(cnp.ndarray[DATA_T, ndim=1] data,
cnp.ndarray[INDEX_T, ndim=1] indices,
cnp.ndarray[INDEX_T, ndim=1] indptr,
INDEX_T d, INDEX_T interaction_only,
INDEX_T degree):
"""
Perform a second-degree polynomial or interaction expansion on a scipy
compressed sparse row (CSR) matrix. The method used only takes products of
non-zero features. For a matrix with density d, this results in a speedup
on the order of d^k where k is the degree of the expansion, assuming all
rows are of similar density.
Parameters
----------
data : nd-array
The "data" attribute of the input CSR matrix.
indices : nd-array
The "indices" attribute of the input CSR matrix.
indptr : nd-array
The "indptr" attribute of the input CSR matrix.
d : int
The dimensionality of the input CSR matrix.
interaction_only : int
0 for a polynomial expansion, 1 for an interaction expansion.
degree : int
The degree of the expansion. This must be either 2 or 3.
References
----------
"Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
"""
assert degree in (2, 3)
if degree == 2:
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
else:
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
- interaction_only*d**2)
if expanded_dimensionality == 0:
return None
assert expanded_dimensionality > 0
cdef INDEX_T total_nnz = 0, row_i, nnz
# Count how many nonzero elements the expanded matrix will contain.
for row_i in range(indptr.shape[0]-1):
# nnz is the number of nonzero elements in this row.
nnz = indptr[row_i + 1] - indptr[row_i]
if degree == 2:
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
else:
total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
- interaction_only * nnz ** 2)
# Make the arrays that will form the CSR matrix of the expansion.
cdef cnp.ndarray[DATA_T, ndim=1] expanded_data = cnp.ndarray(
shape=total_nnz, dtype=data.dtype)
cdef cnp.ndarray[INDEX_T, ndim=1] expanded_indices = cnp.ndarray(
shape=total_nnz, dtype=indices.dtype)
cdef INDEX_T num_rows = indptr.shape[0] - 1
cdef cnp.ndarray[INDEX_T, ndim=1] expanded_indptr = cnp.ndarray(
shape=num_rows + 1, dtype=indptr.dtype)
cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
i_ptr, j_ptr, k_ptr, num_cols_in_row, \
expanded_column
with nogil:
expanded_indptr[0] = indptr[0]
for row_i in range(indptr.shape[0]-1):
row_starts = indptr[row_i]
row_ends = indptr[row_i + 1]
num_cols_in_row = 0
for i_ptr in range(row_starts, row_ends):
i = indices[i_ptr]
for j_ptr in range(i_ptr + interaction_only, row_ends):
j = indices[j_ptr]
if degree == 2:
col = _deg2_column(d, i, j, interaction_only)
expanded_indices[expanded_index] = col
expanded_data[expanded_index] = (
data[i_ptr] * data[j_ptr])
expanded_index += 1
num_cols_in_row += 1
else:
# degree == 3
for k_ptr in range(j_ptr + interaction_only,
row_ends):
k = indices[k_ptr]
col = _deg3_column(d, i, j, k, interaction_only)
expanded_indices[expanded_index] = col
expanded_data[expanded_index] = (
data[i_ptr] * data[j_ptr] * data[k_ptr])
expanded_index += 1
num_cols_in_row += 1
expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row
return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
shape=(num_rows, expanded_dimensionality))
|