1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
|
"""
Cython rewrite of the vector quantization module, originally written
in C at src/vq.c and the wrapper at src/vq_module.c. This should be
easier to maintain than old SWIG output.
Original C version by Damian Eads.
Translated to Cython by David Warde-Farley, October 2009.
"""
cimport cython
import numpy as np
cimport numpy as np
from cluster_blas cimport f_dgemm, f_sgemm
from libc.math cimport sqrt
ctypedef np.float64_t float64_t
ctypedef np.float32_t float32_t
ctypedef np.int32_t int32_t
# Use Cython fused types for templating
# Define supported data types as vq_type
ctypedef fused vq_type:
float32_t
float64_t
# When the number of features is less than this number,
# switch back to the naive algorithm to avoid high overhead.
DEF NFEATURES_CUTOFF=5
# Initialize the NumPy C API
np.import_array()
cdef inline vq_type vec_sqr(int n, vq_type *p):
cdef vq_type result = 0.0
cdef int i
for i in range(n):
result += p[i] * p[i]
return result
cdef inline void cal_M(int nobs, int ncodes, int nfeat, vq_type *obs,
vq_type *code_book, vq_type *M):
"""
Calculate M = obs * code_book.T
"""
cdef vq_type alpha = -2.0, beta = 0.0
# Call BLAS functions with Fortran ABI
# Note that BLAS Fortran ABI uses column-major order
if vq_type is float32_t:
f_sgemm("T", "N", &ncodes, &nobs, &nfeat,
&alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
else:
f_dgemm("T", "N", &ncodes, &nobs, &nfeat,
&alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
cdef int _vq(vq_type *obs, vq_type *code_book,
int ncodes, int nfeat, int nobs,
int32_t *codes, vq_type *low_dist) except -1:
"""
The underlying function (template) of _vq.vq.
Parameters
----------
obs : vq_type*
The pointer to the observation matrix.
code_book : vq_type*
The pointer to the code book matrix.
ncodes : int
The number of centroids (codes).
nfeat : int
The number of features of each observation.
nobs : int
The number of observations.
codes : vq_type*
The pointer to the new codes array.
low_dist : vq_type*
low_dist[i] is the Euclidean distance from obs[i] to the corresponding
centroid.
"""
# Naive algorithm is prefered when nfeat is small
if nfeat < NFEATURES_CUTOFF:
_vq_small_nf(obs, code_book, ncodes, nfeat, nobs, codes, low_dist)
return 0
cdef np.npy_intp i, j
cdef vq_type *p_obs
cdef vq_type *p_codes
cdef vq_type dist_sqr
cdef np.ndarray[vq_type, ndim=1] obs_sqr, codes_sqr
cdef np.ndarray[vq_type, ndim=2] M
if vq_type is float32_t:
obs_sqr = np.ndarray(nobs, np.float32)
codes_sqr = np.ndarray(ncodes, np.float32)
M = np.ndarray((nobs, ncodes), np.float32)
else:
obs_sqr = np.ndarray(nobs, np.float64)
codes_sqr = np.ndarray(ncodes, np.float64)
M = np.ndarray((nobs, ncodes), np.float64)
p_obs = obs
for i in range(nobs):
# obs_sqr[i] is the inner product of the i-th observation with itself
obs_sqr[i] = vec_sqr(nfeat, p_obs)
p_obs += nfeat
p_codes = code_book
for i in range(ncodes):
# codes_sqr[i] is the inner product of the i-th code with itself
codes_sqr[i] = vec_sqr(nfeat, p_codes)
p_codes += nfeat
# M[i][j] is the inner product of the i-th obs and j-th code
# M = obs * codes.T
cal_M(nobs, ncodes, nfeat, obs, code_book, <vq_type *>M.data)
for i in range(nobs):
for j in range(ncodes):
dist_sqr = (M[i, j] +
obs_sqr[i] + codes_sqr[j])
if dist_sqr < low_dist[i]:
codes[i] = j
low_dist[i] = dist_sqr
# dist_sqr may be negative due to float point errors
if low_dist[i] > 0:
low_dist[i] = sqrt(low_dist[i])
else:
low_dist[i] = 0
return 0
cdef void _vq_small_nf(vq_type *obs, vq_type *code_book,
int ncodes, int nfeat, int nobs,
int32_t *codes, vq_type *low_dist):
"""
Vector quantization using naive algorithm.
This is prefered when nfeat is small.
The parameters are the same as those of _vq.
"""
# Temporary variables
cdef vq_type dist_sqr, diff
cdef np.npy_intp i, j, k, obs_offset = 0, code_offset
# Index and pointer to keep track of the current position in
# both arrays so that we don't have to always do index * nfeat.
cdef vq_type *current_obs
cdef vq_type *current_code
for i in range(nobs):
code_offset = 0
current_obs = &(obs[obs_offset])
for j in range(ncodes):
dist_sqr = 0
current_code = &(code_book[code_offset])
# Distance between code_book[j] and obs[i]
for k in range(nfeat):
diff = current_code[k] - current_obs[k]
dist_sqr += diff * diff
code_offset += nfeat
# Replace the code assignment and record distance if necessary
if dist_sqr < low_dist[i]:
codes[i] = j
low_dist[i] = dist_sqr
low_dist[i] = sqrt(low_dist[i])
# Update the offset of the current observation
obs_offset += nfeat
def vq(np.ndarray obs, np.ndarray codes):
"""
Vector quantization ndarray wrapper. Only support float32 and float64.
Parameters
----------
obs : ndarray
The observation matrix. Each row is an observation.
codes : ndarray
The code book matrix.
Notes
-----
The observation matrix and code book matrix should have same ndim and
same number of columns (features). Only 1-dimensional and 2-dimensional
arrays are supported.
"""
cdef int nobs, ncodes, nfeat
cdef np.ndarray outcodes, outdists
# Ensure the arrays are contiguous
obs = np.ascontiguousarray(obs)
codes = np.ascontiguousarray(codes)
if obs.dtype != codes.dtype:
raise TypeError('observation and code should have same dtype')
if obs.dtype not in (np.float32, np.float64):
raise TypeError('type other than float or double not supported')
if obs.ndim != codes.ndim:
raise ValueError(
'observation and code should have same number of dimensions')
if obs.ndim == 1:
nfeat = 1
nobs = obs.shape[0]
ncodes = codes.shape[0]
elif obs.ndim == 2:
nfeat = obs.shape[1]
nobs = obs.shape[0]
ncodes = codes.shape[0]
if nfeat != codes.shape[1]:
raise ValueError('obs and code should have same number of '
'features (columns)')
else:
raise ValueError('ndim different than 1 or 2 are not supported')
# Initialize outdists and outcodes array.
# Outdists should be initialized as INF.
outdists = np.empty((nobs,), dtype=obs.dtype)
outcodes = np.empty((nobs,), dtype=np.int32)
outdists.fill(np.inf)
if obs.dtype.type is np.float32:
_vq(<float32_t *>obs.data, <float32_t *>codes.data,
ncodes, nfeat, nobs, <int32_t *>outcodes.data,
<float32_t *>outdists.data)
elif obs.dtype.type is np.float64:
_vq(<float64_t *>obs.data, <float64_t *>codes.data,
ncodes, nfeat, nobs, <int32_t *>outcodes.data,
<float64_t *>outdists.data)
return outcodes, outdists
@cython.cdivision(True)
cdef np.ndarray _update_cluster_means(vq_type *obs, int32_t *labels,
vq_type *cb, int nobs, int nc, int nfeat):
"""
The underlying function (template) of _vq.update_cluster_means.
Parameters
----------
obs : vq_type*
The pointer to the observation matrix.
labels : int32_t*
The pointer to the array of the labels (codes) of the observations.
cb : vq_type*
The pointer to the new code book matrix.
nobs : int
The number of observations.
nc : int
The number of centroids (codes).
nfeat : int
The number of features of each observation.
Returns
-------
has_members : ndarray
A boolean array indicating which clusters have members.
"""
cdef np.npy_intp i, j, cluster_size, label
cdef vq_type *obs_p
cdef vq_type *cb_p
cdef np.ndarray[int, ndim=1] obs_count
# Calculate the sums the numbers of obs in each cluster
obs_count = np.zeros(nc, np.intc)
obs_p = obs
for i in range(nobs):
label = labels[i]
cb_p = cb + nfeat * label
for j in range(nfeat):
cb_p[j] += obs_p[j]
# Count the obs in each cluster
obs_count[label] += 1
obs_p += nfeat
cb_p = cb
for i in range(nc):
cluster_size = obs_count[i]
if cluster_size > 0:
# Calculate the centroid of each cluster
for j in range(nfeat):
cb_p[j] /= cluster_size
cb_p += nfeat
# Return a boolean array indicating which clusters have members
return obs_count > 0
def update_cluster_means(np.ndarray obs, np.ndarray labels, int nc):
"""
The update-step of K-means. Calculate the mean of observations in each
cluster.
Parameters
----------
obs : ndarray
The observation matrix. Each row is an observation. Its dtype must be
float32 or float64.
labels : ndarray
The label of each observation. Must be an 1d array.
nc : int
The number of centroids.
Returns
-------
cb : ndarray
The new code book.
has_members : ndarray
A boolean array indicating which clusters have members.
Notes
-----
The empty clusters will be set to all zeros and the curresponding elements
in `has_members` will be `False`. The upper level function should decide
how to deal with them.
"""
cdef np.ndarray has_members, cb
cdef int nfeat
# Ensure the arrays are contiguous
obs = np.ascontiguousarray(obs)
labels = np.ascontiguousarray(labels)
if obs.dtype not in (np.float32, np.float64):
raise TypeError('type other than float or double not supported')
if labels.dtype.type is not np.int32:
labels = labels.astype(np.int32)
if labels.ndim != 1:
raise ValueError('labels must be an 1d array')
if obs.ndim == 1:
nfeat = 1
cb = np.zeros(nc, dtype=obs.dtype)
elif obs.ndim == 2:
nfeat = obs.shape[1]
cb = np.zeros((nc, nfeat), dtype=obs.dtype)
else:
raise ValueError('ndim different than 1 or 2 are not supported')
if obs.dtype.type is np.float32:
has_members = _update_cluster_means(<float32_t *>obs.data,
<int32_t *>labels.data,
<float32_t *>cb.data,
obs.shape[0], nc, nfeat)
elif obs.dtype.type is np.float64:
has_members = _update_cluster_means(<float64_t *>obs.data,
<int32_t *>labels.data,
<float64_t *>cb.data,
obs.shape[0], nc, nfeat)
return cb, has_members
|