# cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False, profile=False
cimport cython
cimport numpy as np
from libc.stdint cimport int32_t, uint32_t, uint64_t
from typing import Callable, Optional, Tuple
from ..backends import CupyOps, NumpyOps
from ..config import registry
from ..model import Model
from ..types import ArrayXd
from ..util import get_array_module, get_width, is_cupy_array, is_numpy_array
# Input: a (keys, values, lengths) triple — hashed feature ids, their values,
# and per-example feature counts. Output: a dense scores array.
InT = Tuple[ArrayXd, ArrayXd, ArrayXd]
OutT = ArrayXd
@cython.binding(True)
@registry.layers("SparseLinear.v1")
def SparseLinear(nO: Optional[int] = None, length: int = 2 ** 18):
    """Create a sparse linear layer over hashed features.

    This is the legacy (v1) variant: its indexing scheme only addresses a
    subset of the weight matrix and is kept for backwards compatibility.
    """
    # NB: We can't have generic return type annotation if we want function to
    # be bound (and inspectable): https://github.com/cython/cython/issues/2753
    params = {"W": None, "b": None}
    dims = {"nO": nO, "length": length}
    return Model(
        "sparse_linear",
        forward,
        init=init,
        attrs={"v1_indexing": True},
        params=params,
        dims=dims,
    )
@cython.binding(True)
@registry.layers("SparseLinear.v2")
def SparseLinear_v2(nO: Optional[int] = None, length: int = 2 ** 18):
    """Create a sparse linear layer over hashed features.

    Uses the corrected (v2) indexing scheme, which addresses the full
    weight matrix.
    """
    # NB: We can't have generic return type annotation if we want function to
    # be bound (and inspectable): https://github.com/cython/cython/issues/2753
    params = {"W": None, "b": None}
    dims = {"nO": nO, "length": length}
    return Model(
        "sparse_linear",
        forward,
        init=init,
        attrs={"v1_indexing": False},
        params=params,
        dims=dims,
    )
@cython.binding(True)
def forward(model: Model, keys_values_lengths: InT, is_train: bool) -> Tuple[OutT, Callable]:
    # NB: We can't have generic Model annotation if we want function to
    # be bound (and inspectable): https://github.com/cython/cython/issues/2753
    keys, values, lengths = keys_values_lengths
    if not is_cupy_array(keys):
        return _begin_cpu_update(model, keys, values, lengths)
    # Currently we don't have a GPU-compatible implementation of this function :(
    # It sucks, but at least we can get the correct result by copying to CPU.
    return _begin_gpu_update(model, keys, values, lengths)
def init(model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None) -> Model[InT, OutT]:
    """Allocate the layer's parameters, inferring "nO" from Y when given."""
    if Y is not None:
        model.set_dim("nO", get_width(Y))
    nO, length = model.get_dim("nO"), model.get_dim("length")
    # Weights are stored as one flat float32 buffer of nO * length entries;
    # the bias is one float32 per output class.
    model.set_param("W", model.ops.alloc((length * nO,), dtype="f"))
    model.set_param("b", model.ops.alloc((nO,), dtype="f"))
    return model
def _begin_gpu_update(model: Model[InT, OutT], keys: ArrayXd, values: ArrayXd, lengths: ArrayXd) -> Tuple[ArrayXd, Callable]:
    """Run the CPU implementation on device inputs via a host round-trip."""
    xp = get_array_module(keys)
    # Copy the inputs to the host, compute there, then move the scores back
    # to the device the inputs came from.
    scores_cpu, cpu_backprop = _begin_cpu_update(
        model, keys.get(), values.get(), lengths.get()
    )

    def backprop_gpu_update(d_scores: ArrayXd) -> Tuple[ArrayXd, ArrayXd, ArrayXd]:
        # Gradients are accumulated by the CPU callback on host copies.
        cpu_backprop(d_scores.get())
        return (keys, values, lengths)

    return xp.asarray(scores_cpu), backprop_gpu_update
def _begin_cpu_update(model, np.ndarray keys, np.ndarray values, np.ndarray lengths):
    # Compute the forward scores for a ragged batch of hashed sparse
    # features on the CPU, and return them together with a pickleable
    # backprop callable.
    cdef int nO = model.get_dim("nO")
    cdef int length = model.get_dim("length")
    cdef np.ndarray W = model.get_param("W")
    cdef np.ndarray b = model.get_param("b")
    # One row of scores per example in the batch.
    cdef np.ndarray scores = model.ops.alloc((len(lengths), nO))
    cdef bint v1_indexing = model.attrs["v1_indexing"]
    # Broadcast the bias over the batch before adding the weighted features.
    scores += b
    # NOTE(review): the casts assume keys is uint64, values/W float32 and
    # lengths int32 — enforced by callers, not checked here.
    set_scoresC(<float*>scores.data,
        <uint64_t*>keys.data, <float*>values.data, <int32_t*>lengths.data,
        lengths.shape[0], nO,
        <float*>W.data, length, v1_indexing)
    return scores, _finish_linear_update(model, keys, values, lengths)
class _finish_linear_update:
    """Move this out of a closure, into its own callable object, to avoid
    pickling errors :(."""

    def __init__(self, model, keys, values, lengths):
        # Keep references to the model and the forward-pass inputs so the
        # gradient can be computed when the callback is invoked.
        self.model = model
        self.keys = keys
        self.values = values
        self.lengths = lengths

    def __call__(self, float[:, ::1] d_scores):
        # Accumulate gradients for "W" and "b" from d_scores and return the
        # (unchanged) forward inputs as the "gradient" of the input triple.
        nO = self.model.get_dim("nO")
        length = self.model.get_dim("length")
        cdef np.ndarray d_weights = self.model.ops.alloc((nO*length,))
        cdef np.ndarray d_bias = self.model.ops.alloc((nO,))
        cdef np.ndarray keys = self.keys
        cdef np.ndarray values = self.values
        cdef np.ndarray lengths = self.lengths
        cdef bint v1_indexing = self.model.attrs["v1_indexing"]
        set_gradientC(<float*>d_weights.data,
            <uint64_t*>keys.data, <float*>values.data, <int32_t*>lengths.data,
            lengths.shape[0], nO, &d_scores[0,0], length, v1_indexing)
        cdef int i, j
        # Bias gradient: sum d_scores over the batch dimension.
        for i in range(d_scores.shape[0]):
            for j in range(d_scores.shape[1]):
                d_bias[j] += d_scores[i, j]
        self.model.inc_grad("W", d_weights)
        self.model.inc_grad("b", d_bias)
        return (self.keys, self.values, self.lengths)
# v1_indexing is invalid and only uses a subset of the weight matrix, v1
# indexing is provided here for compatibility. See #752 for more information.
cdef void set_scoresC(float* scores,
        const uint64_t* keys, const float* values, const int32_t* lengths,
        int batch_size, int nr_out, const float* weights, int nr_weight,
        bint v1_indexing) nogil:
    # Add each example's weighted, hashed features to its row of `scores`.
    # `scores` must already hold the bias (the caller adds it beforehand).
    cdef uint32_t idx1, idx2
    cdef uint32_t hash1, hash2
    for length in lengths[:batch_size]:
        for i in range(length):
            # Each key is hashed twice (seeds 0 and 1), so every feature
            # contributes via two weight buckets.
            hash1 = MurmurHash3_x86_32_uint64(keys[i], 0)
            hash2 = MurmurHash3_x86_32_uint64(keys[i], 1)
            if v1_indexing:
                # Legacy: mask instead of modulo; kept for compatibility.
                idx1 = hash1 & (nr_weight-1)
                idx2 = hash2 & (nr_weight-1)
            else:
                idx1 = hash1 % nr_weight
                idx2 = hash2 % nr_weight
            value = values[i]
            for clas in range(nr_out):
                # v1 layout indexes W as [idx + clas]; v2 uses one table of
                # nr_weight buckets per class: [clas * nr_weight + idx].
                if v1_indexing:
                    scores[clas] += weights[idx1 + clas] * value
                    scores[clas] += weights[idx2 + clas] * value
                else:
                    scores[clas] += weights[(clas * nr_weight) + idx1] * value
                    scores[clas] += weights[(clas * nr_weight) + idx2] * value
        # Advance to the next example's score row and feature slice.
        scores += nr_out
        keys += length
        values += length
# v1_indexing is invalid and only uses a subset of the weight matrix, v1
# indexing is provided here for compatibility. See #752 for more information.
cdef void set_gradientC(float* d_weights,
        const uint64_t* keys, const float* values, const int32_t* lengths,
        int batch_size, int nr_out, const float* d_scores, int nr_weight,
        bint v1_indexing) nogil:
    # Accumulate the weight gradient into `d_weights`, mirroring the exact
    # indexing used by set_scoresC (including the legacy v1 scheme).
    cdef uint32_t idx1, idx2
    cdef uint32_t hash1, hash2
    for length in lengths[:batch_size]:
        for i in range(length):
            # Same double hashing (seeds 0 and 1) as the forward pass.
            hash1 = MurmurHash3_x86_32_uint64(keys[i], 0)
            hash2 = MurmurHash3_x86_32_uint64(keys[i], 1)
            if v1_indexing:
                # Legacy: mask instead of modulo; kept for compatibility.
                idx1 = hash1 & (nr_weight-1)
                idx2 = hash2 & (nr_weight-1)
            else:
                idx1 = hash1 % nr_weight
                idx2 = hash2 % nr_weight
            value = values[i]
            for clas in range(nr_out):
                if v1_indexing:
                    d_weights[idx1 + clas] += d_scores[clas] * value
                    d_weights[idx2 + clas] += d_scores[clas] * value
                else:
                    d_weights[(clas * nr_weight) + idx1] += d_scores[clas] * value
                    d_weights[(clas * nr_weight) + idx2] += d_scores[clas] * value
        # Advance to the next example's gradient row and feature slice.
        d_scores += nr_out
        keys += length
        values += length
cdef uint32_t MurmurHash3_x86_32_uint64(uint64_t key, uint32_t seed) nogil:
    # MurmurHash3 (x86, 32-bit variant) specialised for an 8-byte input:
    # the two 32-bit halves of `key` are mixed as two body blocks, then the
    # standard finalisation is applied. Relies on C uint32_t wraparound.
    cdef uint32_t h1 = seed
    cdef uint32_t c1 = 0xcc9e2d51u
    cdef uint32_t c2 = 0x1b873593u
    cdef uint32_t k1
    # Block 1: low 32 bits of the key.
    k1 = key & 0xffffffffu
    k1 *= c1
    k1 = (k1 << 15) | (k1 >> 17)  # rotl32(k1, 15)
    k1 *= c2
    h1 ^= k1
    h1 = (h1 << 13) | (h1 >> 19)  # rotl32(h1, 13)
    h1 = h1*5+0xe6546b64u
    # Block 2: high 32 bits of the key.
    k1 = key >> 32
    k1 *= c1
    k1 = (k1 << 15) | (k1 >> 17)
    k1 *= c2
    h1 ^= k1
    h1 = (h1 << 13) | (h1 >> 19)
    h1 = h1*5+0xe6546b64u
    # Finalisation: XOR in the input length in bytes (8), then fmix32.
    h1 ^= 8
    h1 ^= h1 >> 16
    h1 *= 0x85ebca6bu
    h1 ^= h1 >> 13
    h1 *= 0xc2b2ae35u
    h1 ^= h1 >> 16
    return h1