1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
|
/* Karatsuba convolution
*
* Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
*
* This file is part of AlsaPlayer.
*
* AlsaPlayer is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* AlsaPlayer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
/* The algorithm is based on the following. For the convolution of a pair
* of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
* multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
* b.d. A similar relation enables us to compute a 2n by 2n convolution
* using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
* multiplications (as opposed to the 4^n that the quadratic algorithm
* takes. */
/* For large n, this is slower than the O(n log n) that the FFT method
* takes, but we avoid using complex numbers, and we only have to compute
* one convolution, as opposed to 3 FFTs. We have good locality-of-
* reference as well, which will help on CPUs with tiny caches. */
/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
* (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
* algorithm. We actually want 257 outputs of a 256 x 512 convolution;
* that doesn't appear to give an easy advantage for the FFT algorithm, but
* for the Karatsuba algorithm, it's easy to use two 256 x 256
* convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference
* is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
* while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
* - 1]. */
/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
* to do it using 16 multiplications than the more complex Karatsuba
* algorithm... So the recursion bottoms out at 4x4s. This increases the
* number of multiplications by a factor of 16/9, but reduces the overheads
* dramatically. */
/* The convolution algorithm is implemented as a stack machine. We have a
* stack of commands, each in one of the forms "do a 2^n x 2^n
* convolution", or "combine these three length 2^n outputs into one
* 2^{n+1} output." */
#include <stdlib.h>
#include "alsaplayer_convolve.h"
typedef union stack_entry_s {
struct {const double * left, * right; double * out;} v;
struct {double * main, * null;} b;
} stack_entry;
#define STACK_SIZE (CONVOLVE_DEPTH * 3)
struct _struct_convolve_state {
double left [CONVOLVE_BIG];
double right [CONVOLVE_SMALL * 3];
double scratch [CONVOLVE_SMALL * 3];
stack_entry stack[STACK_SIZE];
};
/*
* Initialisation routine - sets up tables and space to work in.
* Returns a pointer to internal state, to be used when performing calls.
* On error, returns NULL.
* The pointer should be freed when it is finished with, by convolve_close().
*/
convolve_state *convolve_init(void)
{
return malloc (sizeof(convolve_state));
}
/*
* Free the state allocated with convolve_init().
*/
void convolve_close(convolve_state *state)
{
if (state)
free(state);
}
static void convolve_4 (double * out, const double * left, const double * right)
/* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd
* ordering gives about a 1% speed up on my Pentium II. */
{
double l0, l1, l2, l3, r0, r1, r2, r3;
double a;
l0 = left[0];
r0 = right[0];
a = l0 * r0;
l1 = left[1];
r1 = right[1];
out[0] = a;
a = (l0 * r1) + (l1 * r0);
l2 = left[2];
r2 = right[2];
out[1] = a;
a = (l0 * r2) + (l1 * r1) + (l2 * r0);
l3 = left[3];
r3 = right[3];
out[2] = a;
out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
out[5] = (l2 * r3) + (l3 * r2);
out[6] = l3 * r3;
}
static void convolve_run (stack_entry * top, unsigned size, double * scratch)
/* Interpret a stack of commands. The stack starts with two entries; the
* convolution to do, and an illegal entry used to mark the stack top. The
* size is the number of entries in each input, and must be a power of 2,
* and at least 8. It is OK to have out equal to left and/or right.
* scratch must have length 3*size. The number of stack entries needed is
* 3n-4 where size=2^n. */
{
do {
const double * left;
const double * right;
double * out;
/* When we get here, the stack top is always a convolve,
* with size > 4. So we will split it. We repeatedly split
* the top entry until we get to size = 4. */
left = top->v.left;
right = top->v.right;
out = top->v.out;
top++;
do {
double * s_left, * s_right;
unsigned i;
/* Halve the size. */
size >>= 1;
/* Allocate the scratch areas. */
s_left = scratch + size * 3;
/* s_right is a length 2*size buffer also used for
* intermediate output. */
s_right = scratch + size * 4;
/* Create the intermediate factors. */
for (i = 0; i < size; i++) {
double l = left[i] + left[i + size];
double r = right[i] + right[i + size];
s_left[i + size] = r;
s_left[i] = l;
}
/* Push the combine entry onto the stack. */
top -= 3;
top[2].b.main = out;
top[2].b.null = NULL;
/* Push the low entry onto the stack. This must be
* the last of the three sub-convolutions, because
* it may overwrite the arguments. */
top[1].v.left = left;
top[1].v.right = right;
top[1].v.out = out;
/* Push the mid entry onto the stack. */
top[0].v.left = s_left;
top[0].v.right = s_right;
top[0].v.out = s_right;
/* Leave the high entry in variables. */
left += size;
right += size;
out += size * 2;
} while (size > 4);
/* When we get here, the stack top is a group of 3
* convolves, with size = 4, followed by some combines. */
convolve_4 (out, left, right);
convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
top += 2;
/* Now process combines. */
do {
/* b.main is the output buffer, mid is the middle
* part which needs to be adjusted in place, and
* then folded back into the output. We do this in
* a slightly strange way, so as to avoid having
* two loops. */
double * outx = top->b.main;
double * mid = scratch + size * 4;
unsigned int i;
top++;
outx[size * 2 - 1] = 0;
for (i = 0; i < size-1; i++) {
double lo;
double hi;
lo = mid[0] - (outx[0] + outx[2 * size]) + outx[size];
hi = mid[size] - (outx[size] + outx[3 * size]) + outx[2 * size];
outx[size] = lo;
outx[2 * size] = hi;
outx++;
mid++;
}
size <<= 1;
} while (top->b.null == NULL);
} while (top->b.main != NULL);
}
int convolve_match (const int * lastchoice,
const short * input,
convolve_state * state)
/* lastchoice is a 256 sized array. input is a 512 array. We find the
* contiguous length 256 sub-array of input that best matches lastchoice.
* A measure of how good a sub-array is compared with the lastchoice is
* given by the sum of the products of each pair of entries. We maximise
* that, by taking an appropriate convolution, and then finding the maximum
* entry in the convolutions. state is a (non-NULL) pointer returned by
* convolve_init. */
{
double avg;
double best;
int p = 0;
int i;
double * left = state->left;
double * right = state->right;
double * scratch = state->scratch;
stack_entry * top = state->stack + STACK_SIZE - 1;
#if 1
for (i = 0; i < 512; i++)
left[i] = input[i];
avg = 0;
for (i = 0; i < 256; i++) {
double a = lastchoice[255 - i];
right[i] = a;
avg += a;
}
#endif
/* We adjust the smaller of the two input arrays to have average
* value 0. This makes the eventual result insensitive to both
* constant offsets and positive multipliers of the inputs. */
avg /= 256;
for (i = 0; i < 256; i++)
right[i] -= avg;
/* End-of-stack marker. */
#if 0 /* The following line produces a CRASH, need to figure out why?!! */
top[1].b.null = scratch;
#endif
top[1].b.main = NULL;
/* The low 256x256, of which we want the high 256 outputs. */
top->v.left = left;
top->v.right = right;
top->v.out = right + 256;
convolve_run (top, 256, scratch);
/* The high 256x256, of which we want the low 256 outputs. */
top->v.left = left + 256;
top->v.right = right;
top->v.out = right;
convolve_run (top, 256, scratch);
/* Now find the best position amoungs this. Apart from the first
* and last, the required convolution outputs are formed by adding
* outputs from the two convolutions above. */
best = right[511];
right[767] = 0;
p = -1;
for (i = 0; i < 256; i++) {
double a = right[i] + right[i + 512];
if (a > best) {
best = a;
p = i;
}
}
p++;
#if 0
{
/* This is some debugging code... */
int bad = 0;
best = 0;
for (i = 0; i < 256; i++)
best += ((double) input[i+p]) * ((double) lastchoice[i] - avg);
for (i = 0; i < 257; i++) {
double tot = 0;
unsigned int j;
for (j = 0; j < 256; j++)
tot += ((double) input[i+j]) * ((double) lastchoice[j] - avg);
if (tot > best)
printf ("(%i)", i);
if (tot != left[i + 255])
printf ("!");
}
printf ("%i\n", p);
}
#endif
return p;
}
|