1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
|
/*
* huffman.c: implementation of huffman.h.
*/
#include <assert.h>
#include "halibut.h"
#include "huffman.h"
static const unsigned fibonacci[] = {
0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465,
14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
433494437, 701408733, 1134903170, 1836311903, 2971215073,
};
/*
* Binary heap functions used by buildhuf(). Each one assumes the
* heap to be stored in an array of ints, with two ints per node
* (user data and key). They take in the old heap length, and
* return the new one.
*/
#define HEAPPARENT(x) (((x)-2)/4*2)
#define HEAPLEFT(x) ((x)*2+2)
#define HEAPRIGHT(x) ((x)*2+4)
static int addheap(int *heap, int len, int userdata, int key)
{
int me, dad, tmp;
me = len;
heap[len++] = userdata;
heap[len++] = key;
while (me > 0) {
dad = HEAPPARENT(me);
if (heap[me+1] < heap[dad+1]) {
tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp;
tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp;
me = dad;
} else
break;
}
return len;
}
static int rmheap(int *heap, int len, int *userdata, int *key)
{
int me, lc, rc, c, tmp;
len -= 2;
*userdata = heap[0];
*key = heap[1];
heap[0] = heap[len];
heap[1] = heap[len+1];
me = 0;
while (1) {
lc = HEAPLEFT(me);
rc = HEAPRIGHT(me);
if (lc >= len)
break;
else if (rc >= len || heap[lc+1] < heap[rc+1])
c = lc;
else
c = rc;
if (heap[me+1] > heap[c+1]) {
tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp;
tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp;
} else
break;
me = c;
}
return len;
}
struct hufscratch {
int *parent, *length, *heap;
};
/*
* The core of the Huffman algorithm: takes an input array of
* symbol frequencies, and produces an output array of code
* lengths.
*
* We cap the output code lengths to fit in an unsigned char (which is
* safe since our clients will impose some smaller limit on code
* length anyway). So if you see 255 in the output, it means '255 or
* more' and is a sign that whatever limit you really wanted has
* certainly been overflowed.
*/
static void buildhuf(struct hufscratch *sc, const int *freqs,
unsigned char *lengths, int nsyms)
{
int heapsize;
int i, j, n;
int si, sj;
for (i = 0; i < nsyms; i++)
sc->parent[i] = 0;
/*
* Begin by building the heap.
*/
heapsize = 0;
for (i = 0; i < nsyms; i++)
if (freqs[i] > 0) /* leave unused symbols out totally */
heapsize = addheap(sc->heap, heapsize, i, freqs[i]);
/*
* Now repeatedly take two elements off the heap and merge
* them.
*/
n = nsyms;
while (heapsize > 2) {
heapsize = rmheap(sc->heap, heapsize, &i, &si);
heapsize = rmheap(sc->heap, heapsize, &j, &sj);
sc->parent[i] = n;
sc->parent[j] = n;
heapsize = addheap(sc->heap, heapsize, n, si + sj);
n++;
}
/*
* Now we have our tree, in the form of a link from each node
* to the index of its parent. Count back down the tree to
* determine the code lengths.
*/
for (i = 0; i < 2*nsyms+1; i++)
sc->length[i] = 0;
/* The tree root has length 0 after that, which is correct. */
for (i = n-1; i-- ;)
if (sc->parent[i] > 0)
sc->length[i] = 1 + sc->length[sc->parent[i]];
/*
* And that's it. (Simple, wasn't it?) Copy the lengths into
* the output array and leave.
*
* Here we cap lengths to fit in unsigned char.
*/
for (i = 0; i < nsyms; i++)
lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]);
}
/*
* Wrapper around buildhuf() which enforces the restriction on code
* length.
*/
void build_huffman_tree(int *freqs, unsigned char *lengths,
int nsyms, int limit)
{
struct hufscratch hsc, *sc = &hsc;
int smallestfreq, totalfreq, nactivesyms;
int num, denom, adjust;
int i;
int maxprob;
sc->parent = snewn(2*nsyms+1, int);
sc->length = snewn(2*nsyms+1, int);
sc->heap = snewn(2*nsyms, int);
/*
* Nasty special case: if the frequency table has fewer than
* two non-zero elements, we must invent some, because we can't
* have fewer than one bit encoding a symbol.
*/
assert(nsyms >= 2);
{
int count = 0;
for (i = 0; i < nsyms; i++)
if (freqs[i] > 0)
count++;
if (count < 2) {
for (i = 0; i < nsyms && count > 0; i++)
if (freqs[i] == 0) {
freqs[i] = 1;
count--;
}
}
}
/*
* First, try building the Huffman table the normal way. If
* this works, it's optimal, so we don't want to mess with it.
*/
buildhuf(sc, freqs, lengths, nsyms);
for (i = 0; i < nsyms; i++)
if (lengths[i] > limit)
break;
if (i == nsyms)
goto cleanup; /* OK */
/*
* The Huffman algorithm can only ever generate a code length
* of N bits or more if there is a symbol whose probability is
* less than the reciprocal of the (N+2)th Fibonacci number
* (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or
* 1/55 for N=8. (This is a necessary though not sufficient
* condition.)
*
* Why is this? Well, consider the input symbol with the
* smallest probability. Let that probability be x. In order
* for this symbol to have a code length of at least 1, the
* Huffman algorithm will have to merge it with some other
* node; and since x is the smallest probability, the node it
* gets merged with must be at least x. Thus, the probability
* of the resulting combined node will be at least 2x. Now in
* order for our node to reach depth 2, this 2x-node must be
* merged again. But what with? We can't assume the node it
* merges with is at least 2x, because this one might only be
* the _second_ smallest remaining node. But we do know the
* node it merges with must be at least x, so our order-2
* internal node is at least 3x.
*
* How small a node can merge with _that_ to get an order-3
* internal node? Well, it must be at least 2x, because if it
* was smaller than that then it would have been one of the two
* smallest nodes in the previous step and been merged at that
* point. So at least 3x, plus at least 2x, comes to at least
* 5x for an order-3 node.
*
* And so it goes on: at every stage we must merge our current
* node with a node at least as big as the bigger of this one's
* two parents, and from this starting point that gives rise to
* the Fibonacci sequence. So we find that in order to have a
* node n levels deep (i.e. a maximum code length of n), the
* overall probability of the root of the entire tree must be
* at least F_{n+2} times the probability of the rarest symbol.
* In other words, since the overall probability is 1, it is a
* necessary condition for a code length of 16 or more that
* there must be at least one symbol with probability <=
* 1/F_18.
*
* (To demonstrate that a probability this big really can give
* rise to a code length of 16, consider the set of input
* frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,
* 89, 144, 233, 377, 610, 987 }, for arbitrarily small
* epsilon.)
*
* So here buildhuf() has returned us an overlong code. So to
* ensure it doesn't do it again, we add a constant to all the
* (non-zero) symbol frequencies, causing them to become more
* balanced and removing the danger. We can then feed the
* results back to the standard buildhuf() and be
* assert()-level confident that the resulting code lengths
* contain nothing outside the permitted range.
*/
assert(limit+3 < (int)lenof(fibonacci));
maxprob = fibonacci[limit+3];
totalfreq = nactivesyms = 0;
smallestfreq = -1;
for (i = 0; i < nsyms; i++) {
if (freqs[i] == 0)
continue;
if (smallestfreq < 0 || smallestfreq > freqs[i])
smallestfreq = freqs[i];
totalfreq += freqs[i];
nactivesyms++;
}
assert(smallestfreq <= totalfreq / maxprob);
/*
* We want to find the smallest integer `adjust' such that
* (totalfreq + nactivesyms * adjust) / (smallestfreq +
* adjust) is less than maxprob. A bit of algebra tells us
* that the threshold value is equal to
*
* totalfreq - maxprob * smallestfreq
* ----------------------------------
* maxprob - nactivesyms
*
* rounded up, of course. And we'll only even be trying this if
* smallestfreq <= totalfreq / maxprob, which is precisely the
* condition under which the numerator of this fraction is
* positive.
*
* (As for the denominator, that could only be negative if there
* were more than F_{n+2} symbols overall, in which case it
* _wouldn't_ be possible to avoid having a symbol with
* probability at most 1/F_{n+2}. So that is a constraint on the
* input parameters to this function, which we enforce by
* assertion.)
*/
num = totalfreq - smallestfreq * maxprob;
denom = maxprob - nactivesyms;
assert(num > 0); /* this just restates the assert above */
assert(denom > 0); /* this is a constraint on the function parameters */
adjust = (num + denom - 1) / denom;
/*
* Now add `adjust' to all the input symbol frequencies.
*/
for (i = 0; i < nsyms; i++)
if (freqs[i] != 0)
freqs[i] += adjust;
/*
* Rebuild the Huffman tree...
*/
buildhuf(sc, freqs, lengths, nsyms);
/*
* ... and this time it ought to be OK.
*/
for (i = 0; i < nsyms; i++)
assert(lengths[i] <= limit);
cleanup:
/*
* Finally, free our scratch space.
*/
sfree(sc->parent);
sfree(sc->length);
sfree(sc->heap);
}
#define MAXCODELEN 31 /* codes must fit in an int */
int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms)
{
unsigned count[MAXCODELEN], startcode[MAXCODELEN], code;
int maxlen, i;
/* Count the codes of each length. */
maxlen = 0;
for (i = 1; i < MAXCODELEN; i++)
count[i] = 0;
for (i = 0; i < nsyms; i++) {
count[lengths[i]]++;
if (maxlen < lengths[i])
maxlen = lengths[i];
}
/* Determine the starting code for each length block. */
code = 0;
for (i = 1; i < MAXCODELEN; i++) {
startcode[i] = code;
code += count[i];
if (code > (1U << i))
maxlen = -1; /* overcommitted */
code <<= 1;
}
if (code < (1U << MAXCODELEN))
maxlen = -2; /* undercommitted */
/* Determine the code for each symbol. */
for (i = 0; i < nsyms; i++)
codes[i] = startcode[lengths[i]]++;
return maxlen;
}
|