File: strhash.testc

package info (click to toggle)
cyrus-imapd 3.12.1-3
links: PTS, VCS
area: main
in suites: forky, sid
size: 60,544 kB
sloc: ansic: 280,382; perl: 146,834; javascript: 9,624; sh: 5,730; yacc: 2,660; cpp: 2,263; makefile: 2,103; lex: 675; xml: 621; awk: 303; python: 273; asm: 262
file content (231 lines) | stat: -rw-r--r-- 6,573 bytes
parent folder | download | duplicates (8)
#include <config.h>

#include <sys/types.h>
#include <sys/stat.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "cunit/cyrunit.h"
#include "lib/util.h"
#include "lib/strhash.h"

extern int verbose;

/*
 * Build a string made of `n` copies of the character `c`.
 *
 * The result lives in a static buffer that every call overwrites, so the
 * caller must copy it before calling repeat() again.  A request that would
 * not fit is truncated to sizeof(buf) - 1 characters, so the result is
 * always NUL-terminated.
 */
static const char *repeat(int c, unsigned n)
{
    static char buf[1024];
    const unsigned limit = sizeof(buf) - 1;
    const unsigned count = (n < sizeof(buf)) ? n : limit;

    /* fill the leading run, then zero everything that follows it */
    memset(buf, c, count);
    memset(buf + count, 0, sizeof(buf) - count);

    return buf;
}

/*
 * A long run of identical trailing characters must not wash out the part of
 * the input that came before it: three different prefixes followed by the
 * same run of 'a's are expected to hash to three different values.
 */
static void test_repeated(void)
{
    static const unsigned suffix_lengths[] = {
        15, 31, 63, 127, 255, 511, 1023
    };
    const size_t n_lengths = sizeof(suffix_lengths) / sizeof(suffix_lengths[0]);
    size_t k;

    for (k = 0; k < n_lengths; k++) {
        /* repeat() returns a static buffer; strconcat() copies out of it */
        const char *suffix = repeat('a', suffix_lengths[k]);

        char *cat = strconcat("cat", suffix, NULL);
        char *dog = strconcat("dog", suffix, NULL);
        char *mouse = strconcat("mouse", suffix, NULL);

        const unsigned cat_hash = strhash(cat);
        const unsigned dog_hash = strhash(dog);
        const unsigned mouse_hash = strhash(mouse);

        /* every pair must differ */
        CU_ASSERT_NOT_EQUAL(cat_hash, dog_hash);
        CU_ASSERT_NOT_EQUAL(dog_hash, mouse_hash);
        CU_ASSERT_NOT_EQUAL(mouse_hash, cat_hash);

        free(cat);
        free(dog);
        free(mouse);
    }
}

/*
 * Exercise the seeded and unseeded hash entry points against a small fixed
 * word list:
 *
 *   - strhash() is repeatable for the same input;
 *   - strhash_seeded() with seed 0 gives the same value as strhash();
 *   - strhash_seeded() is repeatable for the same non-zero seed;
 *   - two distinct non-zero seeds give different values for the same input.
 *
 * Seeds come from rand(); zero is rejected because the second check
 * asserts that seed 0 matches the unseeded hash.
 */
static void test_seeded(void)
{
    const char *const words[] = { "lorem", "ipsum", "dolor", "sit", "amet" };
    const size_t n_words = sizeof(words) / sizeof(words[0]);
    unsigned i, j;

    /* with no seed, same input should produce same hash */
    for (i = 0; i < n_words; i++) {
        unsigned h1 = strhash(words[i]);
        unsigned h2 = strhash(words[i]);
        CU_ASSERT_EQUAL(h1, h2);
    }

    /* with explicit zero seed, same input should produce same hash */
    for (i = 0; i < n_words; i++) {
        unsigned h1 = strhash(words[i]);
        unsigned h2 = strhash_seeded(0, words[i]);
        unsigned h3 = strhash_seeded(0, words[i]);
        CU_ASSERT_EQUAL(h1, h2);
        CU_ASSERT_EQUAL(h2, h3);
        CU_ASSERT_EQUAL(h3, h1);
    }

    /* with some seed, same input should produce same hash */
    for (j = 0; j < 5; j++) {
        uint32_t seed;
        do {
            seed = rand();
        } while (seed == 0);

        for (i = 0; i < n_words; i++) {
            unsigned h1 = strhash_seeded(seed, words[i]);
            unsigned h2 = strhash_seeded(seed, words[i]);
            CU_ASSERT_EQUAL(h1, h2);
        }
    }

    /* with different seed, same input should produce different hash */
    for (i = 0; i < n_words; i++) {
        uint32_t seed1, seed2;

        /* draw again until both seeds are non-zero and distinct */
        do {
            seed1 = rand();
            seed2 = rand();
        } while (seed1 == 0 || seed2 == 0 || seed1 == seed2);

        unsigned h1 = strhash_seeded(seed1, words[i]);
        unsigned h2 = strhash_seeded(seed2, words[i]);

        /* NOTE(review): nothing here rules out a genuine collision between
         * the two seeds, which would fail this assertion spuriously */
        CU_ASSERT_NOT_EQUAL(h1, h2);
    }
}

/* We can't define-out an entire test function when a feature is missing
 * (in this case getline), because it confuses cunit.pl. So instead we
 * make sure it will at least compile, but then return early without doing
 * anything if the feature we wanted was missing.
 *
 * The stand-in below ignores its first and third arguments, evaluates the
 * second only to discard it, and always yields -1 -- the value that
 * test_quality()'s read loop treats as "no more lines".
 */
#ifndef HAVE_GETLINE
#define getline(a,b,c) (((void)(b)), -1)
#endif

#define NBUCKETS (0x10000)
/*
 * Distribution check for strhash_seeded_djb2().
 *
 * Every line of the system word list is hashed (seed 0) into one of NBUCKETS
 * buckets.  The test then asserts that no bucket ended up holding more than
 * ten times the average load.  When `verbose` is set, summary statistics and
 * a histogram of bucket occupancy are written to stderr.
 *
 * The test returns without asserting anything when getline() is unavailable
 * or when the word list cannot be opened (the latter is reported on stderr
 * in verbose mode).
 */
static void test_quality(void)
{
    const char *wordsfile = "/usr/share/dict/words";
    unsigned buckets[NBUCKETS] = {0};   /* entries hashed into each bucket */
    FILE *stream;
    char *line = NULL;                  /* getline() buffer, freed below */
    size_t len = 0;
    ssize_t nread;
    unsigned i;
    unsigned inputs = 0;                /* total lines hashed */
    unsigned contains_none = 0;         /* buckets holding 0 entries */
    unsigned contains_one = 0;          /* buckets holding exactly 1 entry */
    unsigned contains_many = 0;         /* buckets holding 2 or more entries */
    unsigned contains_many_sum = 0;     /* entries across the "many" buckets */
    unsigned highest_count = 0;         /* largest bucket occupancy seen */
    unsigned highest_count_freq = 0;    /* buckets sharing that occupancy */
    unsigned max_acceptable_count;
    double load;

#ifndef HAVE_GETLINE
    /* can't do anything anyway */
    return;
#endif

    stream = fopen(wordsfile, "r");
    if (!stream) {
        if (verbose)
            fprintf(stderr, "%s: %s (skipping) ", wordsfile, strerror(errno));
        return;
    }

    while ((nread = getline(&line, &len, stream)) != -1) {
        /* chomp */
        if (line[nread - 1] == '\n')
            line[nread - 1] = '\0';

        unsigned hash = strhash_seeded_djb2(0, line) % NBUCKETS;
//        unsigned hash = strhash_legacy(line) % NBUCKETS;

        buckets[hash]++;
        inputs++;
    }
    free(line);

    /* done reading: release the stream rather than leaking it */
    fclose(stream);

    /* arbitrary declaration of quality: no buckets should have more
     * than ten times the expected load
     */
    load = inputs * 1.0 / NBUCKETS;
    max_acceptable_count = load * 10;   /* fractional part is discarded */

    /* bucket_counts[k] is the number of buckets holding exactly k entries
     * for k <= max_acceptable_count; the final slot collects every bucket
     * that exceeds the limit
     */
    unsigned bucket_counts[max_acceptable_count + 2];
    memset(bucket_counts, 0, sizeof(bucket_counts));

    for (i = 0; i < NBUCKETS; i++) {
        switch (buckets[i]) {
        case 0:
            contains_none++;
            break;
        case 1:
            contains_one++;
            break;
        default:
            contains_many++;
            contains_many_sum += buckets[i];
            break;
        }

        if (buckets[i] > max_acceptable_count) {
            bucket_counts[max_acceptable_count+1]++;
        }
        else {
            bucket_counts[buckets[i]]++;
        }

        /* track the fullest bucket and how many buckets tie with it */
        if (buckets[i] > highest_count) {
            highest_count = buckets[i];
            highest_count_freq = 1;
        }
        else if (buckets[i] == highest_count) {
            highest_count_freq++;
        }
    }

    if (verbose) {
        putc('\n', stderr);
        fprintf(stderr, "buckets: %u inputs: %u load: %g\n",
                        NBUCKETS, inputs, load);
        fprintf(stderr, "empty: %u unique: %u busy: %u\n",
                        contains_none, contains_one, contains_many);
        fprintf(stderr, "avg count in busy buckets: %g\n",
                        contains_many_sum * 1.0 / contains_many);
        fprintf(stderr, "busiest %u buckets contain %u each\n",
                        highest_count_freq, highest_count);
        fprintf(stderr, "max acceptable count: %u\n", max_acceptable_count);
        fprintf(stderr, "\nbucket count histogram:\ncount frequency\n");
        for (i = 0; i <= max_acceptable_count; i++) {
            fprintf(stderr, "%4u  %u\n", i, bucket_counts[i]);
        }
        fprintf(stderr, "%4u+ %u\n", max_acceptable_count + 1,
                                     bucket_counts[max_acceptable_count + 1]);
    }

    /* pass only if no bucket went over the limit */
    CU_ASSERT_EQUAL(bucket_counts[max_acceptable_count + 1], 0);
}
#undef NBUCKETS

/* vim: set ft=c: */