File: LinearCounter.cpp

package info (click to toggle)

mapsembler2 2.2.4%2Bdfsg1-5

links: PTS, VCS
area: main
in suites: sid
size: 7,344 kB
sloc: cpp: 51,204; ansic: 13,165; sh: 542; makefile: 396; asm: 271; python: 28

file content (47 lines) | stat: -rw-r--r-- 1,409 bytes

parent folder | download | duplicates (14)

#include <algorithm> // for max
#include "LinearCounter.h"

using namespace std; // for max

// counter the number of distinct kmers
// implements a linear counter following [1] K. Whang, B. T. Vander-Zaden, H.M. Taylor. A Liner-Time Probabilistic Counting Algorithm for Database Applications
// an easier presentation is there: http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
// here, it's implements as a wrapper around a special Bloom

LinearCounter::LinearCounter(long size) : desired_size(size)
{
    int bloom_nbits = max( (int)ceilf(log2f(size)), 1);
    bloom = new Bloom(bloom_nbits);
    bloom->set_number_of_hash_func(1);
    bloom_size = 1L << bloom_nbits;
}

void LinearCounter::add(bloom_elem kmer)
{
    bloom->add(kmer);
}

int LinearCounter::contains(bloom_elem kmer)
{
    // dummy, because bloom_pass_reads wants this method to be exposed
  return 0;
}

long LinearCounter::count()
{
    long weight = bloom->weight();
    //printf("linear counter load factor: %0.2f\n",(1.0*weight/bloom_size));
    return (long) ( (-1.0*bloom_size) * logf( (1.0*bloom_size - weight) / bloom_size ) );  // linear counter cardinality estimation
}

bool LinearCounter::is_accurate()
{
    long weight = bloom->weight();
    float load_factor = (1.0*weight/bloom_size);
    return load_factor < 0.99;
}

LinearCounter::~LinearCounter()
{
    delete bloom;
}