File: LinearCounter.cpp

package info (click to toggle)
mapsembler2 2.2.4%2Bdfsg1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 7,344 kB
  • sloc: cpp: 51,204; ansic: 13,165; sh: 542; makefile: 396; asm: 271; python: 28
file content (47 lines) | stat: -rw-r--r-- 1,409 bytes parent folder | download | duplicates (14)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#include <algorithm> // for max
#include "LinearCounter.h"

using namespace std; // for max

// counter the number of distinct kmers
// implements a linear counter following [1] K. Whang, B. T. Vander-Zaden, H.M. Taylor. A Liner-Time Probabilistic Counting Algorithm for Database Applications
// an easier presentation is there: http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
// here, it's implements as a wrapper around a special Bloom

LinearCounter::LinearCounter(long size) : desired_size(size)
{
    int bloom_nbits = max( (int)ceilf(log2f(size)), 1);
    bloom = new Bloom(bloom_nbits);
    bloom->set_number_of_hash_func(1);
    bloom_size = 1L << bloom_nbits;
}

void LinearCounter::add(bloom_elem kmer)
{
    bloom->add(kmer);
}

int LinearCounter::contains(bloom_elem kmer)
{
    // dummy, because bloom_pass_reads wants this method to be exposed
  return 0;
}

long LinearCounter::count()
{
    long weight = bloom->weight();
    //printf("linear counter load factor: %0.2f\n",(1.0*weight/bloom_size));
    return (long) ( (-1.0*bloom_size) * logf( (1.0*bloom_size - weight) / bloom_size ) );  // linear counter cardinality estimation
}

bool LinearCounter::is_accurate()
{
    long weight = bloom->weight();
    float load_factor = (1.0*weight/bloom_size);
    return load_factor < 0.99;
}

LinearCounter::~LinearCounter()
{
    delete bloom;
}