File: PP.pm

package info (click to toggle)
libalgorithm-hyperloglog-perl 0.24-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid
  • size: 184 kB
  • sloc: perl: 222; ansic: 87; makefile: 3
file content (143 lines) | stat: -rw-r--r-- 2,939 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package Algorithm::HyperLogLog::PP;
use strict;
use warnings;
use 5.008008;
use Carp ();
use Digest::MurmurHash3::PurePerl qw(murmur32);

# Module-wide constants:
#   HLL_HASH_SEED - seed handed to murmur32() for every value passed to add()
#   TWO_32        - 2**32 as a float; used by estimate() for its upper-range
#                   threshold and correction
#   NEG_TWO_32    - -(2**32); multiplier in that same correction
use constant {
    HLL_HASH_SEED => 313,
    TWO_32        => 4294967296.0,
    NEG_TWO_32    => -4294967296.0,
};

our $VERSION = "0.24";

# Load the front-end package before touching its @ISA below.
# NOTE(review): presumably the front-end decides when this backend is
# loaded -- confirm against Algorithm::HyperLogLog itself.
require Algorithm::HyperLogLog;

{

    # Make Algorithm::HyperLogLog inherit every method defined in this
    # package.  Note that this assignment replaces any @ISA already set there.
    package Algorithm::HyperLogLog;
    our @ISA = qw(Algorithm::HyperLogLog::PP);
}

# Constructor.
#
#   $class - package the new object is blessed into
#   $k     - number of index bits; must lie in [4,16].  The object holds
#            2**$k registers, all initialised to 0.
#
# Croaks when $k is outside the permitted range.
# Returns a blessed hash with the keys k, m, registers and alphaMM
# (alphaMM being alpha * m * m, precomputed for estimate()).
sub new {
    my $class = shift;
    my ($k) = @_;

    Carp::croak("Number of ragisters must be in the range [4,16]")
        if $k < 4 || $k > 16;

    my $m = 1 << $k;

    # Fixed alpha for the three smallest register counts; every other
    # size uses the closed-form expression.
    my %alpha_for = ( 16 => 0.673, 32 => 0.697, 64 => 0.709 );
    my $alpha =
        exists $alpha_for{$m}
        ? $alpha_for{$m}
        : 0.7213 / ( 1.0 + 1.079 / $m );

    return bless {
        k         => $k,
        m         => $m,
        registers => [ (0) x $m ],
        alphaMM   => $alpha * $m * $m,
    }, $class;
}

# Build an object through new($k) and then install $data as its register
# array.  The reference is stored as-is (not copied), so the caller's array
# and the object share storage.  No check is made that $data matches the
# register count implied by $k.
sub _new_from_dump {
    my $class = shift;
    my ( $k, $data ) = @_;

    my $hll = $class->new($k);
    $hll->{registers} = $data;

    return $hll;
}

# Return the reference to the internal register array (the live array,
# not a copy).
sub _dump_register {
    return $_[0]{registers};
}

# Return the number of registers (the "m" value computed in new()).
sub register_size {
    return $_[0]{m};
}

# Feed one or more values into the sketch.
#
# Each value is hashed with murmur32() using HLL_HASH_SEED.  The hash shifted
# right by (32 - k) selects a register; the hash shifted left by k is
# passed to _rho() to obtain a rank.  The register keeps the largest rank
# it has seen.
sub add {
    my ( $self, @items ) = @_;

    my $k     = $self->{k};
    my $width = 32 - $k;            # bits left over after the index bits
    my $regs  = $self->{registers};

    foreach my $item (@items) {
        my $digest = murmur32( $item, HLL_HASH_SEED );
        my $slot   = $digest >> $width;
        my $rho    = _rho( $digest << $k, $width );

        $regs->[$slot] = $rho if $rho > $regs->[$slot];
    }
}

# Compute the cardinality estimate from the current register contents.
#
# The raw estimate is alphaMM / sum(2 ** -register).  Two corrections may
# replace it:
#   * raw <= 2.5 * m : if at least one register is still 0, the result is
#                      m * log(m / number_of_zero_registers);
#   * raw > 2**32/30 : the result is -2**32 * log(1 - raw / 2**32).
# Otherwise the raw estimate is returned unchanged.
sub estimate {
    my ($self) = @_;
    my $m      = $self->{m};
    my $regs   = $self->{registers};

    my $sum = 0.0;
    $sum += 1.0 / ( 2.0**$regs->[$_] ) for 0 .. $m - 1;

    my $raw = $self->{alphaMM} * ( 1.0 / $sum );    # E in the original paper

    if ( $raw <= 2.5 * $m ) {

        # Small range: count the registers that were never touched.
        my $zeros = grep { $regs->[$_] == 0 } 0 .. $m - 1;
        return $zeros != 0 ? $m * log( $m / $zeros ) : $raw;
    }

    if ( $raw > ( 1.0 / 30.0 ) * TWO_32 ) {
        return NEG_TWO_32 * log( 1.0 - ( $raw / TWO_32 ) );
    }

    return $raw;
}

# Merge another sketch into this one, in place.
#
#   $self  - the sketch that receives the merged state
#   $other - the sketch to merge from; it is only read, never modified
#
# After the call each register of $self holds the larger of its previous
# value and the corresponding register of $other.
#
# Croaks (reported from the caller's point of view, like new()) when the
# two sketches do not have the same number of registers.
sub merge {
    my ( $self, $other ) = @_;
    my $m = $self->{m};

    Carp::croak("hll size mismatch") if $m != $other->{m};

    my $mine   = $self->{registers};
    my $theirs = $other->{registers};

    for my $i ( 0 .. $m - 1 ) {
        $mine->[$i] = $theirs->[$i] if $mine->[$i] < $theirs->[$i];
    }

    return;
}

# Always returns 0 (false).
# NOTE(review): presumably lets callers tell this backend apart from an XS
# one -- confirm against Algorithm::HyperLogLog.
sub XS {
    return 0;
}

# Return the 1-based position of the first set bit of $bits, scanning
# downwards from bit 31 (mask 0x80000000).
#
#   $bits  - integer to inspect
#   $width - maximum number of positions to examine
#
# If none of the first $width positions is set the result is $width + 1;
# if $width is below 1 nothing is examined and the result is 1.
sub _rho {
    my ( $bits, $width ) = @_;

    my $rank = 1;
    until ( $rank > $width || ( $bits & 0x80000000 ) ) {
        $rank++;
        $bits <<= 1;
    }

    return $rank;
}

1;
__END__