File: downsample_vector.h

package info (click to toggle)
r-bioc-scuttle 1.8.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 888 kB
  • sloc: cpp: 508; sh: 7; makefile: 2
file content (99 lines) | stat: -rw-r--r-- 3,545 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include "Rcpp.h"
#include <cmath>
#include <cstdint>

namespace scuttle {

namespace {

// Using big integers to avoid overflow when summing across a large matrix.

typedef std::uint64_t bigint_t;

/* This function considers sampling events without replacement from a vector.
 * Here, though, the vector contains frequencies of events rather than the events themselves.
 * The sampling scheme is adapted from John D. Cook, https://stackoverflow.com/a/311716/15485.
 * 
 * freqIt: An iterator pointing to the start of the frequency vector.
 * freqEnd: An interator pointing to the end of the frequency vector.
 * freqOut: An iterator pointing to an output vector, indicating how many instances of each event have been sampled.
 * remaining_total: An integer scalar specifying the remaining number of instances yet to be considered for sampling.
 * remaining_required: An integer scalar specifying the number of events that still need to be sampled.
 * 
 * Note that remaining_total's initial value may not be simply a sum of all values from [freqIt, freqEnd).
 * This is because we allow multiple applications of this function to sample without replacement from a series of vectors.
 * We keep track of 'remaining_total' and 'remaining_required' to ensure correct sampling when moving from one vector to another.
 */
template<class IN, class OUT> 
void downsample(IN freqIt, IN freqEnd, OUT freqOut, bigint_t& remaining_total, bigint_t& remaining_required) 
{        
    while (freqIt!=freqEnd && remaining_required) {
        // It is allowed for this function to modify in place,
        // so *freqIt must be read before *freqOut is modified. 
        const int full_count=*freqIt;
        auto& downsampled=(*freqOut=0);

        for (int i=0; i<full_count && remaining_required; ++i) {
            // Deciding whether or not to keep this instance of this event.
            // This is a safe way of computing NUM_YET_TO_SELECT/NUM_YET_TO_PROCESS > runif(1), 
            // avoiding issues with integer division.
            if ( remaining_total*R::unif_rand() < remaining_required) {
                ++downsampled;
                --remaining_required;
            }
            --remaining_total;
        }
     
        ++freqIt;
        ++freqOut;
    }

    // Mopping up the leftovers.
    while (freqIt!=freqEnd) {
        ++freqIt;
        *freqOut=0;
        ++freqOut;
    }

    return;
}  

}

/* Downsampling a vector to a given proportion. */

template<class IN, class OUT> 
void downsample_vector(IN freqIt, IN freqEnd, OUT freqOut, double prop) {
    double total=0;
    {
        auto freqCopy=freqIt;
        while (freqCopy!=freqEnd) {
            total+=*freqCopy;
            ++freqCopy;
        }
    }

    bigint_t remaining_total=std::round(total);
    bigint_t remaining_required=std::round(std::min(1.0, prop)*total);
    downsample(freqIt, freqEnd, freqOut, remaining_total, remaining_required);
    return;
}

/* Progressively downsampling parts of a vector to a given proportion. */

class downsample_vector_part {
public:
    downsample_vector_part(double total, double required, bool as_prop=true) : 
        remaining_total(std::round(total)), 
        remaining_required(as_prop ? std::round(std::min(1.0, required) * total) : required) {}

    template<class IN, class OUT> 
    void operator()(IN freqIt, IN freqEnd, OUT freqOut) { 
        downsample(freqIt, freqEnd, freqOut, remaining_total, remaining_required);
        return;
    }
private:
    bigint_t remaining_total, remaining_required;
};

}