File: FindRandomSequence.hpp

package info (click to toggle)
pbseqlib 5.3.4%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 7,020 kB
  • sloc: cpp: 77,246; python: 331; sh: 103; makefile: 42
file content (69 lines) | stat: -rw-r--r-- 2,280 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#ifndef _FIND_RANDOM_SEQUENCE_HPP_
#define _FIND_RANDOM_SEQUENCE_HPP_

#include <vector>

#include <alignment/statistics/StatUtils.hpp>  // Where does this come from? Does this compile anymore?
#include <pbdata/DNASequence.hpp>

template <typename T_Sequence>
void FindRandomPos(std::vector<T_Sequence> &sequences, DNALength &seqIndex, DNALength &seqPos,
                   DNALength seqLength = 0)
{
    std::vector<UInt> cumulativeLengths;
    cumulativeLengths.resize(sequences.size());
    if (sequences.size() == 0) {
        return;
    }
    DNALength cumulativeLength;
    cumulativeLengths[0] = sequences[0].length;
    cumulativeLength = cumulativeLengths[0];
    for (unsigned i = 1; i < sequences.size(); i++) {
        cumulativeLengths[i] = cumulativeLength = cumulativeLengths[i - 1] + sequences[i].length;
    }
    bool validPosFound = false;
    int iter = 0;
    int max_iter = 100000;
    while (validPosFound == false and iter < max_iter) {
        ++iter;
        if (seqLength > cumulativeLength) {
            validPosFound = false;
            iter = max_iter;
            break;
        }
        DNALength pos = RandomUnsignedInt(cumulativeLength - seqLength);
        // Make sure this sequence fits
        for (seqIndex = 0; seqIndex < sequences.size(); seqIndex++) {
            if (cumulativeLengths[seqIndex] > pos) break;
        }
        if (cumulativeLengths[seqIndex] - pos < seqLength) {
            continue;
        }
        UInt pi;
        if (seqIndex == 0) {
            seqPos = pos;
        } else {
            seqPos = pos - cumulativeLengths[seqIndex - 1];
        }
        bool seqContainsN = false;
        for (pi = seqPos; pi < seqPos + seqLength; pi++) {
            if (toupper(sequences[seqIndex].seq[pi]) == 'N') {
                seqContainsN = true;
                break;
            }
        }
        if (seqContainsN) {
            continue;
        } else {
            validPosFound = true;
        }
    }
    if (iter == max_iter) {
        std::cout << "ERROR! Unable to generate a random seq/pos pair!, maybe length " << seqLength
                  << std::endl
                  << " is too high, or there are too many N's in the references." << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

#endif