File: overlap_analysis.hpp

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (123 lines) | stat: -rw-r--r-- 3,787 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#pragma once

#include "utils/logger/logger.hpp"
#include "sequence/range.hpp"
#include "ssw_cpp.h"

namespace debruijn_graph {

struct OverlapInfo {
    Range r1;
    Range r2;
    size_t match_cnt;

    OverlapInfo(const Range& r1_, const Range& r2_, size_t match_cnt_)
            : r1(r1_),
              r2(r2_),
              match_cnt(match_cnt_) {
        VERIFY(match_cnt <= std::min(r1.size(), r2.size()));
    }

    OverlapInfo()
            : match_cnt(0) {
    }

    double identity() const {
        if (match_cnt == 0)
            return 0.;
        return (double)match_cnt / (double)size();
    }

    size_t size() const {
        return std::max(r1.size(), r2.size());
    }

    bool operator==(const OverlapInfo &that) const {
        return r1 == that.r1 && r2 == that.r2 && match_cnt == that.match_cnt;
    }

    bool operator!=(const OverlapInfo &that) const {
        return !(*this == that);
    }
};

inline std::ostream& operator<<(std::ostream& os, const OverlapInfo& info) {
    return os << "R1: [" << info.r1.start_pos << ", " << info.r1.end_pos
            << "]; R2: [" << info.r2.start_pos << ", " << info.r2.end_pos << "]"
            << "; match_cnt: " << info.match_cnt;
}

class SWOverlapAnalyzer {
    static const uint32_t CIGAR_FLAG_MASK = (1 << 4) - 1;
    static const uint32_t CIGAR_MATCH_FLAG = 7;
    typedef typename Graph::EdgeId EdgeId;
    size_t flank_length_;

    const StripedSmithWaterman::Aligner aligner_;
    const StripedSmithWaterman::Filter filter_;

    size_t CountMatches(std::vector<uint32_t> cigar) const {
        size_t match_cnt = 0;
        for (uint32_t entry : cigar) {
            if ((entry & CIGAR_FLAG_MASK) == CIGAR_MATCH_FLAG) {
                match_cnt += (entry >> 4);
            }
        }
        return match_cnt;
    }

    OverlapInfo InnerAnalyze(const Sequence& s1, const Sequence& s2) const {
        if (s1.size() == 0 || s2.size() == 0) {
            return OverlapInfo();
        }
        StripedSmithWaterman::Alignment alignment;
        if (aligner_.Align(s1.str().c_str(), s2.str().c_str(), int(s2.size()), filter_, &alignment)) {
            if (alignment.sw_score > 0) {
                return OverlapInfo(Range(alignment.query_begin, alignment.query_end + 1),
                            Range(alignment.ref_begin, alignment.ref_end + 1),
                            CountMatches(alignment.cigar));
            }
        }
        return OverlapInfo();
    }

public:
    SWOverlapAnalyzer(size_t flank_length)
            : flank_length_(flank_length),
              aligner_(/*match_score*/1,
              /*mismatch_penalty*/3,
                       /*gap_opening_penalty*/4,
                       /*gap_extending_penalty*/3) {
        DEBUG("Considered max overlap " << flank_length);
    }


    OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
        DEBUG("Analysis started");
        size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
        size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;

        DEBUG("s1 " << s1.Subseq(start1, s1.size()));
        DEBUG("s2 " << s2.Subseq(0, end2));
        OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
        if (result == OverlapInfo()) {
            DEBUG("Empty overlap")
            return result;
        }

        result.r1.shift(int(start1));
        DEBUG("Result " << result)
        return result;
    }

    template<class Graph>
    OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
        DEBUG("Analyzing edges " << g.str(e1) << " and " << g.str(e2));
        return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
    }

private:
    DECL_LOGGER("SWOverlapAnalyzer");
};

}