File: Transcript_convertGenomeCigar.cpp

package info (click to toggle)
rna-star 2.7.11b%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,492 kB
  • sloc: cpp: 21,951; awk: 827; ansic: 457; makefile: 192; sh: 31
file content (161 lines) | stat: -rwxr-xr-x 6,347 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include "Transcript.h"
bool Transcript::convertGenomeCigar(Genome &genOut, Transcript & A)
{
    auto &coBl=genOut.genomeOut.convBlocks;    
    auto cBit=coBl.begin();
    cBit = std::upper_bound(cBit, coBl.end(), array<uint64,3> {gStart,0,0},
                               [](const array<uint64,3> &t1, const array<uint64,3> &t2)
                               {
                                  return t1[0] < t2[0];
                               });
    --cBit;
    
    uint64 icB = cBit-coBl.begin();
    
    A.gStart=gStart-coBl[icB][0]+coBl[icB][2];
    
    A.cigar.clear();
    A.cigar.reserve(cigar.size()+100); //add 100 for junctions
    
    uint64 gEnd=gStart;//gEnd is end+1
    //uint32 l1=0, l2=0;
    //uint32 gGapNew=0;
    for (uint32 iC=0; iC<cigar.size(); iC++) {
        auto cc=cigar[iC];        
        //if (cc[0]==BAM_CIGAR_M)
        //    l1+=cc[1];
        if (cc[0]==BAM_CIGAR_M || cc[0]==BAM_CIGAR_D || cc[0]==BAM_CIGAR_N) {//M,D,N: blocks that have to be split if they overlap blocks boundaries
            uint64 gStart1=gEnd;
            gEnd += cc[1];
            uint64 bE;
            while (( bE=coBl[icB][0]+coBl[icB][1] ) <= gEnd) {//MDN-block ends at the c-block end, or goes past
                uint32 len1 = (uint32)( bE-gStart1 );//first part of the a-block
                
                //if (cc[0]==BAM_CIGAR_M)
                //    l2+=len1;

                A.cigar.push_back({cc[0], len1});
                
                len1 = (uint32)( coBl[icB+1][2]-coBl[icB][2]-coBl[icB][1] );//insert junction gap
                //gGapNew += len1;
                
                A.cigar.push_back({BAM_CIGAR_N, len1});
                
                gStart1=bE;
                ++icB;
            };
            
            if (gEnd>gStart1)
                A.cigar.push_back({ cc[0], (uint32) (gEnd-gStart1) });
              
            //if(cc[0]==BAM_CIGAR_M)
            //    l2 += gEnd-gStart1;
            //if (l1!=l2) {
            //    cout << l1 <<" "<< l2 <<endl;                
            //};
            
            if (gEnd<coBl[icB][0]) //step back
                --icB;
                
        } else {//S,I
            //l1+=cc[1];
            //l2+=cc[1];
            A.cigar.push_back(cc);
        };        
    };
    //if (l1!=l2) {
    //    cout << l1 <<" "<< l2 <<endl;                
    //};
    
    {//merge consecutive identical operations
        auto c0=A.cigar.begin();
        for (auto c1=A.cigar.begin()+1; c1!=A.cigar.end(); c1++) {
            if ( c1[0][0]==c0[0][0] ) {//add length to the prev element
                c0[0][1] += c1[0][1];
            } else if ( c1[0][0]==BAM_CIGAR_N && c0[0][0]==BAM_CIGAR_I && c0[-1][0]==BAM_CIGAR_N){
                //NIN : add junction to the junction before I, since I placement does not matter
                //relying on I operation not being the first one, so c0 is at least cigar.begin+1
                c0[-1][1] += c1[0][1];
            } else{
                c0++;
                *c0=*c1;
            };
        };
        A.cigar.resize(c0+1-A.cigar.begin());
    };
     
    {//remove last junction, or the junction before last S - it could appear if the last cigar M lands at the end of the c-block
        if (A.cigar.back()[0] == BAM_CIGAR_N) {
            //gGapNew-=A.cigar.back()[1];
            A.cigar.pop_back(); 
        };
        if (A.cigar.back()[0] == BAM_CIGAR_S && (*(A.cigar.end()-2))[0] == BAM_CIGAR_N) {
            //gGapNew-=(*(A.cigar.end()-2))[1];
            A.cigar.erase(A.cigar.end()-2);
        };
    };
    
    {//remove start junctions with too short overhangs
        //TODO: loop over until all such junctions are removed
        array<uint32,5> nOp={0,0,0,0,0};
        for (auto c1=A.cigar.begin(); c1!=A.cigar.end(); c1++) {
            nOp[(*c1)[0]] += (*c1)[1];
            if ( (*c1)[0] == BAM_CIGAR_M ) {
                if (nOp[BAM_CIGAR_M] >= genOut.P.alignSJDBoverhangMin) //TODO Transcript class should have P
                    break; //overhang is long enough
            } else if ( (*c1)[0]==BAM_CIGAR_N ) {//short overhang for this junction
                //find the next M
                c1++;
                while ( (*c1)[0]!=BAM_CIGAR_M ) {
                    nOp[(*c1)[0]] += (*c1)[1];
                    c1++;
                };
                //remove junction
                A.cigar.erase(A.cigar.begin(), c1-1);//keep one element at the beginning to convert it to S
                A.cigar.at(0)={BAM_CIGAR_S, nOp[BAM_CIGAR_M]+nOp[BAM_CIGAR_S]+nOp[BAM_CIGAR_I]};
                A.gStart += nOp[BAM_CIGAR_M]+nOp[BAM_CIGAR_D]+nOp[BAM_CIGAR_N];
                break;
            };
        };
    };
    
    {//remove terminal junctions with too short overhangs
        //TODO: loop over until all such junctions are removed
        array<uint32,5> nOp={0,0,0,0,0};
        for (auto c1=A.cigar.end()-1; c1!=A.cigar.begin(); c1--) {
            nOp[(*c1)[0]] += (*c1)[1];
            if ( (*c1)[0] == BAM_CIGAR_M ) {
                if (nOp[BAM_CIGAR_M] >= genOut.P.alignSJDBoverhangMin) //TODO Transcript class should have P
                    break; //overhang is long enough
            } else if ( (*c1)[0]==BAM_CIGAR_N ) {//short overhang for this junction
                //find the next M
                c1--;
                while ( (*c1)[0]!=BAM_CIGAR_M ) {
                    nOp[(*c1)[0]] += (*c1)[1];
                    c1--;
                };
                //remove junction
                A.cigar.erase(c1+1, A.cigar.end());
                A.cigar.push_back({BAM_CIGAR_S, nOp[BAM_CIGAR_M]+nOp[BAM_CIGAR_S]+nOp[BAM_CIGAR_I]});
                break;
            };
        };
    };
    
    //TODO: also need to recalculate nMM
    A.gLength=0;
    for (const auto &cc : A.cigar)
        if (cc[0]==BAM_CIGAR_M || cc[0]==BAM_CIGAR_D || cc[0]==BAM_CIGAR_N)
            A.gLength += cc[1];
        
    A.Str = Str;
    if (A.gStart >= genOut.genomeOut.nMinusStrandOffset) {//convert to +strand
        A.Str = 1-A.Str;
        A.gStart = 2*genOut.genomeOut.nMinusStrandOffset - (A.gStart+A.gLength);
        std::reverse(A.cigar.begin(), A.cigar.end());
    };
        
    A.Chr = genOut.chrBin[A.gStart >> genOut.pGe.gChrBinNbits];
    
    return (true); //conversion is always possible
};