File: ReadAlign_stitchWindowSeeds.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (278 lines) | stat: -rwxr-xr-x 12,229 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#include <cmath>

#include "IncludeDefine.h"
#include "Parameters.h"
#include "Transcript.h"
#include "ReadAlign.h"
#include "stitchAlignToTranscript.h"
#include "extendAlign.h"
#include "binarySearch2.h"
#include "ErrorWarning.h"

void ReadAlign::stitchWindowSeeds (uint iW, uint iWrec, bool *WAexcl, char *R) {//stitches all seeds in one window: iW

    for (uint iS1=0;iS1<nWA[iW];iS1++) {
        scoreSeedBest[iS1]=0;
        scoreSeedBestMM[iS1]=0;
        scoreSeedBestInd[iS1]=-1;
        if (WAexcl!=NULL && WAexcl[iS1]) continue; //do not include this seed
        intScore score2=0;
        for (uint iS2=0;iS2<=iS1;iS2++) {
            if (WAexcl!=NULL && WAexcl[iS1]) continue; //do not include this seed
            trA1=*trInit;//initialize trA1
            if (iS2<iS1) {
                trA1.nExons=1;
                trA1.nMM=scoreSeedBestMM[iS2];
                trA1.exons[0][EX_R] = WA[iW][iS2][WA_rStart];
                trA1.exons[0][EX_G] = WA[iW][iS2][WA_gStart];
                trA1.exons[0][EX_L] = WA[iW][iS2][WA_Length];
                trA1.exons[0][EX_iFrag]=WA[iW][iS2][WA_iFrag];
                trA1.exons[0][EX_sjA]=WA[iW][iS2][WA_sjA];
                score2=\
                    stitchAlignToTranscript(WA[iW][iS2][WA_rStart]+WA[iW][iS2][WA_Length]-1, WA[iW][iS2][WA_gStart]+WA[iW][iS2][WA_Length]-1,\
                                        WA[iW][iS1][WA_rStart], WA[iW][iS1][WA_gStart], WA[iW][iS1][WA_Length], WA[iW][iS1][WA_iFrag],  WA[iW][iS1][WA_sjA], \
                                        P, R, mapGen, &trA1, outFilterMismatchNmaxTotal);

                if (P.outFilterBySJoutStage==2 && trA1.nExons>1)
                {//junctions have to be present in the filtered set P.sjnovel
                    uint iex=0;
                    if (trA1.canonSJ[iex]>=0 && trA1.sjAnnot[iex]==0)
                    {
                        uint jS=trA1.exons[iex][EX_G]+trA1.exons[iex][EX_L];
                        uint jE=trA1.exons[iex+1][EX_G]-1;
                        if ( binarySearch2(jS,jE,P.sjNovelStart,P.sjNovelEnd,P.sjNovelN) < 0 ) return;
                    };
                };

                //check the length of the iS2 exon. TODO: build the transcripts vs iS1, check the actual exon length
                bool exonLongEnough = trA1.exons[0][EX_L] >= ( trA1.sjAnnot[0]==0 ? P.alignSJoverhangMin : P.alignSJDBoverhangMin );

                if (exonLongEnough && score2>0 && score2+scoreSeedBest[iS2] > scoreSeedBest[iS1] ) {
                    scoreSeedBest[iS1]=score2+scoreSeedBest[iS2];
                    scoreSeedBestMM[iS1]=trA1.nMM;
                    scoreSeedBestInd[iS1]=iS2;
                };
            } else {//extend to the left
                score2=WA[iW][iS1][WA_Length];
                if ( WA[iW][iS1][WA_rStart]>0 \
                     && extendAlign(R, mapGen.G, WA[iW][iS1][WA_rStart]-1, WA[iW][iS1][WA_gStart]-1, -1, -1, WA[iW][iS1][WA_rStart], 100000, 0, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \
                                    P.alignEndsType.ext[WA[iW][iS1][WA_iFrag]][trA.Str], &trA1) ) {//if could extend
                    score2 += trA1.maxScore;
                };

                bool exonLongEnough = (WA[iW][iS1][WA_Length]+trA1.extendL) >= P.alignSJoverhangMin; //TODO new parameter to control end exons length

                if (exonLongEnough && score2 > scoreSeedBest[iS1] ) {
                    scoreSeedBest[iS1]=score2;
                    scoreSeedBestInd[iS1]=iS1;
//                     scoreSeedBestMM[iS1]=trA1.nMM;
                };
            };
        };
    };

    intScore scoreBest=0;
    uint scoreBestInd=0;


    for (uint iS1=0;iS1<nWA[iW];iS1++) {//find the best alignment
        trA1=*trInit;//initialize trA1
        uint tR2=WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length];
        uint tG2=WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length];
        if ( tR2 < Lread-1 \
            && extendAlign(R, mapGen.G, tR2, tG2, +1, +1, Lread-tR2, 100000, scoreSeedBestMM[iS1], outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \
                           P.alignEndsType.ext[WA[iW][iS1][WA_iFrag]][1-trA.Str], &trA1) )
        {//extend to the right
            scoreSeedBest[iS1]+=trA1.maxScore;
        };

        bool exonLongEnough = (WA[iW][iS1][WA_Length]+trA1.extendL) >= P.alignSJoverhangMin; //TODO new parameter to control end exons length

        if (exonLongEnough && scoreSeedBest[iS1]>scoreBest) {//record new best transcript
            scoreBest=scoreSeedBest[iS1];
            scoreBestInd=iS1;
        };
    };

    uint seedN=0;
    while (true) {//construct the sequence of seeds
        seedChain[seedN++]=scoreBestInd;
        WAincl[scoreBestInd]=true;
        if (scoreBestInd>scoreSeedBestInd[scoreBestInd]){//keep going
            scoreBestInd=scoreSeedBestInd[scoreBestInd];
        } else {//this seed is the first one
            break;
        };
    };

    int Score=0;
    {//build final transcript form seedChain
        {//initiate transcript

            uint iS1=seedChain[seedN-1];
            Score= WA[iW][iS1][WA_Length];
            trA.maxScore = Score;
            trA.nMatch = WA[iW][iS1][WA_Length]; //# of matches
            trA.nMM = 0;

            trA.exons[0][EX_R] = trA.rStart = WA[iW][iS1][WA_rStart];
            trA.exons[0][EX_G] = trA.gStart = WA[iW][iS1][WA_gStart];
            trA.exons[0][EX_L] = WA[iW][iS1][WA_Length];
            trA.exons[0][EX_iFrag]=WA[iW][iS1][WA_iFrag];
            trA.exons[0][EX_sjA]=WA[iW][iS1][WA_sjA];

            trA.nExons=1;

        };

        for (uint iSc=seedN-1; iSc>0; iSc--) {//stitch seeds from the chain
            uint iS1=seedChain[iSc], iS2=seedChain[iSc-1];
            int scoreStitch= stitchAlignToTranscript(WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]-1, WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]-1,\
                                        WA[iW][iS2][WA_rStart], WA[iW][iS2][WA_gStart], WA[iW][iS2][WA_Length], WA[iW][iS2][WA_iFrag],  WA[iW][iS2][WA_sjA], \
                                        P, R, mapGen, &trA, outFilterMismatchNmaxTotal);
//            if (scoreStitch>0) {
                Score+=scoreStitch;
//           } else {
//                 cout <<"BUG"<<endl;
//                return;//this should not happen
//            };
        };

        trA.maxScore=Score;

        {//extend to the left
            trA1=*trInit;
            if ( trA.exons[0][EX_R]>0 \
                 && extendAlign(R, mapGen.G, trA.exons[0][EX_R]-1, trA.exons[0][EX_G]-1, -1, -1, trA.exons[0][EX_R], 100000, 0, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax,
                    P.alignEndsType.ext[trA.exons[0][EX_iFrag]][trA.Str], &trA1) ) {//if could extend

                trA.add(&trA1);

                trA.exons[0][EX_R] -=  trA1.extendL;
                trA.exons[0][EX_G] -=  trA1.extendL;
                trA.exons[0][EX_L] +=  trA1.extendL;
                trA.rStart = trA.exons[0][EX_R];
                trA.gStart = trA.exons[0][EX_G];
            };
        };

        {//extend to the right
            uint iS1=seedChain[0];
            trA1=*trInit;//initialize trA1
            uint tR2=WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length];
            uint tG2=WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length];
            if ( tR2 < Lread \
                && extendAlign(R, mapGen.G, tR2, tG2, +1, +1, Lread-tR2, 100000, scoreSeedBestMM[iS1], outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \
                    P.alignEndsType.ext[trA.exons[trA.nExons-1][EX_iFrag]][1-trA.Str], &trA1) ) {//if could extend
                    trA.add(&trA1);
                    trA.exons[trA.nExons-1][EX_L] += trA1.extendL;//extend the length of the last exon
            };
        };

    };

    //debug: recalculate the number of MM
//     {
//         uint nMM1=0;
//         for (uint iex=0;iex<trA.nExons;iex++) {
//             for (uint ii=0;ii<trA.exons[iex][EX_L];ii++) {
//                 if ( R[ii+trA.exons[iex][EX_R]]!=G[ii+trA.exons[iex][EX_G]] && mapGen.G[ii+trA.exons[iex][EX_G]]<4 && R[ii+trA.exons[iex][EX_R]]<4) {
//                     nMM1++;
//                 };
//             };
//         };
//         if (nMM1!=trA.nMM) {
//             cout <<nMM1<<"   "<<trA.nMM<<"    "<<readName<<"   "<<iRead<<endl;
//         };
//     };

    {//calculate some final values for the transcript
        trA.rLength = 0;
        for (uint isj=0;isj<trA.nExons;isj++) {
            trA.rLength += trA.exons[isj][EX_L];
        };
        trA.gLength = trA.exons[trA.nExons-1][EX_G]+1-trA.gStart;

        //calculate some final values for the transcript
        trA.roStart = (trA.roStr == 0) ? trA.rStart : Lread - trA.rStart - trA.rLength;

        if (trA.exons[0][EX_iFrag]==trA.exons[trA.nExons-1][EX_iFrag]) {//mark single fragment transcripts
            trA.iFrag=trA.exons[0][EX_iFrag];
            maxScoreMate[trA.iFrag] = max (maxScoreMate[trA.iFrag] , trA.maxScore);
        } else {
            trA.iFrag=-1;
        };

        if (P.scoreGenomicLengthLog2scale!=0) {//add gap length score
            trA.maxScore += int(ceil( log2( (double) ( trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] - trA.exons[0][EX_G]) ) \
                     * P.scoreGenomicLengthLog2scale - 0.5));
            trA.maxScore = max(0,trA.maxScore);
        };

        //filter strand consistency
        uint sjN=0;
        trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0;
        for (uint iex=0;iex<trA.nExons-1;iex++) {
            if (trA.canonSJ[iex]>=0)
            {//junctions - others are indels
                sjN++;
                trA.intronMotifs[trA.sjStr[iex]]++;
            };
        };

        if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]==0)
            trA.sjMotifStrand=1;
        else if (trA.intronMotifs[1]==0 && trA.intronMotifs[2]>0)
            trA.sjMotifStrand=2;
        else
            trA.sjMotifStrand=0;

        if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0 && P.outFilterIntronStrands=="RemoveInconsistentStrands")
                return;

        if (sjN>0 && trA.sjMotifStrand==0 && P.outSAMstrandField.type==1) {//strand not defined for a junction
            return;
        };

        if (P.outFilterIntronMotifs=="None") {//no filtering

        } else if (P.outFilterIntronMotifs=="RemoveNoncanonical") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0) return;
            };
        } else if (P.outFilterIntronMotifs=="RemoveNoncanonicalUnannotated") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0 && trA.sjAnnot[iex]==0) return;
            };
        } else {
            ostringstream errOut;
            errOut << "EXITING because of FATAL INPUT error: unrecognized value of --outFilterIntronMotifs=" <<P.outFilterIntronMotifs <<"\n";
            errOut << "SOLUTION: re-run STAR with --outFilterIntronMotifs = None -OR- RemoveNoncanonical -OR- RemoveNoncanonicalUnannotated\n";
            exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
        };

//         if (P.outFilterIntronMotifs=="KeepCanonical" && (trA.intronMotifs[0]>0 || (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0) ) ) {//keep only conistent canonical introns
//             return;
//         };


        //check exons lengths including repeats, do not report a transcript with short exons
//        for (uint isj=0;isj<trA.nExons-1;isj++) {//check exons for min length, if they precede a junction
//            if ( trA.canonSJ[isj]>=0 &&
//               ( trA.exons[isj][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][0]
//              || trA.exons[isj+1][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][1]) ) {
//                return;//do not record this transcript in wTr
//            };
//        };
    };

    if (WAexcl==NULL)
    {//record the transcript TODO: allow for multiple transcripts in one window
        *(trAll[iWrec][0])=trA;
        nWinTr[iWrec]=1;
    } else
    {//record 2nd best alignment in this window
        *(trAll[iWrec][1])=trA;
        nWinTr[iWrec]=2;
    };
};