File: SuffixArrayFuns.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (410 lines) | stat: -rw-r--r-- 12,020 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
#include "SuffixArrayFuns.h"
#include "PackedArray.h"

inline uint medianUint2(uint a, uint b)
{
    // returns (a+b)/2
    return a/2 + b/2 + (a%2 + b%2)/2;
};

uint compareSeqToGenome(Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, bool& compRes)
{
    /* compare s to g, find the maximum identity length
     * s2[0] read sequence; s2[1] complementary sequence
     * S position to start search from in s2[0],s2[1]
     * dirR forward or reverse direction search on read sequence
     */

    register int64 ii;

    uint SAstr=mapGen.SA[iSA];
    bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome
    SAstr &= mapGen.GstrandMask;

    char *g=mapGen.G;

    if (dirR && dirG) {//forward on read, forward on genome
        char* s  = s2[0] + S + L;
        g += SAstr + L;
        for (ii=0;(uint) ii < N-L; ii++)
        {
            if (s[ii]!=g[ii])
            {
                if (s[ii]>g[ii])
                {
                    compRes=true;
                    return ii+L;
                } else
                {
                    compRes=false;
                    return ii+L;
                };
            };
        };
//         if (s[ii]>g[ii]) {compRes=true;} else {compRes=false;};
        return N; //exact match
    } else if (dirR && !dirG) {
        char* s  = s2[1] + S + L;
        g += mapGen.nGenome-1-SAstr - L;
        for (ii=0; (uint) ii < N-L; ii++)
        {
            if (s[ii]!=g[-ii])
            {
                if (s[ii]>g[-ii] || g[-ii]>3)
                {
                    compRes=false;
                    return ii+L;
                } else
                {
                    compRes=true;
                    return ii+L;
                };
            };
        };
        return N;
    } else if (!dirR && dirG) {
        char* s  = s2[1] + S - L;
        g += SAstr + L;
        for (ii=0; (uint) ii < N-L; ii++)
        {
            if (s[-ii]!=g[ii])
            {
                if (s[-ii]>g[ii]) {
                    compRes=true;
                    return ii+L;

                } else
                {
                    compRes=false;
                    return ii+L;
                };
            };
        };
        return N;
    } else {//if (!dirR && !dirG)
        char* s  = s2[0] + S - L;
        g += mapGen.nGenome-1-SAstr - L;
        for (ii=0; (uint) ii < N-L; ii++)
        {
            if (s[-ii]!=g[-ii])
            {
                if (s[-ii]>g[-ii] || g[-ii]>3)
                {
                    compRes=false;
                    return ii+L;
                } else
                {
                    compRes=true;
                    return ii+L;
                };
            };
        };
        return N;
    };
};

uint findMultRange(Genome &mapGen, uint i3, uint L3, uint i1, uint L1, uint i1a, uint L1a, uint i1b, uint L1b, char** s, bool dirR, uint S)
{    // given SA index i3 and identity length L3, return the index of the farthest element with the same length, starting from i1,L1 or i1a,L1a, or i1b,L1b

    bool compRes;

    if (L1<L3) { //search between i1 and i3
        L1b=L1; i1b=i1; i1a=i3;
    }
    else {
        if (L1a<L1) {//search between i1a and i1b, else: search bewtween i1a and i1b
            L1b=L1a; i1b=i1a; i1a=i1;
        };
    };
    while ( (i1b+1<i1a)|(i1b>i1a+1) ) { //L1a is the target length, i1a...i1b is the initial range, i1c,L1c is the value in the middle
        uint i1c=medianUint2(i1a,i1b);
        //uint L1c=identityLength(&g[mapGen.SA[i3]+L1b],&g[mapGen.SA[i1c]+L1b],L3-L1b)+L1b;
        uint L1c=compareSeqToGenome(mapGen,s,S,L3,L1b,i1c,dirR,compRes);
        if (L1c==L3) {
            i1a=i1c;
        }
        else { //L1c<L3, move i1c
            i1b=i1c;L1b=L1c;
        };
    };
    return i1a;
};

uint maxMappableLength(Genome &mapGen, char** s, uint S, uint N, uint i1, uint i2, bool dirR, uint& L, uint* indStartEnd)
{
    /* find minimum mappable length of sequence s to the genome g with suffix array SA; length(s)=N; [i1 i2] is initial suffix array search bounds.
     * returns number of mappings (1=unique);range indStartEnd; min mapped length = L
     * binary search in SA space
     */

    bool compRes;

    uint L1,L2,i3,L3,L1a,L1b,L2a,L2b,i1a,i1b,i2a,i2b;

    L1=compareSeqToGenome(mapGen,s,S,N,L,i1,dirR,compRes);
    L2=compareSeqToGenome(mapGen,s,S,N,L,i2,dirR,compRes);

//     L1=identityLength(&s[L],&g[mapGen.SA[i1]]);
//     L2=identityLength(&s[L],&g[mapGen.SA[i2]]);
    L= min(L1,L2);

    L1a=L1;L1b=L1;i1a=i1;i1b=i1;
    L2a=L2;L2b=L2;i2a=i2;i2b=i2;   // track boundaries of best matching suffix array ranges


    //  suffix array
    // ---------------------------
    //   i1, L1
    //
    //   i3, L3
    //
    //   i2, L2
    // --------------------------

    i3=i1;L3=L1; //in case i1+1>=i2 an not iteration of the loope below is ever made
    while (i1+1<i2) {//main binary search loop
        i3=medianUint2(i1,i2);
        L3=compareSeqToGenome(mapGen,s,S,N,L,i3,dirR,compRes);

        if (L3==N) break; //found exact match, exit the binary search

        if (compRes) { //move 1 to 3
            if (L3>L1) {
               L1b=L1a; L1a=L1; i1b=i1a; i1a=i1;
               // L1b, i1b - captures history of last time the max score shifted.
               // L1a, i1a - tracks current shift.
            };
            i1=i3;L1=L3;
        }
        else {
            if (L3>L2) { //move 2 to 3
               L2b=L2a; L2a=L2; i2b=i2a; i2a=i2;
            };
            i2=i3;L2=L3;
        };
        L= min(L1,L2);

    };

    if (L3<N) {//choose longest alignment length between L1 and L2
        if (L1>L2) {
            i3=i1;L3=L1;
        } else {
            i3=i2;L3=L2;
        };
    };
    // now i3,L3 is the "best" alignment, i.e. longest length

    // find the range of SA indices in which the identiyLength is the same
    i1=findMultRange(mapGen,i3,L3,i1,L1,i1a,L1a,i1b,L1b,s,dirR,S);
    i2=findMultRange(mapGen,i3,L3,i2,L2,i2a,L2a,i2b,L2b,s,dirR,S);

    L=L3; //output
    indStartEnd[0]=i1;
    indStartEnd[1]=i2;

    return i2-i1+1;
};


int compareRefEnds (Genome &mapGen, uint64 SAstr,  uint64 gInsert, bool strG, bool strR)
{
    if ( strG)
    {// + strand g
       return strR ? (SAstr < gInsert ? 1:-1) : 1;
    } else
    {// - strand g
       return strR ? -1 : ( gInsert==-1LLU ? -1 : ( SAstr < mapGen.nGenome-gInsert ? 1:-1) );
    };
};

uint compareSeqToGenome1(Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, uint64 gInsert, int & compRes)
{
    /* compare s to g, find the maximum identity length
     * s2[0] read sequence; s2[1] complementary sequence
     * S position to start search from in s2[0],s2[1]
     * dirR: strand of the s
     * different treatment of 5 (spacer) in the sequence and genome
     * 5 is allowed in the sequence
     * 5 in the genome is < than 5 in the sequence
     */

    //TODO no need for complementary sequence

    register int64 ii;

    uint SAstr=mapGen.SA[iSA];
    bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome
    SAstr &= mapGen.GstrandMask;
    char *g=mapGen.G;

    if (dirG) {//forward on read, forward on genome
        char* s  = s2[0] + S + L;
        g += SAstr + L;
        for (ii=0;(uint) ii < N-L; ii++)
        {
            if (s[ii]!=g[ii])
            {
                if (s[ii]>g[ii])
                {
                    compRes=1;
                    return ii+L;
                } else
                {
                    compRes=-1;
                    return ii+L;
                };
            } else if (s[ii]==GENOME_spacingChar)
            {//this already implies the s[ii]==g[ii]
                compRes=compareRefEnds (mapGen, SAstr, gInsert, dirG, dirR);
                return ii+L;
            };
        };
//         if (s[ii]>g[ii]) {compRes=true;} else {compRes=false;};
        return N; //exact match
    }
    else
    {
        char* s  = s2[1] + S + L;
        g += mapGen.nGenome-1-SAstr - L;
        for (ii=0; (uint) ii < N-L; ii++)
        {
            if (s[ii]!=g[-ii])
            {
                char s1=s[ii],g1=g[-ii];
                if (s1<4) s1=3-s1;
                if (g1<4) g1=3-g1;
                if (s1>g1) {
                    compRes=1;
                    return ii+L;
                } else
                {
                    compRes=-1;
                    return ii+L;
                };
                break;
            } else if (s[ii]==GENOME_spacingChar)
            {//this already implies the s[ii]==g[ii]
                compRes=compareRefEnds (mapGen, SAstr, gInsert, dirG, dirR);
                return ii+L;
            };
        };
        return N;
    };
};


uint suffixArraySearch1(Genome &mapGen, char** s, uint S, uint N, uint64 gInsert, bool strR, uint i1, uint i2, uint L)
{
    /* binary search in SA space
     * s[0],s[1] - query sequence, complementary sequence
     * S - start offset
     * N - sequence length
     * g - genome sequence
     * gInsert - index where the sequence insertion happened
     * SA - suffix array
     * strR - strand of the query sequence
     * i1,i2 = starting indices in SA
     * L - starting length
     * output: first SA index > searched string, i.e. g[SA[index-1]]<s<g[SA[index]]
     */

    int compRes;

    uint L1=compareSeqToGenome1(mapGen,s,S,N,L,i1,strR,gInsert,compRes);
    if (compRes<0)
    {// the sequence is smaller than the first index of the SA, cannot proceed
        L=L1;
        return 0;
    };

    uint L2=compareSeqToGenome1(mapGen, s,S,N,L,i2,strR,gInsert,compRes);
    if (compRes>0)
    {//the sequence is bigger than the last SA index, return a huge number
        L=L2;
        return -2llu;
    };

    L=min(L1,L2);

    uint i3=i1,L3=L1; //in case i1+1>=i2 an not iteration of the loope below is ever made
    while (i1+1<i2) {//main binary search loop
        i3=medianUint2(i1,i2);
        L3=compareSeqToGenome1(mapGen,s,S,N,L,i3,strR,gInsert,compRes);//cannot do this because these sj sequences contains spacers=5
        if (L3==N) {//this should not really happen
            L=N;
            return i3;
//             cerr << "Bug L3==N"<<endl;
//             exit(-1); //found exact match of the whole read length, exit the binary search
        };

        if (compRes>0)
        { //move 1 to 3
            i1=i3;L1=L3;
        } else if (compRes<0)
        {//move 2 to 3
            i2=i3;L2=L3;
        }
        L= min(L1,L2);
    };
    return i2; //index at i2 is always bigger than the sequence
};

uint funCalcSAiFromSA(char* gSeq, PackedArray& gSA, Genome &mapGen, uint iSA, int L, int & iL4)
{
    uint SAstr=gSA[iSA];
    bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome
    SAstr &= mapGen.GstrandMask;
    iL4=-1;
    register uint saind=0;
    if (dirG)
    {
        register uint128 g1=*( (uint128*) (gSeq+SAstr) );
        for (int ii=0; ii<L; ii++)
        {
            register char g2=(char) g1;
            if (g2>3)
            {
                iL4=ii;
                saind <<= 2*(L-ii);
                return saind;
            };
            saind=saind<<2;
            saind+=g2;
            g1=g1>>8;
        };
        return saind;
    } else
    {
        register uint128 g1=*( (uint128*) (gSeq+mapGen.nGenome-SAstr-16) );
        for (int ii=0; ii<L; ii++)
        {
            register char g2=(char) (g1>>(8*(15-ii)));
            if (g2>3)
            {
                iL4=ii;
                saind <<= 2*(L-ii);
                return saind;
            };
            saind=saind<<2;
            saind+=3-g2;
        };
        return saind;
    };

};

int64 funCalcSAi(char *G, uint iL)
{
    int64 ind1=0;
    for (uint iL1=0;iL1<=iL;iL1++) {
        uint g=(uint) G[iL1];
        if (g>3) {
            return -ind1;
        } else {
            ind1 <<= 2;
            ind1 += g;
        };
   };
   return ind1;
};