File: SoloBarcode.cpp

package info (click to toggle)
rna-star 2.7.11b%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,492 kB
  • sloc: cpp: 21,951; awk: 827; ansic: 457; makefile: 192; sh: 31
file content (152 lines) | stat: -rw-r--r-- 6,348 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include "SoloBarcode.h"
#include "serviceFuns.cpp"
#include "SoloCommon.h"
#include "SequenceFuns.h"
#include "ParametersSolo.h"

void wlAddMismatches(uint32 nMM, uint32 cbLen, vector<uintCB> &wl, vector<uintCB> &wlEd1, vector<uint32> &wlInd1);

void SoloBarcode::sortWhiteList(ParametersSolo *pSolo)
{
    totalSize=0;
    minLen=(uint32)-1;
    wlAdd.resize( wl.size() );
    if (pSolo->CBmatchWL.EditDist_2) {
        wlEd.resize( wl.size() );
        wlEdInd.resize( wl.size() );
    };

    for (uint32 ilen1=1; ilen1 < wl.size(); ilen1++) {//scan through different lengths for this CB
        wlAdd[ilen1]=totalSize;
        if (wl[ilen1].size()>0) {
            if (ilen1<minLen)
                minLen=ilen1;
            std::sort(wl[ilen1].begin(),wl[ilen1].end());//sort
            auto un1=std::unique(wl[ilen1].begin(),wl[ilen1].end());//collapse identical
            wl[ilen1].resize(std::distance(wl[ilen1].begin(),un1));
            totalSize += wl[ilen1].size();

            if (pSolo->CBmatchWL.EditDist_2) {//add mismatches
                wlAddMismatches(2, ilen1, wl[ilen1], wlEd[ilen1], wlEdInd[ilen1]);
            };

        };
    };
};

void SoloBarcode::extractPositionsFromString(string &strIn)
{
    vector<string> p(4);
    splitString(strIn,'_',p);
    anchorType[0] = std::stoi(p[0]);
    anchorType[1] = std::stoi(p[2]);
    anchorDist[0] = std::stoi(p[1]);
    anchorDist[1] = std::stoi(p[3]);
};

void wlAddMismatches(uint32 nMM, uint32 cbLen, vector<uintCB> &wl, vector<uintCB> &wlEd1, vector<uint32> &wlEdInd1)
{
    struct type_cbMMind {
        uintCB cb;
        uint32 ind;
        uint32  mm;
    };

    //max size
    uint64 ntot=wl.size()*(std::pow(cbLen*3,nMM+1)-1)/(cbLen*3-1); //geometric series 1+L+L*L+...

    vector<type_cbMMind> cbMMind;
    cbMMind.reserve(ntot);

    for (uint32 icb=0; icb<wl.size(); icb++) {//fill in 0-MM CBs from wl
        cbMMind.push_back({wl[icb], icb, 0});
    };

    uint64 ind1=0, ind2=wl.size(); //define boundaries of the previous mismatch level
    for (uint32 mm=1; mm<=nMM; mm++) {//main cycle over number of MM
        uint64 ind3=ind2; //current index
        for (uint64 ii=ind1; ii<ind2; ii++) {//add 1 mismatch to previous-mismatch-level sequences
            for (uint32 ll=0; ll<cbLen*2; ll+=2) {//2 bits for each base of CB
                for (uintCB jj=1; jj<4; jj++) {//change to 1 of 3 values. Note that jj=0 corresponds to no change
                    uintCB cbmm = cbMMind[ii].cb^(jj<<ll);
                    //if (cbmm==27 && mm==3)
                    //    cout << ii;
                    cbMMind.push_back({cbmm, cbMMind[ii].ind, mm});
                    ++ind3;
                };
            };
        };

        if (mm==2) {//ins+del only added at mm=ed=2
            for (uint64 ii=0; ii<wl.size(); ii++) {//ins+del added to original barcodes (without mm)
                uintCB cbmm = cbMMind[ii].cb;
                //cout << convertNuclInt64toString(cbmm, cbLen)<<endl;
                for (uint32 ld=0; ld<cbLen*2; ld+=2) {//del
                    uintCB maskd1 = ((uintCB)-1)<<(ld+2);
                    uintCB maskd = (~maskd1)>>2;
                    uintCB cbmmd = (cbmm & maskd) | ((cbmm & maskd1)>>2);
                    //cout << convertNuclInt64toString(cbmmd, cbLen)<<endl;
                    for (uint32 ll=0; ll<cbLen*2; ll+=2) {//ins
                        uintCB cbmm1 = cbmmd<<2;
                        //cout << convertNuclInt64toString(cbmm1, cbLen) <<" "<< ll <<endl;
                        uintCB mask1 = ((uintCB)-1)<<(ll+2);
                        //cout << convertNuclInt64toString(mask1, cbLen)<<endl;
                        uintCB mask = (~mask1)>>2;
                        //cout << convertNuclInt64toString(mask, cbLen)<<endl;                
                        uintCB cbmm2 = (cbmmd & mask) | (cbmm1 & mask1);
                        //cout << convertNuclInt64toString(cbmm2, cbLen)<<endl;
                        
                        for (uintCB jj=0; jj<4; jj++) {
                            uintCB cbmm3 = cbmm2 | (jj<<ll);//cbmm2 has A=00 in ll-position. Change this to one of 4 bases
                            //cout << convertNuclInt64toString(cbmm3, cbLen) <<" "<< jj<<endl;
                            cbMMind.push_back({cbmm3, cbMMind[ii].ind, mm});
                            ++ind3;
                        };
                        //cout << flush;
                    };
                };
            };
        };
        ind1 = ind2;
        ind2 = ind3;
    };

    //select edited SB sequences that have a unique match in 
    std::sort(cbMMind.begin(), cbMMind.end(), [](const type_cbMMind &c1, const type_cbMMind &c2) 
                                                    {return (c1.cb<c2.cb) || (c1.cb==c2.cb && c1.mm<c2.mm) || (c1.cb==c2.cb && c1.mm==c2.mm && c1.ind<c2.ind);
                                                    } );

    uint64 nCBout = 0;
    vector<uint64> nCBmm(nMM+1,0);
    uintCB prevCB = (uintCB)(-1);
    for (uint64 ii=0; ii<cbMMind.size(); ii++) {

        if (ii<cbMMind.size()-1 && cbMMind[ii].cb==cbMMind[ii+1].cb && cbMMind[ii].mm==cbMMind[ii+1].mm && cbMMind[ii].ind==cbMMind[ii+1].ind) {
            cbMMind[ii].mm = nMM+1;
            continue;//skip identical records
        };

        if (    ( ii>0 && cbMMind[ii].cb == prevCB ) 
             || ( ii<cbMMind.size()-1 && cbMMind[ii].cb == cbMMind[ii+1].cb &&  cbMMind[ii].mm == cbMMind[ii+1].mm ) ) {
            cbMMind[ii].mm = nMM+1; //this cb is equal to previous, mark to skip
            //cout << convertNuclInt64toString(cbMMind[ii].cb, cbLen) <<' '<< cbMMind[ii].mm << " 2" << ' '<< convertNuclInt64toString(wl[cbMMind[ii].ind], cbLen)<<' '<<wl[cbMMind[ii].ind]<<'\n';
        } else {
            nCBout++;
            nCBmm[cbMMind[ii].mm]++;
            //cout << convertNuclInt64toString(cbMMind[ii].cb, cbLen) <<' '<< cbMMind[ii].mm << " 1" << ' '<< convertNuclInt64toString(wl[cbMMind[ii].ind], cbLen); //<<' '<< cbMMind[ii].cb;
            //cout <<'\n';
        };
        prevCB = cbMMind[ii].cb;
    };

    wlEd1.resize(nCBout);
    wlEdInd1.resize(nCBout);
    uint32 icb=0;
    for (auto &cb1: cbMMind) {
        if (cb1.mm<=nMM) {
            wlEd1[icb] = cb1.cb;
            wlEdInd1[icb] = cb1.ind;
            ++icb;
        };
    };
};