File: Variation.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (151 lines) | stat: -rw-r--r-- 5,735 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#include "Variation.h"
#include "streamFuns.h"
#include "SequenceFuns.h"
#include "TimeFunctions.h"
#include "serviceFuns.cpp"
#include "ErrorWarning.h"

Variation::Variation (Parameters &Pin, vector <uint> &chrStartIn, map <string,uint> &chrNameIndexIn) : P(Pin), chrStart(chrStartIn), chrNameIndex(chrNameIndexIn) {
    if (!P.var.yes) {
        yes=false;
        return;
    };

    yes=true;

    //not used yet
    //varOutFileName=P.outFileNamePrefix+"Variation.out";
    //varOutStream.open(varOutFileName);

    vcfFile=P.var.vcfFile;
    loadVCF(vcfFile);

};

void scanVCF(ifstream& vcf, Parameters& P, SNP& snp, vector <uint> &chrStart, map <string,uint> &chrNameIndex) {
    snp.N=0;
    uint nlines=0;
    while (true) {
        string chr,id, ref, alt, dummy, sample;
        uint pos;
        nlines++;

        vcf >> chr;
        if (!vcf.good()) {
            break;
        };

        if (chr.at(0)!='#') {
            vcf >> pos >> id >> ref >> alt >> dummy >> dummy >> dummy >> dummy >> sample;

            vector <string> altV(3);

            if (ref.size()==1 && splitString(alt,',',altV)==1) {//only SNVs allowed - ref=1-char, alt could be comma separated list of 1-char. splitString returns the max lenght of the split strings
                altV.insert(altV.begin(),ref);//add ref to the beginning

                if (chrNameIndex.count(chr)==0) {//chr not in Genome
                    P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": chromosome '"<<chr<<"' not found in Genome fasta file\n";
                } else if (sample.size()<3) {
                    //undefined genotype
                } else if (sample.size()>3 && sample.at(3)!=':') {
                    P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": more than 2 alleles per sample for line number "<<nlines<<"\n";
                } else if (sample.at(0)=='0' && sample.at(2)=='0') {
                    //both alleles are reference, no need to do anything
                } else if (altV.at( atoi(&sample.at(0)) ).at(0)==ref.at(0) && altV.at( atoi(&sample.at(2)) ).at(0)==ref.at(0)) {
                    //both alleles are reference, no need to do anything
                    //this is a strange case in VCF when ALT allele(s) are equal to REF
                } else {
                    array<char,3> nt1;
                    nt1[0]=convertNt01234( ref.at(0) );
                    nt1[1]=convertNt01234( altV.at( atoi(&sample.at(0)) ).at(0) );
                    nt1[2]=convertNt01234( altV.at( atoi(&sample.at(2)) ).at(0) );
                    if (nt1[0]<4 && nt1[1]<4 && nt1[2]<4) {//only record if variant is ACGT
                        snp.lociV.push_back(pos-1+chrStart[chrNameIndex[chr]]);
                        snp.nt.push_back(nt1);
                        snp.N++;
                    };
                };
            };
        };
        getline(vcf,dummy);
    };
};


void Variation::loadVCF(string fileIn) {
    time_t rawTime;
    time(&rawTime);
    P.inOut->logMain     << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;
    *P.inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;

    ifstream & vcf = ifstrOpen(fileIn, ERROR_OUT, "SOLUTION: check the path and permissions of the VCF file: "+fileIn, P);
    scanVCF(vcf, P, snp, chrStart, chrNameIndex);

    snp.loci=new uint[snp.N];
    for (uint ii=0;ii<snp.N;ii++)
        snp.loci[ii]=snp.lociV[ii];
    snp.lociV.clear();


    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Loaded VCF data, found "<<snp.N<< " SNPs"<<endl;
    if (snp.N==0) {
        ostringstream errOut;
        errOut <<"EXITING because of FATAL INPUT FILE ERROR: could not find any SNPs in VCF file: " <<fileIn<< "\n";
        errOut <<"SOLUTION: check formatting of the VCF file; unzip VCF file or use process substitution.\n";
        exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
    };


    uint *s1=new uint[2*snp.N];
    for (uint ii=0;ii<snp.N; ii++) {
        s1[2*ii]=snp.loci[ii];
        s1[2*ii+1]=ii;
    };

    qsort((void*)s1, snp.N, 2*sizeof(uint), funCompareUint1);

    vector< array<char,3> > nt1=snp.nt;
    for (uint ii=0;ii<snp.N; ii++) {
        snp.loci[ii]=s1[2*ii];
        snp.nt[ii]=nt1.at(s1[2*ii+1]);
    };
    //sort SNPs by coordinate
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Finished sorting VCF data"<<endl;
};

void SNP::snpOnBlocks(uint blockStart, uint blockL, int blockShift, vector<vector<array<int,2>>> &snpV) {
    int32 isnp=binarySearch1b <uint> (blockStart, loci, N);
    while ((uint)isnp<N && loci[isnp]<(blockStart+blockL)) {
        for (int ii=0;ii<2;ii++) {
            if (nt[isnp][ii+1]!=nt[isnp][0]) {//allele different from reference
                array<int,2> snp1;
                snp1[0]=(int) (loci[isnp]-blockStart)+blockShift;
                snp1[1]=(int) nt[isnp][ii+1];
                snpV[ii].push_back(snp1);
            };
        };
        ++isnp;
    };
};

vector<vector<array<int,2>>> Variation::sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1) {
    vector<vector<array<int,2>>> snpV(2);

    if (!yes) {//no variation, return 1 empty element
        vector<vector<array<int,2>>> snpV1(1);
        return snpV1;
    };

    snp.snpOnBlocks(sjStart-sjdbOverhang1, sjdbOverhang1, 0,            snpV);
    snp.snpOnBlocks(sjEnd+1,              sjdbOverhang1, sjdbOverhang1, snpV);

    if (snpV.at(0).empty() && snpV.at(1).empty()) {
        snpV.pop_back();
    } else if (snpV.at(0) == snpV.at(1)) {
        snpV.pop_back();
    };

    return snpV;
};