File: Variation.cpp

package info (click to toggle)
rna-star 2.7.11b%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,492 kB
  • sloc: cpp: 21,951; awk: 827; ansic: 457; makefile: 192; sh: 31
file content (157 lines) | stat: -rw-r--r-- 6,139 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#include "Variation.h"
#include "streamFuns.h"
#include "SequenceFuns.h"
#include "TimeFunctions.h"
#include "serviceFuns.cpp"
#include "ErrorWarning.h"

Variation::Variation (Parameters &Pin, vector <uint> &chrStartIn, map <string,uint> &chrNameIndexIn, bool yesVar) : P(Pin), chrStart(chrStartIn), chrNameIndex(chrNameIndexIn) {
    if (yesVar) {
        yes=true;

        //not used yet
        //varOutFileName=P.outFileNamePrefix+"Variation.out";
        //varOutStream.open(varOutFileName);

        vcfFile=P.var.vcfFile;
        loadVCF(vcfFile);
    } else{
        yes=false;
    };
};

void scanVCF(ifstream& vcf, Parameters& P, SNP& snp, vector <uint> &chrStart, map <string,uint> &chrNameIndex) {
    snp.N=0;
    uint nlines=0;
    uint32 nHomoz=0;//number of homoz alleles
    while (true) {
        string chr,id, ref, alt, dummy, sample;
        uint pos;
        nlines++;

        vcf >> chr;
        if (!vcf.good()) {
            break;
        };

        if (chr.at(0)!='#') {
            vcf >> pos >> id >> ref >> alt >> dummy >> dummy >> dummy >> dummy >> sample;

            vector <string> altV(3);

            if (ref.size()==1 && splitString(alt,',',altV)==1) {//only SNVs allowed - ref=1-char, alt could be comma separated list of 1-char. splitString returns the max lenght of the split strings
                altV.insert(altV.begin(),ref);//add ref to the beginning

                if (chrNameIndex.count(chr)==0) {//chr not in Genome
                    P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": chromosome '"<<chr<<"' not found in Genome fasta file\n";
                } else if (sample.size()<3) {
                    //undefined genotype
                } else if (sample.size()>3 && sample.at(3)!=':') {
                    P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": more than 2 alleles per sample for line number "<<nlines<<"\n";
                } else if (sample.at(0)=='0' && sample.at(2)=='0') {
                    //both alleles are reference, no need to do anything
                } else if (altV.at( atoi(&sample.at(0)) ).at(0)==ref.at(0) && altV.at( atoi(&sample.at(2)) ).at(0)==ref.at(0)) {
                    //both alleles are reference, no need to do anything
                    //this is a strange case in VCF when ALT allele(s) are equal to REF
                } else if ( P.var.heteroOnly && sample.at(0)==sample.at(2) )  {
                    nHomoz++;
                } else {
                    array<char,3> nt1;
                    nt1[0]=convertNt01234( ref.at(0) );
                    nt1[1]=convertNt01234( altV.at( atoi(&sample.at(0)) ).at(0) );
                    nt1[2]=convertNt01234( altV.at( atoi(&sample.at(2)) ).at(0) );
                    if (nt1[0]<4 && nt1[1]<4 && nt1[2]<4) {//only record if variant is ACGT
                        snp.lociV.push_back(pos-1+chrStart[chrNameIndex[chr]]);
                        snp.nt.push_back(nt1);
                        snp.N++;
                    };
                };
            };
        };
        getline(vcf,dummy);
    };

    if (nHomoz>0) {
        P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile 
                         <<": found " << nHomoz << " homozygous SNPs, they will NOT be used." <<endl;
    };
};


void Variation::loadVCF(string fileIn) {
    time_t rawTime;
    time(&rawTime);
    P.inOut->logMain     << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;
    *P.inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;

    ifstream & vcf = ifstrOpen(fileIn, ERROR_OUT, "SOLUTION: check the path and permissions of the VCF file: "+fileIn, P);
    scanVCF(vcf, P, snp, chrStart, chrNameIndex);

    snp.loci=new uint[snp.N];
    for (uint ii=0;ii<snp.N;ii++)
        snp.loci[ii]=snp.lociV[ii];
    snp.lociV.clear();


    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Loaded VCF data, found "<<snp.N<< " SNPs"<<endl;
    if (snp.N==0) {
        ostringstream errOut;
        errOut <<"EXITING because of FATAL INPUT FILE ERROR: could not find any SNPs in VCF file: " <<fileIn<< "\n";
        errOut <<"SOLUTION: check formatting of the VCF file; unzip VCF file or use process substitution.\n";
        exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
    };


    uint *s1=new uint[2*snp.N];
    for (uint ii=0;ii<snp.N; ii++) {
        s1[2*ii]=snp.loci[ii];
        s1[2*ii+1]=ii;
    };

    qsort((void*)s1, snp.N, 2*sizeof(uint), funCompareUint1);

    vector< array<char,3> > nt1=snp.nt;
    for (uint ii=0;ii<snp.N; ii++) {
        snp.loci[ii]=s1[2*ii];
        snp.nt[ii]=nt1.at(s1[2*ii+1]);
    };
    //sort SNPs by coordinate
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Finished sorting VCF data"<<endl;
};

void SNP::snpOnBlocks(uint blockStart, uint blockL, int blockShift, vector<vector<array<int,2>>> &snpV) {
    int32 isnp=binarySearch1b <uint> (blockStart, loci, N);
    while ((uint)isnp<N && loci[isnp]<(blockStart+blockL)) {
        for (int ii=0;ii<2;ii++) {
            if (nt[isnp][ii+1]!=nt[isnp][0]) {//allele different from reference
                array<int,2> snp1;
                snp1[0]=(int) (loci[isnp]-blockStart)+blockShift;
                snp1[1]=(int) nt[isnp][ii+1];
                snpV[ii].push_back(snp1);
            };
        };
        ++isnp;
    };
};

vector<vector<array<int,2>>> Variation::sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1) {
    vector<vector<array<int,2>>> snpV(2);

    if (!yes) {//no variation, return 1 empty element
        vector<vector<array<int,2>>> snpV1(1);
        return snpV1;
    };

    snp.snpOnBlocks(sjStart-sjdbOverhang1, sjdbOverhang1, 0,            snpV);
    snp.snpOnBlocks(sjEnd+1,              sjdbOverhang1, sjdbOverhang1, snpV);

    if (snpV.at(0).empty() && snpV.at(1).empty()) {
        snpV.pop_back();
    } else if (snpV.at(0) == snpV.at(1)) {
        snpV.pop_back();
    };

    return snpV;
};