1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
|
#include "Variation.h"
#include "streamFuns.h"
#include "SequenceFuns.h"
#include "TimeFunctions.h"
#include "serviceFuns.cpp"
#include "ErrorWarning.h"
Variation::Variation (Parameters &Pin, vector <uint> &chrStartIn, map <string,uint> &chrNameIndexIn) : P(Pin), chrStart(chrStartIn), chrNameIndex(chrNameIndexIn) {
if (!P.var.yes) {
yes=false;
return;
};
yes=true;
//not used yet
//varOutFileName=P.outFileNamePrefix+"Variation.out";
//varOutStream.open(varOutFileName);
vcfFile=P.var.vcfFile;
loadVCF(vcfFile);
};
void scanVCF(ifstream& vcf, Parameters& P, SNP& snp, vector <uint> &chrStart, map <string,uint> &chrNameIndex) {
snp.N=0;
uint nlines=0;
while (true) {
string chr,id, ref, alt, dummy, sample;
uint pos;
nlines++;
vcf >> chr;
if (!vcf.good()) {
break;
};
if (chr.at(0)!='#') {
vcf >> pos >> id >> ref >> alt >> dummy >> dummy >> dummy >> dummy >> sample;
vector <string> altV(3);
if (ref.size()==1 && splitString(alt,',',altV)==1) {//only SNVs allowed - ref=1-char, alt could be comma separated list of 1-char. splitString returns the max lenght of the split strings
altV.insert(altV.begin(),ref);//add ref to the beginning
if (chrNameIndex.count(chr)==0) {//chr not in Genome
P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": chromosome '"<<chr<<"' not found in Genome fasta file\n";
} else if (sample.size()<3) {
//undefined genotype
} else if (sample.size()>3 && sample.at(3)!=':') {
P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": more than 2 alleles per sample for line number "<<nlines<<"\n";
} else if (sample.at(0)=='0' && sample.at(2)=='0') {
//both alleles are reference, no need to do anything
} else if (altV.at( atoi(&sample.at(0)) ).at(0)==ref.at(0) && altV.at( atoi(&sample.at(2)) ).at(0)==ref.at(0)) {
//both alleles are reference, no need to do anything
//this is a strange case in VCF when ALT allele(s) are equal to REF
} else {
array<char,3> nt1;
nt1[0]=convertNt01234( ref.at(0) );
nt1[1]=convertNt01234( altV.at( atoi(&sample.at(0)) ).at(0) );
nt1[2]=convertNt01234( altV.at( atoi(&sample.at(2)) ).at(0) );
if (nt1[0]<4 && nt1[1]<4 && nt1[2]<4) {//only record if variant is ACGT
snp.lociV.push_back(pos-1+chrStart[chrNameIndex[chr]]);
snp.nt.push_back(nt1);
snp.N++;
};
};
};
};
getline(vcf,dummy);
};
};
void Variation::loadVCF(string fileIn) {
time_t rawTime;
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;
*P.inOut->logStdOut << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <<flush;
ifstream & vcf = ifstrOpen(fileIn, ERROR_OUT, "SOLUTION: check the path and permissions of the VCF file: "+fileIn, P);
scanVCF(vcf, P, snp, chrStart, chrNameIndex);
snp.loci=new uint[snp.N];
for (uint ii=0;ii<snp.N;ii++)
snp.loci[ii]=snp.lociV[ii];
snp.lociV.clear();
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Loaded VCF data, found "<<snp.N<< " SNPs"<<endl;
if (snp.N==0) {
ostringstream errOut;
errOut <<"EXITING because of FATAL INPUT FILE ERROR: could not find any SNPs in VCF file: " <<fileIn<< "\n";
errOut <<"SOLUTION: check formatting of the VCF file; unzip VCF file or use process substitution.\n";
exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
};
uint *s1=new uint[2*snp.N];
for (uint ii=0;ii<snp.N; ii++) {
s1[2*ii]=snp.loci[ii];
s1[2*ii+1]=ii;
};
qsort((void*)s1, snp.N, 2*sizeof(uint), funCompareUint1);
vector< array<char,3> > nt1=snp.nt;
for (uint ii=0;ii<snp.N; ii++) {
snp.loci[ii]=s1[2*ii];
snp.nt[ii]=nt1.at(s1[2*ii+1]);
};
//sort SNPs by coordinate
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Finished sorting VCF data"<<endl;
};
void SNP::snpOnBlocks(uint blockStart, uint blockL, int blockShift, vector<vector<array<int,2>>> &snpV) {
int32 isnp=binarySearch1b <uint> (blockStart, loci, N);
while ((uint)isnp<N && loci[isnp]<(blockStart+blockL)) {
for (int ii=0;ii<2;ii++) {
if (nt[isnp][ii+1]!=nt[isnp][0]) {//allele different from reference
array<int,2> snp1;
snp1[0]=(int) (loci[isnp]-blockStart)+blockShift;
snp1[1]=(int) nt[isnp][ii+1];
snpV[ii].push_back(snp1);
};
};
++isnp;
};
};
vector<vector<array<int,2>>> Variation::sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1) {
vector<vector<array<int,2>>> snpV(2);
if (!yes) {//no variation, return 1 empty element
vector<vector<array<int,2>>> snpV1(1);
return snpV1;
};
snp.snpOnBlocks(sjStart-sjdbOverhang1, sjdbOverhang1, 0, snpV);
snp.snpOnBlocks(sjEnd+1, sjdbOverhang1, sjdbOverhang1, snpV);
if (snpV.at(0).empty() && snpV.at(1).empty()) {
snpV.pop_back();
} else if (snpV.at(0) == snpV.at(1)) {
snpV.pop_back();
};
return snpV;
};
|