1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
#include "GTF.h"
#include "ErrorWarning.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
GTF::GTF(Genome &genome, Parameters &P, const string &dirOut, SjdbClass &sjdbLoci)
: genome(genome), P(P), dirOut(dirOut), sjdbLoci(sjdbLoci), superTrome(P)
{//initialize; load gtf file; returns number of added junctions
if (genome.sjdbOverhang==0 || genome.pGe.sjdbGTFfile=="-") {//no GTF
gtfYes=false;
return;
};
gtfYes=true;
mkdir(dirOut.c_str(), P.runDirPerm);
time_t rawTime;
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... processing annotations GTF\n" <<flush;
*P.inOut->logStdOut << timeMonthDayTime(rawTime) <<" ..... processing annotations GTF\n" <<flush;
std::map <string,uint64> transcriptIDnumber, geneIDnumber;
ifstream sjdbStreamIn ( genome.pGe.sjdbGTFfile.c_str() );
if (sjdbStreamIn.fail()) {
ostringstream errOut;
errOut << "FATAL error, could not open file pGe.sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<"\n";
exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
};
if (genome.chrNameIndex.size()==0) {
for (uint64 ii=0;ii<genome.nChrReal;ii++) {
genome.chrNameIndex[genome.chrName[ii]]=ii;
};
};
exonN=0;
while (sjdbStreamIn.good()) {//count the number of exons
string oneLine,chr1,ddd2,featureType;
getline(sjdbStreamIn,oneLine);
istringstream oneLineStream (oneLine);
oneLineStream >> chr1 >> ddd2 >> featureType;
if (chr1.substr(0,1)!="#" && featureType==genome.pGe.sjdbGTFfeatureExon) {
exonN++;
};
};
if (exonN==0) {
ostringstream errOut;
errOut << "Fatal INPUT FILE error, no ""exon"" lines in the GTF file: " << genome.pGe.sjdbGTFfile <<"\n";
errOut << "Solution: check the formatting of the GTF file, it must contain some lines with ""exon"" in the 3rd column.\n";
errOut << " Make sure the GTF file is unzipped.\n";
errOut << " If exons are marked with a different word, use --sjdbGTFfeatureExon .\n";
exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
};
exonLoci.resize(exonN);
exonN=0;//will re-calculate
sjdbStreamIn.clear();
sjdbStreamIn.seekg(0,ios::beg);
while (sjdbStreamIn.good()) {
string oneLine,chr1,ddd2,featureType;
getline(sjdbStreamIn,oneLine);
istringstream oneLineStream (oneLine);
oneLineStream >> chr1 >> ddd2 >> featureType;
if (chr1.substr(0,1)!="#" && featureType==genome.pGe.sjdbGTFfeatureExon) {//exonic line, process
if (genome.pGe.sjdbGTFchrPrefix!="-") chr1=genome.pGe.sjdbGTFchrPrefix + chr1;
if (genome.chrNameIndex.count(chr1)==0) {//chr not in Genome
P.inOut->logMain << "WARNING: while processing sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in Genome fasta files for line:\n";
P.inOut->logMain << oneLine <<"\n"<<flush;
continue; //do not process exons/transcripts on missing chromosomes
};
uint64 ex1,ex2;
char str1;
oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last
if ( ex2 > genome.chrLength[genome.chrNameIndex[chr1]] ) {
warningMessage("while processing sjdbGTFfile=" + genome.pGe.sjdbGTFfile + ", line:\n" + oneLine + "\n exon end = " + to_string(ex2) + \
" is larger than the chromosome " + chr1 + " length = " + to_string(genome.chrLength[genome.chrNameIndex[chr1]] ) + " , will skip this exon\n", \
std::cerr, P.inOut->logMain, P);
continue;
};
string oneLine1;
getline(oneLineStream, oneLine1);//get the last field
replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes
replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing
replace(oneLine1.begin(),oneLine1.end(),'\t',' ');//replace tabs
replace(oneLine1.begin(),oneLine1.end(),'\"',' ');//now the only separator is space
//string trID(""), gID(""), attr1(""),gName(""),gBiotype("");
vector<vector<string>> exAttrNames({ {genome.pGe.sjdbGTFtagExonParentTranscript}, {genome.pGe.sjdbGTFtagExonParentGene}, genome.pGe.sjdbGTFtagExonParentGeneName, genome.pGe.sjdbGTFtagExonParentGeneType }); //trID, gID, gName, gBiotype
vector<string> exAttr; //trID, gID, gName, gBiotype
exAttr.resize(exAttrNames.size());
for (uint32 ii=0; ii<exAttrNames.size(); ii++) {
for (auto &attr1 : exAttrNames[ii]) {//scan through possible names
size_t pos1=oneLine1.find(" " + attr1 + " "); //attribute name is separated by spaces
if (pos1!=string::npos)
pos1=oneLine1.find_first_not_of(" ", pos1+attr1.size()+1);
if (pos1!=string::npos) {
exAttr[ii]=oneLine1.substr(pos1, oneLine1.find_first_of(" ",pos1)-pos1);
};
};
};
if (exAttr[0]=="") {//no transcript ID
P.inOut->logMain << "WARNING: while processing pGe.sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<": no transcript_id for line:\n";
P.inOut->logMain << oneLine <<"\n"<<flush;
exAttr[0]="tr_" + chr1 +"_"+ to_string(ex1) +"_"+ to_string(ex2) +"_"+ to_string(exonN); //unique name for the transcript
};
if (exAttr[1]=="") {//no gene ID
P.inOut->logMain << "WARNING: while processing pGe.sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<": no gene_id for line:\n";
P.inOut->logMain << oneLine <<"\n"<<flush;
exAttr[1]="MissingGeneID";
};
if (exAttr[2]=="") {//no gene name
exAttr[2]=exAttr[1];
};
if (exAttr[3]=="") {//no gene name
exAttr[3]="MissingGeneType";
};
transcriptIDnumber.insert(std::pair <string,uint64> (exAttr[0],(uint64) transcriptIDnumber.size()));//insert new element if necessary with a new numeric value
if (transcriptID.size() < transcriptIDnumber.size()) {//new transcript
transcriptID.push_back(exAttr[0]);
if (str1=='+') {
transcriptStrand.push_back(1);
} else if (str1=='-') {
transcriptStrand.push_back(2);
} else {
transcriptStrand.push_back(0);
};
};
geneIDnumber.insert(std::pair <string,uint64> (exAttr[1],(uint64) geneIDnumber.size()));//insert new element if necessary with a $
if (geneID.size() < geneIDnumber.size()) {//new gene is added
geneID.push_back(exAttr[1]);
geneAttr.push_back({exAttr[2],exAttr[3]});
};
exonLoci[exonN][exT]=transcriptIDnumber[exAttr[0]];
exonLoci[exonN][exS]=ex1+genome.chrStart[genome.chrNameIndex[chr1]]-1;
exonLoci[exonN][exE]=ex2+genome.chrStart[genome.chrNameIndex[chr1]]-1;
exonLoci[exonN][exG]=geneIDnumber[exAttr[1]];
++exonN;
};//if (chr1.substr(0,1)!="#" && featureType=="exon")
};//
if (exonN==0) {
ostringstream errOut;
errOut << "Fatal INPUT FILE error, no valid ""exon"" lines in the GTF file: " << genome.pGe.sjdbGTFfile <<"\n";
errOut << "Solution: check the formatting of the GTF file. One likely cause is the difference in chromosome naming between GTF and FASTA file.\n";
exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
};
exonLoci.resize(exonN); //previous exonLoci.size() was an estimate, need to reszie to the actual value
return;
};
|