File: GTF_transcriptGeneSJ.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (183 lines) | stat: -rwxr-xr-x 8,662 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#include "GTF.h"
#include "serviceFuns.cpp"
#include "streamFuns.h"

//#include <ctime>
#include <map>

#define GTF_extrLoci_size 6
#define GTF_extrTrStart(ii) ((ii)*GTF_extrLoci_size)
#define GTF_extrTrEnd(ii) ((ii)*GTF_extrLoci_size+1)
#define GTF_extrTrID(ii) ((ii)*GTF_extrLoci_size+2)
#define GTF_extrExStart(ii) ((ii)*GTF_extrLoci_size+3)
#define GTF_extrExEnd(ii) ((ii)*GTF_extrLoci_size+4)
#define GTF_extrGeID(ii) ((ii)*GTF_extrLoci_size+5)

#define GTF_exgeLoci_size 5
#define GTF_exgeExStart(ii) ((ii)*GTF_exgeLoci_size+0)
#define GTF_exgeExEnd(ii) ((ii)*GTF_exgeLoci_size+1)
#define GTF_exgeExStrand(ii) ((ii)*GTF_exgeLoci_size+2)
#define GTF_exgeGeID(ii) ((ii)*GTF_exgeLoci_size+3)
#define GTF_exgeTrID(ii) ((ii)*GTF_exgeLoci_size+4)

uint64 GTF::transcriptGeneSJ(const string &dirOut) 
{//sort exonLoci by transcript ID and exon coordinates
 //fills sjdbLoci from GTF junctions
    
    if (!gtfYes)
        return 0;
    
    exonN=exonLoci.size();
    qsort((void*) exonLoci.data(), exonN, sizeof(uint64)*exL, funCompareUint2);

    {//exon-gene data structures: exon start/end/strand/gene/transcript
        //re-sort exons by exons loci
        uint64* exgeLoci=new uint64 [exonN*GTF_exgeLoci_size]; //this also contains transcripts start and end

        for (uint64 iex=0; iex<exonN; iex++) {
            exgeLoci[GTF_exgeExStart(iex)]=exonLoci[iex][exS];
            exgeLoci[GTF_exgeExEnd(iex)]=exonLoci[iex][exE];
            exgeLoci[GTF_exgeExStrand(iex)]=transcriptStrand[exonLoci[iex][exT]];
            exgeLoci[GTF_exgeGeID(iex)]=exonLoci[iex][exG];
            exgeLoci[GTF_exgeTrID(iex)]=exonLoci[iex][exT];
        };

        qsort((void*) exgeLoci, exonN, sizeof(uint64)*GTF_exgeLoci_size, funCompareArrays<uint64,5>);

        ofstream & exgeOut = ofstrOpen(dirOut+"/exonGeTrInfo.tab",ERROR_OUT,P);
        exgeOut<<exonN<<"\n";
        for (uint64 iex=0; iex<exonN; iex++) {
             exgeOut<<exgeLoci[GTF_exgeExStart(iex)] <<"\t"<<  exgeLoci[GTF_exgeExEnd(iex)] <<"\t"<< exgeLoci[GTF_exgeExStrand(iex)] \
              <<"\t"<< exgeLoci[GTF_exgeGeID(iex)] <<"\t"<< exgeLoci[GTF_exgeTrID(iex)] <<"\n"; //the last value, transript-number, is worng here since tranascripts are re-sorted later
        };
        exgeOut.close();

        ofstream & geOut = ofstrOpen(dirOut+"/geneInfo.tab",ERROR_OUT,P);
        geOut << geneID.size() << "\n";
        for (uint64 ig=0; ig<geneID.size(); ig++) {//just geneID for now
            geOut << geneID[ig] <<"\t"<< geneAttr[ig][0] <<"\t"<< geneAttr[ig][1] <<"\n";
        };
        geOut.close();

    };

    {//exon-transcript data structures
        //re-sort transcripts by transcript start/end
        uint64* extrLoci=new uint64 [exonN*GTF_extrLoci_size]; //this also contains transcripts start and end

        uint64 trex1=0;
        for (uint64 iex=0; iex<=exonN; iex++) {
            if (iex==exonN || exonLoci[iex][exT] != exonLoci[trex1][exT]) {
                for (uint64 iex1=trex1; iex1<iex; iex1++) {//go back and fill the trend
                    extrLoci[GTF_extrTrEnd(iex1)]=exonLoci[iex-1][exE];
                };
                if (iex==exonN) break;
                trex1=iex;
            };
            extrLoci[GTF_extrTrStart(iex)]=exonLoci[trex1][exS];
            extrLoci[GTF_extrTrID(iex)]=exonLoci[iex][exT];
            extrLoci[GTF_extrExStart(iex)]=exonLoci[iex][exS];
            extrLoci[GTF_extrExEnd(iex)]=exonLoci[iex][exE];
            extrLoci[GTF_extrGeID(iex)]=exonLoci[iex][exG];
        };

        qsort((void*) extrLoci, exonN, sizeof(uint64)*GTF_extrLoci_size, funCompareArrays<uint64,5>);

        ofstream trOut ((dirOut+"/transcriptInfo.tab").c_str());
        trOut<<transcriptID.size() << "\n";
        ofstream exOut ((dirOut+"/exonInfo.tab").c_str());
        exOut<<exonN<<"\n";

        uint64 trid=extrLoci[GTF_extrTrID(0)];
        uint64 trex=0;
        uint64 trstart=extrLoci[GTF_extrTrStart(0)];
        uint64 trend=extrLoci[GTF_extrTrEnd(0)];
        uint64 exlen=0;
        for (uint64 iex=0;iex<=exonN; iex++) {
            if (iex==exonN || extrLoci[GTF_extrTrID(iex)] != trid) {//start of the new transcript
                //write out previous transcript
                trOut << transcriptID.at(trid) <<"\t"<< extrLoci[GTF_extrTrStart(iex-1)]<<"\t"<< extrLoci[GTF_extrTrEnd(iex-1)] \
                       <<"\t"<< trend << "\t"<< (uint64) transcriptStrand[trid]  <<"\t"<< iex-trex <<"\t"<<trex<<"\t"<<extrLoci[GTF_extrGeID(iex-1)]<<"\n";
                if (iex==exonN) break;
                trid=extrLoci[GTF_extrTrID(iex)];
                trstart=extrLoci[GTF_extrTrStart(iex)];
                trex=iex;
                trend=max(trend,extrLoci[GTF_extrTrEnd(iex-1)]);
                exlen=0;
            };
            exOut << extrLoci[GTF_extrExStart(iex)]-trstart <<"\t"<< extrLoci[GTF_extrExEnd(iex)]-trstart <<"\t"<< exlen <<"\n";
            exlen+=extrLoci[GTF_extrExEnd(iex)]-extrLoci[GTF_extrExStart(iex)]+1;
        };
        trOut.close();
        exOut.close();
    };

    //make junctions
    const uint64 sjStride=4;
    uint64* sjLoci = new uint64 [exonN*sjStride];
    uint64 trIDn=exonLoci[0][exT];
    uint64 sjN=0;
    for (uint64 iex=1; iex<exonN; iex++) {
        if (trIDn==exonLoci[iex][exT]) {
                uint64 chr1=genome.chrBin[exonLoci[iex][exS] >> genome.pGe.gChrBinNbits];
            if ( exonLoci[iex][exS]<=exonLoci[iex-1][exE]+1 ) {//touching - nothing to do
            } else if ( exonLoci[iex][exS]<=exonLoci[iex-1][exE] ) {//overlapping
                P.inOut->logMain << "WARNING: while processing pGe.sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<": overlapping exons:\n";
                P.inOut->logMain << genome.chrName[chr1] <<"\t"<< exonLoci[iex-1][exS]+1-genome.chrStart[chr1] << "\t"<< exonLoci[iex-1][exE]+1-genome.chrStart[chr1]  <<"\n";
                P.inOut->logMain << genome.chrName[chr1] <<"\t"<< exonLoci[iex][exS]  +1-genome.chrStart[chr1] << "\t"<< exonLoci[iex][exE]  +1-genome.chrStart[chr1]  <<"\n";
            } else {
                sjLoci[sjN*sjStride]=exonLoci[iex-1][exE]+1;
                sjLoci[sjN*sjStride+1]=exonLoci[iex][exS]-1;
                sjLoci[sjN*sjStride+2]=(uint64) transcriptStrand[trIDn];
                sjLoci[sjN*sjStride+3]=exonLoci[iex][exG]+1;//genes are numbered from 1
                sjN++;
            };
        } else {
            trIDn=exonLoci[iex][exT];
        };
    };

    qsort((void*) sjLoci, sjN, sizeof(uint64)*sjStride, funCompareUint2);

    char strandChar[3]={'.','+','-'};
    uint64 sjdbN1=sjdbLoci.chr.size();
    sjdbLoci.gene.resize(sjdbN1); //need to resize in case sjdbLoci was loaded from files without gene attribute. TODO make sure gene is always present
    for (uint64 ii=0;ii<sjN;ii++) {
        if ( ii==0 || (sjLoci[ii*sjStride]!=sjLoci[(ii-1)*sjStride])
                   || (sjLoci[ii*sjStride+1]!=sjLoci[(ii-1)*sjStride+1])
                   || (sjLoci[ii*sjStride+2]!=sjLoci[(ii-1)*sjStride+2]) ) {
            uint64 chr1=genome.chrBin[sjLoci[ii*sjStride] >> genome.pGe.gChrBinNbits];
            sjdbLoci.chr.push_back(genome.chrName[chr1]);
            sjdbLoci.start.push_back(sjLoci[ii*sjStride]+1-genome.chrStart[chr1]);
            sjdbLoci.end.push_back(sjLoci[ii*sjStride+1]+1-genome.chrStart[chr1]);
            sjdbLoci.str.push_back(strandChar[sjLoci[ii*sjStride+2]]);
            sjdbLoci.gene.push_back({sjLoci[ii*sjStride+3]});
        } else {
            sjdbLoci.gene.back().insert(sjLoci[ii*sjStride+3]);
        };
    };

    ofstream sjdbList ((dirOut+"/sjdbList.fromGTF.out.tab").c_str());
    for (uint64 ii=sjdbN1;ii<sjdbLoci.chr.size(); ii++) {
        sjdbList << sjdbLoci.chr.at(ii)<<"\t"<< sjdbLoci.start.at(ii) << "\t"<< sjdbLoci.end.at(ii)  <<"\t"<< sjdbLoci.str.at(ii);

        auto gg=sjdbLoci.gene[ii].cbegin();//iterator for genes
        sjdbList <<"\t"<< *gg;
        ++gg;
        for (; gg!=sjdbLoci.gene[ii].cend(); gg++)
            sjdbList <<","<< *gg;
        sjdbList<<"\n";
    };
    sjdbList.close();

    sjdbLoci.priority.resize(sjdbLoci.chr.size(),20);

    P.inOut->logMain << "Processing pGe.sjdbGTFfile=" << genome.pGe.sjdbGTFfile <<", found:\n";
    P.inOut->logMain << "\t\t"  << transcriptID.size() <<" transcripts\n" << "\t\t"  << exonN << " exons (non-collapsed)\n" << "\t\t"  << sjdbLoci.chr.size()-sjdbN1 << " collapsed junctions\n";
    P.inOut->logMain << "Total junctions: " <<sjdbLoci.chr.size()<<"\n";
    time_t rawTime;
    time(&rawTime);
    P.inOut->logMain     << timeMonthDayTime(rawTime) <<" ..... finished GTF processing\n\n" <<flush;

    return sjdbLoci.chr.size()-sjdbN1;
};