File: SuperTranscriptome.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (92 lines) | stat: -rwxr-xr-x 3,706 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#include "SuperTranscriptome.h"
#include "streamFuns.h"

void SuperTranscriptome::sjCollapse()
{//sort junctions by superTranscript, and then by the end coordinate, and then by the start coordinate
    sort(sj.begin(), sj.end(),
     [](const sjInfo &sj1, const sjInfo &sj2) {
         return ( sj1.super  < sj2.super) || 
                ( sj1.super == sj2.super && sj1.start < sj2.start ) ||
                ( sj1.super == sj2.super && sj1.start == sj2.start && sj1.end < sj2.end ) ;
     });

    //collapse junctions in each superTr, those junctions that belong to different transcript - remove the transcript info
    vector<vector<array<uint32,2>>> sjCollapsed;
    sjCollapsed.resize(seq.size());
    for(uint32 i = 0; i < sj.size(); i++) {
        if (i==0 || sj[i].start!=sj[i-1].start || sj[i].end!=sj[i-1].end || sj[i].super!=sj[i-1].super) //new junction
            sjCollapsed[ sj[i].super ].push_back({sj[i].start, sj[i].end});
    };
    
    ofstream & superTrSJstream = ofstrOpen(P.pGe.gDir+"/superTranscriptSJcollapsed.tsv", ERROR_OUT, P);
    for(uint64 i = 0; i < sjCollapsed.size(); i++) {
        for(auto &sj1 : sjCollapsed[i])
            superTrSJstream << i <<"\t"<< sj1[0] <<"\t"<< sj1[1] << "\n";
    };
    superTrSJstream.close();

    P.inOut->logMain << "Number of splice junctions in superTranscripts = " << sj.size() <<endl;
    P.inOut->logMain << "Number of collapsed splice junctions in superTranscripts = " << sjCollapsed.size() <<endl;
};

void SuperTranscriptome::load(char *G, vector<uint64> &chrStart, vector<uint64> &chrLength)
{//load superTranscript seqs and
    N=chrLength.size();
    superTrs.resize(N);
    for (uint32 ii=0; ii<N; ii++) {
        superTrs[ii].length=chrLength[ii];
        superTrs[ii].seqP=(uint8*)G+chrStart[ii];
    };
    
    ifstream & superTrSJstream = ifstrOpen(P.pGe.gDir+"/superTranscriptSJcollapsed.tsv", ERROR_OUT, "SOLUTION: re-generate the genome.", P);
    
    uint32 sutr=0,sutr1=0;
    vector<array<uint32,3>> sjC1;
    sjNmax=0;
    sjDonorNmax=0;
    vector<uint32> sjDonor1;
    
    while(true) {//load junctions, assume they are sorted by donor coordinate
        
        superTrSJstream >> sutr;
        bool inGood = superTrSJstream.good();
        
        if (sutr!=sutr1 || !inGood) {//new suTr
            //sort sj1 by acceptor position
            sort(sjC1.begin(), sjC1.end(),
                [](const array<uint32,3> &sj1, const array<uint32,3> &sj2) {
                 return ( sj1[1] < sj2[1] ) ||
                        ( sj1[1] == sj2[1]  && sj1[0] < sj2[0] );
                });

            superTrs[sutr1].sjC=sjC1;
            superTrs[sutr1].sjDonor=sjDonor1;
            
            if (sjNmax < sjC1.size())
                sjNmax=sjC1.size();
            if (sjDonorNmax < sjDonor1.size()) {
                sjDonorNmax=sjDonor1.size();
            };
            
            sjC1.clear();
            sjDonor1.clear();
            sutr1=sutr;
            
            if (!inGood) {
                break; //end of file
            };
        };
        
        uint32 sjd, sja;
        superTrSJstream >> sjd >> sja;
        
        if (sjDonor1.empty() || sjDonor1.back() < sjd) 
            sjDonor1.push_back(sjd);
        
        sjC1.push_back({sjd,sja,(uint32)sjDonor1.size()-1});//record donor, acceptor, and position of the donor in sjDonor
    };
    superTrSJstream.close();
    
    P.inOut->logMain << "Max number of splice junctions in a superTranscript = " << sjNmax <<endl;
    P.inOut->logMain << "Max number of donor sites in a superTranscript = " << sjDonorNmax <<endl;
};