1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761
|
#ifndef __RLINK_H__
#define __RLINK_H__
#include "GArgs.h"
#include "GStr.h"
#include "gff.h"
#include "GSam.h"
#include "GBitVec.h"
#include "time.h"
#include "tablemaker.h"
#include "GHashMap.hh"
//#include "cds.h"
#define MAX_NODE 1000000
#define KMER 31
#define DROP 0.5
#define ERROR_PERC 0.1
#define DBL_ERROR 0.01
#define CHI_WIN 100
#define CHI_THR 50
#define SMALL_EXON 35 // exons smaller than this have a tendency to be missed by long read data
#define IS_FPKM_FLAG 1
#define IS_TPM_FLAG 2
#define IS_COV_FLAG 4
const double epsilon=0.000001; //-E
const float trthr=1.0; // transfrag pattern threshold
const float MIN_VAL=-100000.0;
const int MAX_MAXCOMP=200; // is 200 too much, or should I set it up to 150?
//const uint largeintron=20000; // don't trust introns longer than this unless there is higher evidence; less than 10% of all human annotated introns are longer than this
//const uint longintron=70000; // don't trust introns longer than this unless there is higher evidence; about 98% of all human annotated introns are shorter than this
const uint longintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this
const uint longintronanchor=25; // I need a higher anchor for long introns -> use a smaller value here? i.e. 20?
//const uint verylongintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this
//const uint verylongintronanchor=35; // I need a higher anchor for very long introns
const float mismatchfrac=0.02;
const float lowcov=1.5;
const float lowisofrac=0.02;
const int max_trf_number=40000; // maximum number of transfrag accepted so that the memory doesn't blow up
extern bool mergeMode;
extern bool forceBAM; //for stdin alignment data
extern bool verbose;
extern bool debugMode;
//collect all refguide transcripts for a single genomic sequence
struct GRefData {
GList<GffObj> rnas; //all transcripts on this genomic seq
int gseq_id;
const char* gseq_name;
GRefData(int gid=-1):rnas(false,true,false),gseq_id(gid),gseq_name(NULL) {
gseq_id=gid;
if (gseq_id>=0)
gseq_name=GffObj::names->gseqs.getName(gseq_id);
}
void add(GffReader* gffr, GffObj* t) {
if (gseq_id>=0) {
if (gseq_id!=t->gseq_id)
GError("Error: invalid call to GRefData::add() - different genomic sequence!\n");
}
else { //adding first transcript, initialize storage
gseq_id=t->gseq_id;
gseq_name=t->getGSeqName();
if (gffr->gseqtable[gseq_id]==NULL)
GError("Error: invalid genomic sequence data (%s)!\n",gseq_name);
rnas.setCapacity(gffr->gseqtable[gseq_id]->fcount);
}
rnas.Add(t);
t->isUsed(true);
//setLocus(t); //use the GRefLocus::mexons to quickly find an overlap with existing loci, or create a new one
}
bool operator==(GRefData& d){
return gseq_id==d.gseq_id;
}
bool operator<(GRefData& d){
return (gseq_id<d.gseq_id);
}
};
struct CBundlenode:public GSeg {
float cov;
int bid; // bundle node id in bnode -> to easy retrieve it
CBundlenode *nextnode; // next node in the same bundle
CBundlenode(int rstart=0, int rend=0, float _cov=0, int _bid=-1, CBundlenode *_nextnode=NULL):GSeg(rstart, rend),
cov(_cov),bid(_bid),nextnode(_nextnode) {}
};
enum GPFType {
GPFT_NONE=0,
GPFT_TSS,
GPFT_CPAS
}; //on 4 bits: maximum 15 types
struct GPtFeature { //point feature (single coordinate)
GPFType ftype : 4;
int ref_id: 26; //index in a reftable[] with reference names, max 67,108,863
int strand: 2; //-1=-, 0=unstranded, +1=+
uint coord; //genomic coordinate for this feature
GPtFeature(GPFType ft=GPFT_NONE, int rid=-1, int _strand=0, uint loc=0):ftype(ft),
ref_id(rid), strand(_strand), coord(loc) {}
bool operator<(const GPtFeature &o) { return coord<o.coord; }
bool operator==(const GPtFeature &o) { return coord==o.coord; }
//-- should really match ftype and strand too,
// but we don't care, for bundle inclusion
};
struct GRefPtData {
int ref_id; //same with GPtFeature::ref_id, also in GffObj::names->gseqs
GList<GPtFeature> pfs; //all point feature on this genomic seq, sorted
GRefPtData(int gid=-1):ref_id(gid), pfs(true,false,false) { }
void add(GPtFeature* t) { //adds a fully formed GPtFeature record
if (ref_id!=t->ref_id || ref_id<0 || t->ref_id<0)
GError("Error: invalid call to GRefPtData::add() - cannot add feature with ref id %d to ref data id %d!\n",
t->ref_id, ref_id);
pfs.Add(t);
}
//sorting by unique ref_id
bool operator==(GRefPtData& d){
return ref_id==d.ref_id;
}
bool operator<(GRefPtData& d){
return (ref_id<d.ref_id);
}
};
// bundle data structure, holds all data needed for
// infering transcripts from a bundle
enum BundleStatus {
BUNDLE_STATUS_CLEAR=0, //available for loading/prepping
BUNDLE_STATUS_LOADING, //being prepared by the main thread (there can be only one)
BUNDLE_STATUS_READY //ready to be processed, or being processed
};
struct CBundle {
int len;
float cov;
float multi;
int startnode; // id of start node in bundle of same strand
int lastnodeid; // id of last node added to bundle
CBundle(int _len=0, float _cov=0, float _multi=0, int _start=-1, int _last=-1):
len(_len),cov(_cov),multi(_multi), startnode(_start),lastnodeid(_last) {}
};
struct CPath {
int node;
int contnode;
float abundance;
CPath(int n1=0,int n2=0,float abund=0):node(n1),contnode(n2),abundance(abund){}
};
struct CTransfrag {
GVec<int> nodes;
GBitVec pattern;
float abundance;
float srabund; // keeps abundance associated to srfrag
GVec<CPath> path; // stores all the possible paths that leave from a node to reach next node in a transfrag, and distributes the abundance of the transfrag between all possible continuations
float usepath;
int weak; // number of weak links
bool real:1;
bool longread:1; // there is at least a longread supporting transfrag
bool shortread:1; // there is at least one short read supporting transfrag
int guide;
uint longstart; // for long reads: min start of all longreads sharing transfrag
uint longend; // for long reads: max end of all longreads sharing transfrag
CTransfrag(GVec<int>& _nodes,GBitVec& bit, float abund=0, bool treal=false, int tguide=0,float sr=0):nodes(_nodes),pattern(bit),abundance(abund),srabund(sr),path(),usepath(-1),weak(-1),real(treal),longread(false),shortread(false),guide(tguide),longstart(false),longend(false) {}
CTransfrag(float abund=0, bool treal=false,int tguide=0):nodes(),pattern(),abundance(abund),srabund(0),path(),usepath(-1),weak(-1),real(treal),longread(false),shortread(false),guide(tguide),longstart(false),longend(false) {
}
};
struct CMTransfrag { // this is the super-class for transfrag -> to use in case of merging transcripts
CTransfrag *transfrag;
GVec<int> read; // all reads' indeces that are connected to this transfrag
int nf;
int nl;
uint len;
CMTransfrag(CTransfrag *t=NULL):transfrag(t),read(),nf(0),nl(0),len(0) {}
};
struct CGuide {
CTransfrag *trf;
//GffObj* t; // this is what I was using before but I need to store the guide index in guides instead
//CGuide(CTransfrag* _trf=NULL, GffObj* _t=NULL):trf(_trf),t(_t) {}
int g; // stores guide index in guides instead of the actual pointer
CGuide(CTransfrag* _trf=NULL, int _g=-1):trf(_trf),g(_g) {}
};
struct CPartGuide {
int idx;
int olen; // overlap with node
int allolen; // overlap with all nodes
int glen; // guide length
bool terminal_in;
bool terminal_out;
float gcount;
float cov; // assigned overall coverage so far
float ncov; // new coverage in node
CPartGuide(int _idx=0, int _olen=0, int _aolen=0, int _glen=0):idx(_idx),olen(_olen),allolen(_aolen),glen(_glen),
terminal_in(false),terminal_out(false),gcount(0),cov(0),ncov(0) {}
};
struct CTrGuidePat { // remember abundances based on node guide pattern
GBitVec pat;
float abund;
bool terminal;
GVec<int> g;
CTrGuidePat():pat(),abund(0),terminal(false),g() {} // default constructor
CTrGuidePat(GBitVec p, float a, bool _t): pat(p),abund(a),terminal(_t),g() {}
};
struct CNodeGuide {
GVec<CPartGuide> guide;
GVec<CTrGuidePat> trcount;
float sumtrcount; // sum of all used in guides transfrag abundances -> this is needed in order to see if a guide would explain all of them
CNodeGuide():guide(),trcount(),sumtrcount(0) {}
};
struct CGroup:public GSeg {
int color;
int grid;
float cov_sum;
float multi;
float neg_prop; // proportion of negative reads assigned to group out of all positives and negatives
CGroup *next_gr;
CGroup(int rstart=0, int rend=0, int _color=-1, int _grid=0, float _cov_sum=0,
float _multi=0,float _neg_prop=0, CGroup *_next_gr=NULL): GSeg(rstart, rend), color(_color), grid(_grid),cov_sum(_cov_sum),
multi(_multi), neg_prop(_neg_prop),next_gr(_next_gr) { }
};
struct CMerge {
GStr name;
GVec<int> fidx; // file indices for the transcripts in the merge
CMerge(const char* rname=NULL):name(rname),fidx() {}
};
struct CExon{
int predno;
int exonno;
float exoncov;
CExon(int p=0,int e=0,float c=0):predno(p),exonno(e),exoncov(c) {}
};
/*struct TwoFloat{
float start;
float end;
TwoFloat(float v1=0,float v2=0):start(v1),end(v2) {}
};*/
struct CPred{
int predno;
float cov;
CPred(int p=0,float c=0):predno(p),cov(c) {}
};
struct CLongTrf{
int t;
float cov;
GVec<int> group;
CLongTrf(int tno=0,float c=0):t(tno),cov(c),group() {}
};
struct CPrediction:public GSeg {
int geneno;
GffObj* t_eq; //equivalent reference transcript (guide)
//char *id;
float cov;
float longcov;
char strand;
//float frag; // counted number of fragments associated with prediction
int tlen;
bool flag;
GVec<GSeg> exons;
GVec<float> exoncov;
GStr mergename;
CPrediction(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.',
int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),t_eq(guide),cov(_cov),longcov(0),strand(_strand),
//CPrediction(int _geneno=0, char* _id=NULL,int gstart=0, int gend=0, float _cov=0, char _strand='.', float _frag=0,
// int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),id(_id),cov(_cov),strand(_strand),frag(_frag),
tlen(_len),flag(f),exons(),exoncov(),mergename() {}
void init(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.',
int _len=0) {
geneno=_geneno;
t_eq=guide;
start=gstart;
end=gend;
cov=_cov;
strand=_strand;
tlen=_len;
flag=true;
exons.Clear();
exoncov.Clear();
mergename.clear();
}
CPrediction(CPrediction& c):GSeg(c.start, c.end), geneno(c.geneno),
// id(Gstrdup(c.id)), cov(c.cov), strand(c.strand), frag(c.frag), tlen(c.tlen), flag(c.flag),
t_eq(c.t_eq), cov(c.cov), longcov(c.longcov),strand(c.strand), tlen(c.tlen), flag(c.flag),
exons(c.exons), exoncov(c.exoncov), mergename(c.mergename) {}
~CPrediction() { //GFREE(id);
}
};
struct CMPrediction {
CPrediction *p;
GVec<int> nodes;
GBitVec pat; // pattern of nodes and introns in prediction
GBitVec b; // not retained introns
CMPrediction(CPrediction* _p=NULL): p(_p),nodes(),pat(),b() {}
CMPrediction(CPrediction* _p,GVec<int>& _nodes,GBitVec& _pat, GBitVec& _b): p(_p),nodes(_nodes),pat(_pat),b(_b) {}
};
struct CNodeCapacity {
int id;
bool left;
float perc;
CNodeCapacity(int nid=0,bool leftnode=false,float p=0): id(nid),left(leftnode),perc(p) {}
};
// this class keeps the gene predictions (linked bundle nodes initially)
struct CGene:public GSeg { // I don't necessarily need to make this a GSeg since I can get the start&end from the exons
char strand;
char* geneID;
char* geneName;
float cov; // this is the actual gene coverage
float covsum; // this is a sum of transcripts coverages -> this is what we need for FPKM and TPM estimations
GVec<GSeg> exons; // all possible exons in gene (those are bnodes in bundle)
CGene(int gstart=0, int gend=0, char _strand='.',char *gid=NULL, char *gname=NULL):GSeg(gstart,gend),
strand(_strand), geneID(gid), geneName(gname), exons() { cov=0; covsum=0;}
// getGeneID() and getGeneName() functions of gffobj return pointers to this attributes in gffobj so I don't need to clean them up here
};
//holding transcript info for --merge mode
struct TAlnInfo {
GStr name; //transcript name
int fileidx; //index of transcript file in the TInputFiles.files array
double cov;
double fpkm;
double tpm;
int g;
TAlnInfo(const char* rname=NULL, int fidx=0):name(rname), fileidx(fidx),
cov(-1),fpkm(-1),tpm(-1),g(-1) { }
};
struct CJunction;
struct CReadAln:public GSeg {
//DEBUG ONLY:
// GStr name;
char strand; // 1, 0 (unkown), -1 (reverse)
short int nh;
uint len;
float read_count; // keeps count for all reads (including paired and unpaired)
bool unitig:1; // set if read come from an unitig
bool longread:1; // set if read comes from long read data
GVec<float> pair_count; // keeps count for all paired reads
GVec<int> pair_idx; // keeps indeces for all pairs in assembly mode, or all reads that were collapsed in merge mode
GVec<GSeg> segs; //"exons"
GPVec<CJunction> juncs;
union {
TAlnInfo* tinfo;
bool in_guide;
};
CReadAln(char _strand=0, short int _nh=0,
int rstart=0, int rend=0, TAlnInfo* tif=NULL): GSeg(rstart, rend), //name(rname),
strand(_strand),nh(_nh), len(0), read_count(0), unitig(false),longread(false),pair_count(),pair_idx(),
segs(), juncs(false), tinfo(tif) { }
CReadAln(CReadAln &rd):GSeg(rd.start,rd.end) { // copy contructor
strand=rd.strand;
nh=rd.nh;
len=rd.len;
read_count=rd.read_count;
unitig=rd.unitig;
longread=rd.longread;
pair_count=rd.pair_count;
pair_idx=rd.pair_idx;
juncs=rd.juncs;
tinfo=rd.tinfo;
}
int overlapSegLen(CReadAln* r) {
if (r->start>end || start>r->end) return 0;
int i=0;
int j=0;
int len=0;
while(i<segs.Count()) {
if(segs[i].end<r->segs[j].start) i++;
else if(r->segs[j].end<segs[i].start) j++;
else { // there is overlap
len+=segs[i].overlapLen(r->segs[j].start,r->segs[j].end);
if(segs[i].end<r->segs[j].end) i++;
else j++;
}
if(j==r->segs.Count()) break;
}
return len;
}
~CReadAln() { if(mergeMode) {delete tinfo;} }
};
struct CGraphinfo {
int ngraph;
int nodeno;
CGraphinfo(int ng=-1,int nnode=-1):ngraph(ng),nodeno(nnode){}
};
struct CGJunc {
int leftnode;
int rightnode;
double cov; // ngood
double goodcov; // ngood_reads
CGJunc(int n1=0,int n2=0,double _cov=0,double _goodcov=0):leftnode(n1),rightnode(n2),cov(_cov),goodcov(_goodcov){}
};
struct CGNode {
int id; // initial id in graphno
bool last; // if this is last node (to be linked to sink later)
bool keep; // if I keep it in the final count (true by default)
bool merge; // if this node needs to be merged to its adjacent node
bool future;
CGNode(int _id=0,bool _last=false,bool _keep=true, bool _merge=false, bool _future=false):id(_id),last(_last),keep(_keep),merge(_merge),future(_future){}
};
struct CTreePat {
int nodeno;
int childno;
CTransfrag *tr;
CTreePat **nextpat;
CTreePat(int n=0,int cno=0):nodeno(n),childno(cno),tr(NULL),nextpat(NULL){
if(cno) {
GCALLOC(nextpat,cno*sizeof(CTreePat *));
for(int i=0;i<cno;i++) nextpat[i]=NULL;
}
}
void setchilds(int cno) {
if(cno && !nextpat) {
GCALLOC(nextpat,cno*sizeof(CTreePat *));
for(int i=0;i<cno;i++) nextpat[i]=NULL;
}
childno=cno;
}
void settree(int i, CTreePat *t) {
if(i<childno) nextpat[i]=t;
}
CTreePat *settree(int nextpos,int n,int cno) {
if(nextpos<childno) {
if(!nextpat[nextpos]) nextpat[nextpos]=new CTreePat(n,cno);
return(nextpat[nextpos]);
}
return(NULL);
}
};
struct CTrimPoint { // this can work as a guide keeper too, where pos is the guideidx, abundance is the flow, and start is the included status
uint pos;
float abundance;
bool start:1;
CTrimPoint(uint _pos=0,float abund=0.0,bool _start=true):pos(_pos),abundance(abund),start(_start) {}
};
struct CInterval {
uint pos; // interval start position
float val; // interval value or interval last position depending on use
CInterval *next; // next interval;
CInterval(uint _pos=0,float _val=0,CInterval *_next=NULL):pos(_pos),val(_val),next(_next) {}
};
/*
struct CSegCov:public GSeg {
bool spliced:1;
GVec<GStr> rname;
CSegCov *next; // next interval;
CSegCov(uint start=0,uint end=0):GSeg(start,end),spliced(false),rname(),next(NULL) {}
};
*/
struct CMaxIntv:public GSeg {
GVec<CExon> node;
CMaxIntv *next; // next interval;
CMaxIntv(uint start=0,uint end=0):GSeg(start,end),node(),next(NULL) {}
CMaxIntv(GVec<CExon>& _node,uint start,uint end,CMaxIntv *_next=NULL):GSeg(start,end),node(_node),next(_next) {}
};
struct GInterval {
uint start;
uint end;
GInterval *next;
GInterval(uint _start, uint _end,GInterval *_next=NULL):start(_start),end(_end),next(_next) {}
};
struct CTrInfo {
int trno;
float abundance;
float penalty;
CTrInfo(int tr=-1,float _abund=0.0, float _pen=0.0):trno(tr),abundance(_abund),penalty(_pen) {}
};
struct CNetEdge {
int link;
float rate;
bool fake;
CNetEdge(int lnk=0.0,float r=0.0, bool f=false):link(lnk),rate(r),fake(f){}
};
struct CComponent {
float size;
GVec<int> *set;
CComponent(float _size=0.0,GVec<int> *_set=NULL):size(_size),set(_set) {}
~CComponent() { if(set) delete set;}
};
struct GEdge { // guide edge
// if val < endval then this is start; otherwise it is end
uint val; // value of the boundary
uint endval; // value of the other exon boundary shared with val
int strand;
bool operator<(const GEdge& o) const {
return(val<o.val || (val==o.val && strand<o.strand));
}
bool operator==(const GEdge& o) const {
return(val==o.val && strand==o.strand);
}
GEdge(uint _val=0,uint _endval=0,int _strand=0):val(_val),endval(_endval),strand(_strand) {}
};
struct CGraphnode:public GSeg {
int nodeid;
float cov;
float capacity; // sum of all transcripts abundances exiting and through node
float rate; // conversion rate between in and out transfrags of node
//float frag; // number of fragments included in node
GVec<int> child;
GVec<int> parent;
GBitVec childpat;
GBitVec parentpat;
GVec<int> trf; // transfrags that pass the node
bool hardstart:1; // verified/strong start
bool hardend:1; // verified/strong end
//CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0,float f=0):GSeg(s,e),nodeid(id),cov(nodecov),capacity(cap),rate(r),frag(f),child(),parent(),childpat(),parentpat(),trf(){}
CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0):GSeg(s,e),
nodeid(id),cov(nodecov),capacity(cap),rate(r),child(),parent(),childpat(),parentpat(),trf(),hardstart(false),hardend(false){}
};
// # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good;
struct CJunction:public GSeg {
char strand; //-1,0,1
char guide_match; //exact match of a ref intron?
char consleft; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus
char consright; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus
double nreads;
double nreads_good;
double leftsupport;
double rightsupport;
double nm; // number of reads with a high nm (high mismatch)
double mm; // number of reads that support a junction with both anchors bigger than longintronanchor
CJunction(int s=0,int e=0, char _strand=0):GSeg(s,e),
strand(_strand), guide_match(0), consleft(-1), consright(-1),nreads(0),nreads_good(0),
leftsupport(0),rightsupport(0),nm(0),mm(0) {}
bool operator<(CJunction& b) {
if (start<b.start) return true;
if (start>b.start) return false;
if (end<b.end) return true;
if (end>b.end) return false;
if (strand>b.strand) return true;
return false;
}
bool operator==(CJunction& b) {
return (start==b.start && end==b.end && strand==b.strand);
}
};
struct GReadAlnData {
GSamRecord* brec;
char strand; //-1, 0, 1
int nh;
int hi;
GPVec<CJunction> juncs;
union {
TAlnInfo* tinfo;
bool in_guide;
};
//GPVec< GVec<RC_ExonOvl> > g_exonovls; //>5bp overlaps with guide exons, for each read "exon"
GReadAlnData(GSamRecord* bamrec=NULL, char nstrand=0, int num_hits=0,
int hit_idx=0, TAlnInfo* tif=NULL):brec(bamrec), strand(nstrand),
nh(num_hits), hi(hit_idx), juncs(true), tinfo(tif) { } //, g_exonovls(true)
~GReadAlnData() { if(mergeMode) delete tinfo; }
};
/*
struct CTCov { //covered transcript info
int first_cov_exon;
int last_cov_exon;
int numt;
GffObj* guide;
bool whole;
CTCov(GffObj* t, int fex=-1, int lex=0, int ntr=0):first_cov_exon(fex), last_cov_exon(lex),
numt(ntr), guide(t), whole(false) {
whole = (first_cov_exon<0);
}
void print(FILE* f) {
if (whole) { //from get_covered()
guide->printTranscriptGff(f);
}
else { //from get_partial_covered()
bool partial=true;
if (last_cov_exon<0) {
if (guide->exons.Count()==1) partial=false;
last_cov_exon=first_cov_exon;
} else {
if(last_cov_exon-first_cov_exon+1==guide->exons.Count()) partial=false;
}
for(int i=first_cov_exon;i<=last_cov_exon;i++) {
if(partial) fprintf(f, "%s\tpartial\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s_part%d\";\n",guide->getGSeqName(),
guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID(), numt);
else fprintf(f, "%s\tcomplete\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s\";\n",guide->getGSeqName(),
guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID());
}
}
}
};
*/
// bundle data structure, holds all input data parsed from BAM file
// - r216 regression
struct BundleData {
BundleStatus status;
//int64_t bamStart; //start of bundle in BAM file
int idx; //index in the main bundles array
int start;
int end;
unsigned long numreads; // number of reads in this bundle
/*
float wnumreads; // NEW: weighted numreads; a multi-mapped read mapped in 2 places will contribute only 0.5
double sumreads; // sum of all reads' lengths in bundle
double sumfrag; // sum of all fragment lengths (this includes the insertion so it is an estimate)
float num_reads; // number of all reads in bundle that we considered (weighted)
float num_cov; // how many coverages we added (weighted) to obtain sumcov
float num_frag; // how many fragments we added to obtain sumfrag
double num_fragments3;
double sum_fragments3;
*/
double num_fragments; //aligned read/pairs
double frag_len;
double sum_cov; // sum of all transcripts coverages --> needed to compute TPMs
char covflags;
GStr refseq; //reference sequence name
char* gseq; //actual genomic sequence for the bundle
GList<CReadAln> readlist;
GVec<float> bpcov[3]; // this needs to be changed to a more inteligent way of storing the data
GList<CJunction> junction;
GPVec<GffObj> keepguides;
GPVec<GPtFeature> ptfs; //point features for this bundle
GList<CPrediction> pred;
RC_BundleData* rc_data;
BundleData():status(BUNDLE_STATUS_CLEAR), idx(0), start(0), end(0),
numreads(0),
num_fragments(0), frag_len(0),sum_cov(0),covflags(0),
refseq(), gseq(NULL), readlist(false,true), //bpcov(1024),
junction(true, true, true),
keepguides(false), ptfs(false), pred(false), rc_data(NULL) {
for(int i=0;i<3;i++) bpcov[i].setCapacity(4096);
}
void getReady(int currentstart, int currentend) {
//this is only called when the bundle is valid and ready to be processed
start=currentstart;
end=currentend;
//refseq=ref;
//tag all these guides
for (int i=0;i<this->keepguides.Count();++i) {
RC_TData* tdata=(RC_TData*)(keepguides[i]->uptr);
tdata->in_bundle=1;
}
status=BUNDLE_STATUS_READY;
}
void rc_init(GffObj* t, GPVec<RC_TData>* rc_tdata,
GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
if (rc_data==NULL) {
rc_data = new RC_BundleData(t->start, t->end,
rc_tdata, rc_edata, rc_idata);
}
}
/* after reference annotation was loaded
void rc_finalize_refs() {
if (rc_data==NULL) return;
//rc_data->setupCov();
}
Not needed here, we update the coverage span as each transcript is added
*/
void keepGuide(GffObj* scaff, GPVec<RC_TData>* rc_tdata=NULL,
GPVec<RC_Feature>* rc_edata=NULL, GPVec<RC_Feature>* rc_idata=NULL);
//bool evalReadAln(GSamRecord& brec, char& strand, int nh); //, int hi);
bool evalReadAln(GReadAlnData& alndata, char& strand);
void Clear() {
keepguides.Clear();
ptfs.Clear();
pred.Clear();
pred.setSorted(false);
readlist.Clear();
for(int i=0;i<3;i++) {
bpcov[i].Clear();
bpcov[i].setCapacity(1024);
}
junction.Clear();
start=0;
end=0;
status=BUNDLE_STATUS_CLEAR;
numreads=0;
num_fragments=0;
frag_len=0;
sum_cov=0;
covflags=0;
delete rc_data;
GFREE(gseq);
rc_data=NULL;
}
~BundleData() {
Clear();
}
};
void processRead(int currentstart, int currentend, BundleData& bdata,
GHash<int>& hashread, GReadAlnData& alndata);
//GSamRecord& brec, char strand, int nh, int hi);
void countFragment(BundleData& bdata, GSamRecord& brec, int hi,int nh);
int printResults(BundleData* bundleData, int geneno, GStr& refname);
int printMergeResults(BundleData* bundleData, int geneno, GStr& refname);
int infer_transcripts(BundleData* bundle);
// --- utility functions
void printGff3Header(FILE* f, GArgs& args);
void printTime(FILE* f);
#endif
|