1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
/**********************************************************************
* file: geneMSA.hh
* licence: Artistic Licence, see file LICENCE.TXT or
* http://www.opensource.org/licenses/artistic-license.php
* descr.: multiple sequence alignment of genomes for comparative gene prediction
* authors: Mario Stanke, Alexander Gebauer
*
*********************************************************************/
#ifndef _GENEMSA
#define _GENEMSA
// project includes
#include "alignment.hh"
#include "exoncand.hh"
#include "orthoexon.hh"
#include "randseqaccess.hh"
#include "speciesgraph.hh"
#include<unordered_map>
#include<boost/functional/hash.hpp>
//forward declarations
class OrthoGraph;
string printRFC(vector<int>);
/* store cumulative sums of log likelihoods for each omega (optional: cumulative sum of number of substitutions)
* we create an object of cumValues for each bit_vector and reading frame combination
*/
struct cumValues{
vector<double> logliks;
int numSubs;
int id;
cumValues(int i, int s=-1):numSubs(s), id(i){};
void addLogliks(vector<double>* ll){
if(logliks.size() == 0){
logliks.resize(ll->size(),0.0);
if(logliks.size() == 0){
cerr<<"logliks still empty!"<<endl;
}
}
for(int u = 0; u < ll->size(); u++){
logliks[u] += (*ll)[u];
}
}
void addNumSubs(int subs){
numSubs += subs;
}
};
class GeneMSA {
public:
GeneMSA(RandSeqAccess *rsa, Alignment *a);
~GeneMSA();
/*
* get and set functions
*/
// start and end position (0-based, inclusive) of the gene range for the given species
// start <= end refer to the FORWARD strand of the sequence (different from class 'alignment')
// start = end = -1 if the species is absent from the alignment
string getSeqID(int speciesIdx);
Strand getStrand(int speciesIdx);
int getStart(int speciesIdx){ return starts[speciesIdx]; }
int getEnd(int speciesIdx){ return ends[speciesIdx]; }
list<ExonCandidate*>* getExonCands(int speciesIdx){ return exoncands.at(speciesIdx); }
//list<OrthoExon> getOrthoExons() { return orthoExonsList; }
Alignment * getAlignment() {return alignment;}
vector<int> getOffsets() {return offsets;}
map<string,ExonCandidate*>* getECHash(list<ExonCandidate*> *ec); // adds the keys to the map function
/*
* assmotifqthresh, assqthresh, dssqthresh are between 0 and 1 and thresholds for the inclusion of
* acceptor/donor splice sites based on the pattern probability
* assqthresh=0.05 means that only acceptor ss are considered
* that have a pattern, such that 5% of true splice site patterns have lower probability.
* The default threshold of 0 means that all splice site patterns are considered.
*/
void createExonCands(int s, const char *dna, map<int_fast64_t, ExonCandidate*> &ecs, map<int_fast64_t, ExonCandidate*> &addECs);
void setExonCands(vector<map<int_fast64_t, ExonCandidate*> > &ecs);
/**
* find all ortholog exon candidates, that are present in at least max(2, consThresh * m)
* where m <= numSpecies is the number of species that are present in the alignment
* Only report OrthoExons oe with at least 'minAvLen' as average length of the exon candidates in oe.
* ortholog exon candidates:
* - both splice sites align exactly
* - the exon candidate types agrees (single, rsingle, internal0, ...)
* - the phases at both boundaries agree (i.e. exon candidate types and length modulo 3)
* EC coordinates are region-based, as they are used in the OrthoGraph
*/
void createOrthoExons(list<OrthoExon> &orthoExonsList, map<int_fast64_t, list<pair<int,ExonCandidate*> > > &alignedECs, Evo *evo, float consThres = 0.0, int minAvLen = 0);
void printStats(); // to stdout
void printGeneRanges();
void printExonCands();
void printOrthoExons(list<OrthoExon> &orthoExonsList);
void computeOmegas(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree);
void computeOmegasEff(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
vector<string> pruneToBV(vector<string> *cs, bit_vector bv); // prune codon strings to bit_vector
vector<int> pruneToBV(vector<int> *rfc, bit_vector bv); // prune RFC to bit_vector
double omegaForCodonTuple(vector<double> *loglik);
void printOmegaForCodon(string outdir);
void printCumOmega();
void comparativeSignalScoring(list<OrthoExon> &orthoExonsList);
// Charlotte Janas playground
LocusTree *constructTree(); // creates, stores are returns the locus tree for the sequence tuple
// calculate a columnwise conservation score and output it (for each species) in wiggle format
void calcConsScore(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, string outdir);
double calcColumnScore(int a, int c, int t, int g); // input: number of a,c,t and g's in one alignment column
void consToWig(vector<double> &consScore, string outdir);
// static functions
static void setTree(PhyloTree *t){tree = t;}
static void setCodonEvo(CodonEvo *c){ codonevo = c; }
static int numSpecies(){ return tree->numSpecies(); }
static void openOutputFiles(string outdir);
static void closeOutputFiles();
// static data members
static int padding; // add this many bases to the region before and after the aligned region
static int orthoExonID; // ID for exons of different species which are orthologous to each other
static int geneRangeID; // stores an ID for the possible gene ranges of the different species which belong together
static vector<int> exonCandID; // IDs for exon candidates of different species
static unordered_map< bit_vector, PhyloTree*, boost::hash<bit_vector>> topologies;
// pointers to the output files
static vector< ofstream* > exonCands_outfiles, orthoExons_outfiles, geneRanges_outfiles_bed, geneRanges_outfiles_gff, omega_outfiles;
// map that stored all codon combinations on which fitch and pruning algorithm already have been run (for calculation of omega and number of substitutions)
static map<vector<string>, pair<vector<double>, int> > computedCumValues;
void printSingleOrthoExon(OrthoExon &oe, bool files = true);
void collect_features(int species, list<OrthoExon> *hects, SpeciesGraph *speciesgraph);
private:
vector<string> getCodonAlignment(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges, const vector<vector<fragment>::const_iterator > &froms, map<unsigned, vector<int> > *alignedCodons = NULL, bool generateString=true, vector<vector<int> > *posStoredCodons = NULL, ofstream *codonAli = NULL);
void cutIncompleteCodons(OrthoExon &oe);
cumValues* findCumValues(bit_vector bv, vector<int> rfc);
static PhyloTree *tree;
LocusTree *ltree;
static CodonEvo *codonevo;
vector<int> starts, ends; // gene ranges for each species
vector<int> offsets; // this many bases are upstream from the region
RandSeqAccess *rsa;
Alignment* alignment; // alignment of regions which possibly belong to a gene
vector< list<ExonCandidate*>* > exoncands; // exon candidates found in the different species in a gene segment
//list<OrthoExon> orthoExonsList; // Steffi: due to multiple copying, I remove this as attribute of any class. Instead it can be passed from compgenepred.cc by reference, whenever necessary.
unordered_map<bit_vector, vector<pair<vector<int>, cumValues> >, boost::hash<bit_vector> > cumOmega; // stores cumulative omega values for every reading frame combination and every bitvector that exist
map<bit_vector, map<vector<int>, vector<double> > > codonOmega;
};
#endif // _GENEMSA
|