1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
/***************************************************************************\
* Filename : evaluation.hh
* Author : Mario Stanke
* Email : stanke@math.uni-goettingen.de
*
* Copyright: (C) 2002 by Mario Stanke
*
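* Description: Declares the class Evaluation, which holds the counters and
*              accuracy measures (nucleotide, exon, gene and UTR level) used
*              to compare predicted genes with annotated genes, and the
*              function predictAndEvaluate.
*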
* Date | Author | Changes
*------------+---------------+---------------------------------
* 19.06.2002 | Mario Stanke | Creation of the file
* 11.04.2007 | Mario Stanke | Evaluation of the pred. of the transcription termination site (tts)
\**************************************************************************/
#ifndef _EVALUATION_HH
#define _EVALUATION_HH
// project includes
#include "types.hh"
#include "gene.hh"
#include "extrinsicinfo.hh"
// standard C/C++ includes
#include <list>
#include <vector>
// Largest distance that gets its own counter in the tssDist and ttsDist
// histograms of class Evaluation: both arrays are allocated with MAXUTRDIST+1
// entries, i.e. valid indices are 0..MAXUTRDIST.
// NOTE(review): the distance is presumably measured in base pairs - confirm.
#define MAXUTRDIST 5000
class Evaluation {
public:
Evaluation(){
nukTP = nukFP = nukFN = nukFPinside = 0;
nucUTP = nucUFP = nucUFN = nucUFPinside = 0;
exonTP = exonFP_partial = exonFP_overlapping = exonFP_wrong = 0;
exonFN_partial = exonFN_overlapping = exonFN_wrong = 0;
UTRexonTP = UTRexonFP = UTRexonFN = 0;
UTRoffThresh = 20;
geneTP = geneFN = 0;
numPredExons = numAnnoExons = 0;
numPredUTRExons = numAnnoUTRExons = 0;
numUniquePredExons = numUniqueAnnoExons = 0;
numUniquePredUTRExons = numUniqueAnnoUTRExons = 0;
numPredGenes = numAnnoGenes = 0;
numDataSets = 0;
longestPredIntronLen = 0;
tssDist = new int[MAXUTRDIST+1];
for (int i=0; i<= MAXUTRDIST; i++)
tssDist[i] = 0;
numTotalPredTSS = numTSS = 0;
ttsDist = new int[MAXUTRDIST+1];
for (int i=0; i<= MAXUTRDIST; i++)
ttsDist[i] = 0;
numTotalPredTTS = numTTS = 0;
leftFlankEnd = rightFlankBegin = -1;
}
~Evaluation(){
if (tssDist)
delete [] tssDist;
if (ttsDist)
delete [] ttsDist;
};
void addToEvaluation(Transcript* prediction, Transcript *database, Strand strand, Double quotient = -1.0);
void addToEvaluation(Transcript* predictedGeneList, Transcript* annotatedGeneList);
void finishEvaluation();
void print();
void printQuotients();
private:
/*
* Quick evaluation is fast but requires that both gene lists
*
*/
void evaluateQuickOnNucleotideLevel(State* const predictedExon, int curPredBegin,
State* const annotatedExon, int curAnnoBegin);
void evaluateQuickOnExonLevel(State* predictedExon, State* annotatedExon);
void evaluateQuickOnGeneLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList);
void evaluateOnNucleotideLevel(list<State> *predictedExon, list<State> *annotatedExon, bool UTR=false);
void evaluateOnExonLevel(list<State> *predictedExon, list<State> *annotatedExon, bool UTR=false);
void evaluateOnGeneLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList);
void evaluateOnUTRLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList);
public:
// nucleotide level
int nukTP, nukFP, nukFN,
nukFPinside; // false positive coding base inside gene area (as opposed to in flanking regions)
int nucUTP, nucUFP, nucUFN, // UTR bases
nucUFPinside; // false positive noncoding base inside gene area (as opposed to in flanking regions)
double nukSens, nukSpec; // coding base sensitivity and specifity
double nucUSens, nucUSpec; // non-coding base sensitivity and specifity
double exonSens, exonSpec; // coding exon sensitivity and specificity
double UTRexonSens, UTRexonSpec; // exon sensitivity and specificity
double geneSens, geneSpec; // gene sensitivity and specifity
private:
//TP = true positive, FP = false positive, FN = false negative
int leftFlankEnd, rightFlankBegin;
list<Double> quotients;
int longestPredIntronLen;
// exon level
int numPredExons, numAnnoExons;
int numUniquePredExons, numUniqueAnnoExons;
int exonTP,
exonFP,
exonFP_partial, // predicted exon unequal to but included in an annotated exon
exonFP_overlapping,
exonFP_wrong,
exonFN,
exonFN_partial, // annotated exon unequal to but included in a predicted exon
exonFN_overlapping,
exonFN_wrong;
// gene level
int geneTP, geneFP, geneFN;
int numPredGenes, numAnnoGenes;
// UTR level
int *tssDist; // array that holds for each distance the number of predicted TSS that is off by this distance
int numTSS; // number of gene pairs (anno, pred) with identical translation start and where both have an annotated TSS
int numTotalPredTSS;
double meanTssDist;
int medianTssDist;
int *ttsDist; // array that holds for each distance the number of predicted TTS that is off by this distance
int numTTS; // number of gene pairs (anno, pred) with identical stop codon and where both have an annotated TTS
int numTotalPredTTS;
double meanTtsDist;
int medianTtsDist;
int numPredUTRExons, numAnnoUTRExons;
int numUniquePredUTRExons, numUniqueAnnoUTRExons;
int UTRexonTP, UTRexonFP, UTRexonFN;
int UTRoffThresh; // count UTR exon as correct, if one end is exact and the other end at most this many bp off
/*
* data members for the "Burge-Karlin"-Method computing first the
* specifity and sensitivity for each sequence and then taking their
* means afterwards
*/
int numDataSets;
// nukleotide level
int nukTPBK, nukFPBK, nukFPBKinside, nukFNBK;
// exon level
int exonTPBK, exonFPBK, exonFNBK;
};
/*
 * predictAndEvaluate
 *
 * Predict genes on the given set of annotated sequences given the current parameters.
 * Then evaluate the accuracy against the given annotation.
 *
 * trainGeneList:     the annotated sequences. The vector itself is passed by
 *                    value, so the pointers are copied but not the
 *                    AnnoSequence objects they point to.
 * extrinsicFeatures: passed by non-const reference.
 *                    NOTE(review): presumably the extrinsic evidence used for
 *                    the prediction; whether it is modified is not visible
 *                    in this header - confirm against the definition.
 * returns:           pointer to an Evaluation.
 *                    NOTE(review): ownership is not visible in this header;
 *                    presumably the object is heap-allocated and has to be
 *                    deleted by the caller - confirm against the definition.
 */
Evaluation* predictAndEvaluate(vector<AnnoSequence*> trainGeneList, FeatureCollection &extrinsicFeatures);
#endif // _EVALUATION_HH
|