1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
|
// =============================================================== //
// //
// File : st_quality.hxx //
// Purpose : //
// //
// Institute of Microbiology (Technical University Munich) //
// http://www.arb-home.de/ //
// //
// =============================================================== //
#ifndef ST_QUALITY_HXX
#define ST_QUALITY_HXX
#ifndef ST_WINDOW_HXX
#include "st_window.hxx"
#endif
#define st_assert(cond) arb_assert(cond)
class Sampler {
double sum; //!< sum of added values
size_t count; //!< number of values added
public:
Sampler() : sum(0.0) , count(0) {}
void add(double val) {
sum += val;
count++;
}
void add(const Sampler& other) {
sum += other.sum;
count += other.count;
}
size_t get_count() const { return count; }
double get_sum() const { return sum; }
double get_median() const { return sum / count; }
};
class VarianceSampler : public Sampler {
/*! stores values such that variance can quickly be calculated
* for algorithm see
* http://de.wikipedia.org/wiki/Standardabweichung#Berechnung_f.C3.BCr_auflaufende_Messwerte or
* http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
*/
double square_sum; //!< sum of squares of added values
public:
VarianceSampler() : square_sum(0.0) {}
void add(double val) {
Sampler::add(val);
square_sum += val*val;
}
void add(const VarianceSampler& other) {
Sampler::add(other);
square_sum += other.square_sum;
}
double get_variance(double median) const { return square_sum/get_count() - median*median; }
double get_variance() const { return get_variance(get_median()); }
};
class LikelihoodRanges : virtual Noncopyable {
/*! The alignment is split into multiple, similar-sized column ranges.
* For each range the likelihoods get summarized
*/
size_t ranges;
VarianceSampler *likelihood;
public:
LikelihoodRanges(size_t ranges);
~LikelihoodRanges() { delete [] likelihood; }
size_t get_range_count() const { return ranges; }
void add(size_t range, double probability) {
st_assert(range < ranges);
likelihood[range].add(probability);
}
void add_relative(double seq_rel_pos, double probability) {
st_assert(seq_rel_pos>=0.0 && seq_rel_pos<1.0);
add(seq_rel_pos*ranges, probability);
}
VarianceSampler summarize_all_ranges();
char *generate_string(Sampler& t_values);
// int is_bad(); // 0 == ok, 1 strange, 2 bad, 3 very bad
};
struct ColumnQualityInfo {
LikelihoodRanges stat_half; // two ranges (one for each half)
LikelihoodRanges stat_five; // five ranges
LikelihoodRanges stat_user; // user defined range size
LikelihoodRanges stat_cons; // two ranges (conserved + variable positions) -- @@@ unused
ColumnQualityInfo(int seq_len, int bucket_size);
size_t overall_range_count() {
return
stat_half.get_range_count() +
stat_five.get_range_count() +
stat_user.get_range_count() +
stat_cons.get_range_count();
}
};
class WeightedFilter;
class ColumnStat;
GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name,
const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size,
int marked_only, st_report_enum report, const char *dest_field);
#else
#error st_quality.hxx included twice
#endif // ST_QUALITY_HXX
|