1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
|
/***************************************************************************
* Copyright (C) 2009 by BUI Quang Minh *
* minh.bui@univie.ac.at *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#ifndef SUPERALIGNMENT_H
#define SUPERALIGNMENT_H
#include "alignment.h"
struct PartitionInfo {
double cur_score; // current log-likelihood
double part_rate; // partition heterogeneity rate
int evalNNIs; // number of evaluated NNIs on subtree
//DoubleVector null_score; // log-likelihood of each branch collapsed to zero
//DoubleVector opt_score; // optimized log-likelihood for every branch
//DoubleVector nni1_score; // log-likelihood for 1st NNI for every branch
//DoubleVector nni2_score; // log-likelihood for 2nd NNI for every branch
vector<DoubleVector> cur_brlen; // current branch lengths
//DoubleVector opt_brlen; // optimized branch lengths for every branch
vector<DoubleVector> nni1_brlen; // branch length for 1st NNI for every branch
vector<DoubleVector> nni2_brlen; // branch length for 2nd NNI for every branch
//double *mem_ptnlh; // total memory allocated for all pattern likelihood vectors
double *cur_ptnlh; // current pattern likelihoods of the tree
//double *nni1_ptnlh; // pattern likelihoods of 1st NNI tree
//double *nni2_ptnlh; // pattern likelihoods of 2nd NNI tree
NNIMove nniMoves[2];
};
class PhyloSuperTree;
/**
Super alignment representing presence/absence of sequences in
k partitions for a total of n sequences. It has the form:
Site_1 Site_2 ... Site_k
Seq_1 1 0 ... 1
Seq_2 0 1 ... 0
... ...
Seq_n 1 1 ... 0
Where (i,j)=1 means Seq_i is present in partition j, 0 otherwise
So data is binary.
@author BUI Quang Minh <minh.bui@univie.ac.at>
*/
class SuperAlignment : public Alignment
{
public:
/** constructor initialize from a supertree */
SuperAlignment(Params ¶ms);
/** constructor initialize empty alignment */
SuperAlignment();
/** destructor */
~SuperAlignment();
void init(StrVector *sequence_names = NULL);
/** return that this is a super-alignment structure */
virtual bool isSuperAlignment() { return true; }
/** read partition model file */
void readPartition(Params ¶ms);
/** read RAxML-style partition file */
void readPartitionRaxml(Params ¶ms);
/** read partition model file in NEXUS format into variable info */
void readPartitionNexus(Params ¶ms);
void printPartition(const char *filename);
void printPartitionRaxml(const char *filename);
void printBestPartition(const char *filename);
void printBestPartitionRaxml(const char *filename);
/**
* create taxa_index from super-alignment to sub-alignment
* @param part index of sub-alignment
*/
void linkSubAlignment(int part);
/**
* @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites
* the index of sites in 2nd, 3rd,... genes have to be increased by the number of patterns in previous genes
* so that all indices are distinguishable
*/
virtual void getSitePatternIndex(IntVector &pattern_index);
/**
* @param freq (OUT) vector of site-pattern frequencies for all sub-alignments
*/
virtual void getPatternFreq(IntVector &pattern_freq);
/**
* @param[out] freq vector of site-pattern frequencies
*/
virtual void getPatternFreq(int *freq);
/**
Print all site information to a file
@param filename output file name
*/
virtual void printSiteInfo(const char* filename);
/**
extract sub-alignment of a sub-set of sequences
@param aln original input alignment
@param seq_id ID of sequences to extract from
@param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
@param min_taxa only keep alignment that has >= min_taxa sequences
@param[out] kept_partitions (for SuperAlignment) indices of kept partitions
*/
virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL);
/**
* remove identical sequences from alignment
* @param not_remove name of sequence where removal is avoided
* @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1
* @param removed_seqs (OUT) name of removed sequences
* @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences
* @return this if no sequences were removed, or new alignment if at least 1 sequence was removed
*/
virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs);
/*
check if some state is absent, which may cause numerical issues
@param msg additional message into the warning
@return number of absent states in the alignment
*/
virtual int checkAbsentStates(string msg);
/**
Quit if some sequences contain only gaps or missing data
*/
//virtual void checkGappySeq(bool force_error = true);
/**
create a non-parametric bootstrap alignment by resampling sites within partitions
@param aln input alignment
@param pattern_freq (OUT) if not NULL, will store the resampled pattern frequencies
@param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
must equal m, where m is the alignment length. Otherwise, an error will occur.
If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
*/
virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL);
/**
resampling pattern frequency by a non-parametric bootstrap
@param pattern_freq (OUT) resampled pattern frequencies
@param spec bootstrap specification, see above
*/
virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL);
/**
resampling pattern frequency by a non-parametric bootstrap
@param pattern_freq (OUT) resampled pattern frequencies
@param spec bootstrap specification, see above
@param rstream random generator stream, NULL to use the global randstream
*/
virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL);
/**
* shuffle alignment by randomizing the order of sites over all sub-alignments
*/
virtual void shuffleAlignment();
/**
compute the observed (Hamming) distance (number of different pairs of positions per site)
between two sequences
@param seq1 index of sequence 1
@param seq2 index of sequence 2
@return the observed distance between seq1 and seq2 (between 0.0 and 1.0)
*/
virtual double computeObsDist(int seq1, int seq2);
/**
compute the Juke-Cantor corrected distance between 2 sequences over all partitions
@param seq1 index of sequence 1
@param seq2 index of sequence 2
@return any distance between seq1 and seq2
*/
virtual double computeDist(int seq1, int seq2);
/**
* print the super-alignment to a file
* @param filename
* @param append TRUE to append to this file, false to write new file
*/
void printCombinedAlignment(const char *filename, bool append = false);
/**
* print the super-alignment to a stream
* @param out output stream
* @param print_taxid true to print taxa IDs instead of names, default: false
*/
void printCombinedAlignment(ostream &out, bool print_taxid = false);
/**
* print all sub alignments into files with prefix, suffix is the charset name
* @param prefix prefix of output files
*/
void printSubAlignments(Params ¶ms);
/**
@return unconstrained log-likelihood (without a tree)
*/
virtual double computeUnconstrainedLogL();
/**
* @return proportion of missing data in super alignment
*/
double computeMissingData();
/**
* build all patterns of super alignent from partitions and taxa_index
* it is in form of a binary alignment, where 0 means absence and 1 means presence
* of a gene in a sequence
*/
void buildPattern();
/**
count the fraction of constant sites in the alignment, update the variable frac_const_sites
*/
virtual void countConstSite();
/**
* @return number of states, if it is a partition model, return max num_states across all partitions
*/
virtual int getMaxNumStates() {
return max_num_states;
}
/** order pattern by number of character states and return in ptn_order
*/
virtual void orderPatternByNumChars(int pat_type);
/**
actual partition alignments
*/
vector<Alignment*> partitions;
/**
matrix represents the index of taxon i in partition j, -1 if the taxon is not present
*/
vector<IntVector> taxa_index;
/** maximum number of states across all partitions */
int max_num_states;
/**
* concatenate subset of alignments
* @param ids IDs of sub-alignments
* @return concatenated alignment
*/
Alignment *concatenateAlignments(set<int> &ids);
/**
* concatenate all alignments
* @return concatenated alignment
*/
Alignment *concatenateAlignments();
};
#endif
|