1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
|
#ifndef _MODELPOMO_H_
#define _MODELPOMO_H_
#include "modeldna.h"
/* PoMo mutation rate boundaries. This is not necessary anymore
because rates are bounded by underlying Markov model*/
/* const double POMO_MIN_RATE = 5e-5; */
/* const double POMO_INIT_RATE = 1e-3; */
/* const double POMO_MAX_RATE = 1e-1; */
// HETEROZYGOSITY BOUNDARIES ARE NOW SET IN A DYNAMIC WAY TO IMPROVE NUMERICAL
// STABILITY. EXCEEDING THESE VALUES ONLY INDUCES WARNINGS.
/* Boundaries for level of polymorphism or heterozygosity. The heterozygosity
boundaries strongly affect numerical stability. I set them so that errors are
very seldom but there may be data that requires different boundaries. Maybe
they should be set variable, depending on data or user input. */
const double POMO_MIN_HETEROZYGOSITY = 1e-5;
const double POMO_MAX_HETEROZYGOSITY = 1e-1;
/* Not so stringent values. However, crashes are expected, especially because of
the maximum value (?).*/
/* const double POMO_MIN_HETEROZYGOSITY = 1e-4; */
/* const double POMO_MAX_HETEROZYGOSITY = 1e-1; */
// Relative boundaries for the heterozygosity.
const double POMO_MIN_REL_HETEROZYGOSITY = 0.5;
const double POMO_MAX_REL_HETEROZYGOSITY = 3;
/* Boundaries for boundary frequencies. This is not necessary anymore
because those are set by the underlying Markov model. The actual
boundaries will be set, e.g., to
freq_boundary_states[i]*POMO_MIN_REL_FREQ. */
/* const double POMO_MIN_REL_FREQ = 0.5; */
/* const double POMO_MAX_REL_FREQ = 2.0; */
// Boundaries for each of the boundary states.
const double POMO_MIN_BOUNDARY_FREQ = 0.05;
const double POMO_MAX_BOUNDARY_FREQ = 0.95;
class ModelPoMo : virtual public ModelMarkov
{
public:
/**
* Constructor.
* ModelMarkov() constructor calls ModelSubst() constructor.
* ModelSubst():
* - allocates state_freq[tree->aln->num_states]
* ModelMarkov():
* - allocates rates[getNumRateEntries()] = rates[n*(n-1)/2];
* - allocates eigenvalues and eigenvectors.
*
* @param model_name The name of the model (e.g., "HKY+P").
* @param model_params The parameters of the model (user defined models.).
* @param freq_type
* @param freq_params
* @param tree Associated tree for the model.
* @param pomo_heterozygosity heterozygosity of PoMo
*
* @return
*/
ModelPoMo(const char *model_name, string model_params, StateFreqType freq_type, string freq_params,
PhyloTree *tree, string pomo_heterozygosity);
ModelPoMo(PhyloTree *tree);
~ModelPoMo();
// Tell the compiler we want both the init functions (resolve
// warning from clang about PoMo hiding the overloaded init
// function).
using ModelMarkov::init;
/**
* Initialize the PoMo model. Run by constructor.
*
* @param model_name
* @param model_params
* @param freq_type
* @param freq_params
*/
virtual void init(const char *model_name,
string model_params,
StateFreqType freq_type,
string freq_params,
string pomo_heterozygosity);
/**
* \brief Initialize underlying mutation model.
*
* PoMo is built on top of any Markov mutation model which is of class
* ModelMarkov in IQ-TREE and is denominated "underlying" Markov model or
* "underlying" mutation model. The idea is to use the machinery of the
* underlying Markov model and only introduce an additional layer that adds
* polymorphic states and one parameter, namely the heterozygosity or level
* of polymorphism.
*
*/
void init_mutation_model(const char *model_name,
string model_params,
StateFreqType freq_type,
string freq_params,
string pomo_heterozygosity);
/**
* \brief Initialize sampling type.
*
* Weighted (standard) or sampled.
*
*/
void init_sampling_method();
/**
* \brief Initialize state frequencies.
*
* Use the machinery of the underlying mutation model.
*
*/
void init_boundary_frequencies();
/**
* \brief Check if parameters are optimized or not.
*
* The parameters of PoMo are the ones from the underlying mutation model
* plus the the heterozygosity or the level of polymorphism.
*
*/
void init_fixed_parameters(string model_params,
string pomo_heterozygosity);
/**
* Initialize rate_matrix and state_freq for boundary mutation model.
*/
void updatePoMoStatesAndRateMatrix();
/**
* @return model name
*/
virtual string getName();
/**
* @return Number of free parameters.
*/
virtual int getNDim();
/**
@return the number of dimensions corresponding to state frequencies
*/
virtual int getNDimFreq();
/**
* Set bounds for joint optimization with BFGS.
*/
virtual void setBounds(double *lower_bound,
double *upper_bound,
bool *bound_check);
/**
* Write information to output stream (only with -vv).
* @param out Output stream.
*/
virtual void writeInfo(ostream &out);
/**
* The target function which needs to be optimized
* @param x the input vector x
* @return the function value at x
*/
virtual double targetFunk(double x[]);
/**
* @return TRUE if parameters are at the boundary that may cause
* numerical unstability
*/
virtual bool isUnstableParameters();
virtual bool isPolymorphismAware() { return true; };
/**
* @return the number of rate entries
*/
virtual int getNumRateEntries() { return n_alleles*(n_alleles-1)/2; };
/**
* \brief Normalize boundary frequencies so that they sum to 1.0.
*
*/
void normalize_boundary_freqs(double * bfs);
/**
* \brief Check if boundary frequencies are within bounds.
*
*/
void check_boundary_freqs(double * bfs);
/**
* Estimate the empirical (relative) boundary state frequencies. The
* number of A, C, G, and T in the data is summed up and the
* relative proportion of all bases is calculated. The empirical
* boundary state frequencies are set to these relative proportions.
*
* @param freq_boundary_states (OUT) The estimated boundary frequencies,
* size num_states.
*/
void estimateEmpiricalBoundaryStateFreqs(double * freq_boundary_states);
/**
* Estimate the empirical (relative) sum of polymorhic states.
* The number of polymorphic sites in the data is summed up and
* the relative proportion of all sites is calculated and returned.
*
*/
double estimateEmpiricalWattersonTheta();
// Extract the rate entries of a rate matrix (basically remove the diagonal
// and align upper triangle before lower triangle). IN: rate matrix m; OUT:
// rates r (array of size n_connections or 2*n_connections).
void rate_matrix_to_rates(double *m, double *r);
// Extract the exchangeability entries of a rate matrix (basically remove the
// diagonal and align upper triangle before lower triangle; divide through
// stationary frequencies of target alleles). IN: rate matrix m; OUT: rates r
// (array of size n_connections or 2*n_connections).
void rate_matrix_to_exchangeabilities(double *m, double *r);
/**
* Report the model rates to the output file stream 'out'.
*
* @param out Output file stream.
*/
void report_model_params(ostream &out, bool reset_scale = true);
/**
* Report the state frequencies to the output file stream 'out'.
*
* @param out Output file stream.
*/
virtual void report(ostream &out);
/**
* Normalize the mutation probabilities such that the given heterozygosity
* or level of polymorphism is honored.
*
*/
void normalizeMutationRates();
/**
set checkpoint object
@param checkpoint
*/
virtual void setCheckpoint(Checkpoint *checkpoint);
/**
start structure for checkpointing
*/
virtual void startCheckpoint();
/**
save object into the checkpoint
*/
virtual void saveCheckpoint();
/**
Restore object from the checkpoint.
*/
virtual void restoreCheckpoint();
/**
Decompose the rate matrix into eigenvalues and eigenvectors.
This function is necessary, because handling of reversible and
non-reversible models differs in ModelMarkov, but should not
differ for ModelPoMo.
*/
virtual void decomposeRateMatrix();
// I had serious problems with numerical instabilities because of fixed
// boundaries for the heterozygosity. However, it is easy to get an empirical
// value of the heterozygosity and set boundaries according to this value.
// This improves numerical stability. This step does not make sense and,
// hence, is not done, if the heterozygosity is set by the user or to the
// empirical value.
void set_heterozygosity_boundaries();
/**
compute the transition probability matrix.
@param time time between two events
@param mixture (optional) class for mixture model
@param trans_matrix (OUT) the transition matrix between all pairs of states.
Assume trans_matrix has size of num_states * num_states.
*/
virtual void computeTransMatrix(double time, double *trans_matrix, int mixture = 0);
/**
* Set the scale factor of the mutation rates to NEW_SCALE.
*
* @param scale (IN).
*/
void setScale(double new_scale);
/**
* get the underlying mutation model, used with PoMo model
*/
virtual ModelSubst *getMutationModel() { return mutation_model; }
protected:
ModelMarkov *mutation_model;
/**
Compute the rate matrix and then normalize it such that the total number of mutations is 1.
@param rate_matrix (OUT). It is filled with rate matrix entries
@param state_freq state frequencies
@param num_state number of states
*/
virtual void computeRateMatrix(double **rate_matrix, double *state_freq, int num_state);
// Get the current scale factor of the mutation rates.
double getScale();
/**
* This function is served for the multi-dimension
* optimization. It should pack the model parameters into a vector
* that is index from 1 (NOTE: not from 0)
*
* @param variables (OUT) Vector of variables, indexed from 1.
*/
virtual void setVariables(double *variables);
/**
* This function is served for the multi-dimension
* optimization. It should assign the model parameters from a
* vector of variables that is index from 1 (NOTE: not from 0)
*
* @param variables Vector of variables, indexed from 1.
* @return TRUE if parameters are changed, FALSE otherwise (2015-10-20)
*/
virtual bool getVariables(double *variables);
/**
* Called from getVariables() to update the rate matrix for the
* new model parameters. For ModelPoMo this is only a dummy
* function, which has to be declared empty because otherwise,
* restoreCheckpoint() calls the ModelMarkov::setRates() which in
* turn throws an error.
*/
virtual void setRates();
protected:
/*!< Virtual population size of the PoMo model. */
int N;
/**
* Full mutation rate matrix (Q^REV + Q^NONREV; or, Q^GTR + Q^FLUX).
*/
double *mutation_rate_matrix;
/**
* 4 unnormalized stationary frequencies of boundary states.
*/
double *freq_boundary_states;
/**
* Normalized empirical stationary frequencies.
*/
double *freq_boundary_states_emp;
/* /\** */
/* * The rate matrix of the PoMo model. */
/* *\/ */
/* double *rate_matrix; */
/**
* Decompose state (0..57) into abundance of two nucleotides.
*
* @param i (OUT) Abundance of nucleotide 1.
* @param nt1 (OUT) Nucleotide 1 (0: A, 1: C, 2: G, 3: T).
* @param nt2 (OUT) Nucleotide 2 (0: A, 1: C, 2: G, 3: T).
*/
void decomposeState(int state, int &i, int &nt1, int &nt2);
/**
* Compute the normalized stationary frequencies that fulfill the
* detailed balance condition.
*/
void computeStateFreq();
/**
* Compute probability of change from state1 to state2 in one
* Moran with boundary mutation model generation.
*/
double computeProbBoundaryMutation(int state1, int state2);
bool isBoundary(int state);
bool isPolymorphic(int state);
/**
* Get symmetric mutation coefficient from nt1 to nt2 (mij = mji).
*
* @param nt1
* @param nt2
*
* @return
*/
double mutCoeff(int nt1, int nt2);
/**
* Set the initial mutation coefficients which resemble the level
* of polymorphism in the data.
*
* @return
*/
void setInitialMutCoeff();
/**
* Compute the sum of the frequencies of the boundary states.
*
* @return
*/
double computeSumFreqBoundaryStates();
/**
* Compute part of normalization constant of polymorphic states.
*
* @return
*/
double computeSumFreqPolyStates();
/**
* Compute the sum over lamda_pol without mutliplying with mutation
* coefficients. This is useful if the mutation coefficient is constant,
* e.g., when the initial value is set.
*
* @return
*/
double computeSumFreqPolyStatesNoMut();
/**
* Compute the normalization constant. This constant ensures that
* the stationary frequencies (invariant measure) sum up to 1
* (invariant distribution).
*
* @return
*/
double computeNormConst();
/**
* Set the boundary frequency of T such that all boundary frequencies
* sum up to one. This is done, so that they can be compared to
* the frequencies of the GTR model.
*/
void updateFreqBoundaryState ();
/**
* Precision and treshold value for mathematical computations and
* numerical stability checks.
*
* Feel free to set to a global value if there is one available.
*
*/
double eps;
/// Number of nucleotides (alleles). This might be useful in the
/// future, when we do not restrict PoMo to DNA models only.
/// Eventual todo: do not hardcode this.
int n_alleles;
/// True if heterozygosity has been fixed.
bool fixed_heterozygosity;
/// True if heterozygosity has been fixed by user.
bool fixed_heterozygosity_usr;
/// True if heterozygosity has been fixed to empirical estimate
/// from data.
bool fixed_heterozygosity_emp;
/**
* Heterozygosity or level of polymorphism. Will be set by init() and is
* used to normalize the mutation coefficients. This is needed to be done
* because PoMo had trouble to estimate this (especially, when N was large).
* See also ModelPoMo::normalizeMutationRates().
*/
double heterozygosity;
/// True if the model parameters are fixed (e.g., if the
/// transition to transversion ratio is set in the HKY model).
bool fixed_model_params;
/// This array contains the ratio of the fixed mutation
/// probability to the base mutation rate that is still estimated
/// and describes the amount of polymorphism. I.e., for the HKY
/// model with transition to transversion ratio 2.0, this vector
/// would look like [1, 2, 1, 1, 2, 1]
double * fixed_model_params_ratio;
/// The number of connections between nucleotides. If n_alleles=4,
/// there are 6 connections. Set in ModelPoMo::init().
int n_connections;
/// Random binomial sampling or weighted; specified when alignment
/// is created already (Alignment::readCountsFormat()).
SamplingType sampling_method;
// Minimum heterozygosity, set by `set_heterozygosity_boundaries()`.
double min_heterozygosity;
// Maximum heterozygosity, set by `set_heterozygosity_boundaries()`.
double max_heterozygosity;
// The scale factor of the mutation rates. Important for Gamma rate
// heterogeneity.
double scale;
};
#endif /* _MODELPOMO_H_ */
|