1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
|
/***************************************************************************
* PHAST: PHylogenetic Analysis with Space/Time models
* Copyright (c) 2002-2005 University of California, 2006-2010 Cornell
* University. All rights reserved.
*
* This source code is distributed under a BSD-style license. See the
* file LICENSE.txt for details.
***************************************************************************/
/** @file tree_model.h
@brief A tree model represents a phylogenetic tree, substitution rate matrix,
and background frequencies.
The model allows for rate variation and
also for varying substitution models on different branches. If the
tree model is optimized by maximum likelihood, the tree model object
contains data which indicate which parameters to hold constant, and
which to optimize, as well as boundary conditions.
@ingroup phylo
*/
#ifndef TREE_MODEL_H
#define TREE_MODEL_H
#include <vector.h>
#include <markov_matrix.h>
#include <trees.h>
#include <stringsplus.h>
#include <msa.h>
#include <misc.h>
#include <numerical_opt.h>
#include <subst_mods.h>
#include <hmm.h>
/** Maximum alphabet size */
#define MAX_ALPH_SIZE 100
/** In cases where a function of complex numbers is supposed to be
real, we'll consider it close enough if the imaginary component is
smaller in magnitude than this threshold
*/
#define TM_IMAG_EPS 1e-6
/* convergence thresholds for EM, expressed as portions of function
value: (f(xold) - f(xnew)) / f(xnew) */
/** High convergence threshold for EM (as portion of function value (f(xold) - f(xnew)) / f(xnew))*/
#define TM_EM_CONV_HIGH 1e-8
/** Medium convergence threshold for EM (as portion of function value (f(xold) - f(xnew)) / f(xnew))*/
#define TM_EM_CONV_MED 5e-7
/** Low convergence threshold for EM (as portion of function value (f(xold) - f(xnew)) / f(xnew))*/
#define TM_EM_CONV_LOW 1e-5
/** Crude convergence threshold for EM (as portion of function value (f(xold) - f(xnew)) / f(xnew))*/
#define TM_EM_CONV_CRUDE 1e-4
/** Test of conversion */
#define TM_EM_CONV(P) ( (P) == OPT_HIGH_PREC ? TM_EM_CONV_HIGH : ( (P) == OPT_MED_PREC ? TM_EM_CONV_MED : ( (P) == OPT_CRUDE_PREC ? TM_EM_CONV_CRUDE : TM_EM_CONV_LOW) ))
#define BACKGD_STR "backgd"
#define RATEMAT_STR "ratematrix"
#define RATEVAR_STR "ratevar"
#define BRANCHES_STR "branches"
#define SCALE_STR "scale"
#define SCALE_SUB_STR "scale_sub"
#define SELECTION_STR "sel"
#define BGC_STR "bgc"
/** Type of branch length estimation */
typedef enum {
TM_BRANCHLENS_ALL, /**< Estimate branch lengths */
TM_BRANCHLENS_CLOCK, /**< Molecular clock */
TM_SCALE_ONLY, /**< Scale factor only */
TM_BRANCHLENS_NONE /**< Do not estimate branch lengths */
} blen_estim_type;
/** Type of parameter bound; used in bounded estimation of subtree scale */
typedef enum {
NB, /**< No bound */
LB, /**< Lower bound -- subtree scale at
least as large as scale of whole
tree */
UB /**< Upper bound -- subtree scale at
most as large as scale of whole
tree */
} scale_bound_type;
struct tp_struct;
/** Defines alternative substitution model for a particular branch */
typedef struct alt_subst_mod {
subst_mod_type subst_mod; /**< Substitution model */
Vector *backgd_freqs; /**< Eq freqs (set to NULL if separate_backgd=0) */
MarkovMatrix *rate_matrix; /**< Rate matrix (set to NULL if separate_model=0) */
/* Indices in main model parameters where lineage-specific paramers start */
int ratematrix_idx, /**< Indices of rate matrix model parameters */
backgd_idx, /**< Index of first background frequency parameter */
selection_idx, /**< Index of selection parameter */
bgc_idx; /**< Index of bias gene conversion parameter */
double selection, /**< Optional selection parameters for this model (only used if selection_idx >=0 ) . Total selection on the model is the sum of this parameter and the selection value in the main model.*/
bgc; /**< Optional bgc parameters for this model (only used if bgc_idx >=0 ) */
int separate_model; /**< ==1 if no parameters shared with main model */
int separate_backgd; /**< ==1 if no background frequencies shared with non-alternate substitution model */
List *param_list; /**< List of string arguments giving which parameters to
estimate separately (only if separate_model=0) */
String *defString; /**< Name of alternative Model. This is the argument given to phyloFit
to define the alternative model */
String *noopt_arg; /**< Which parameters to hold constant (represented as a command line argument) */
struct alt_subst_mod *share_sel, /**< If not NULL, share selection parameter with another altSubstMod */
*share_bgc; /**< If not NULL, share bgc parameter with another altSubstMod */
/** Note: could implement sharing with other parameters, hasn't been needed yet */
} AltSubstMod;
/** Tree model object */
struct tm_struct {
TreeNode *tree; /**< Root node of tree (used to traverse tree node by node) */
Vector *backgd_freqs; /**< Equilibrium frequencies */
MarkovMatrix *rate_matrix; /**< Rate matrix */
subst_mod_type subst_mod; /**< Substitution model being used for this tree model */
int *msa_seq_idx; /**< (Optional) Mapping from leaf
indices to sequence indices in a
given MSA; used in likelihood
computations */
MSA *msa; /**< (Optional) MSA from which TreeModel
was estimated; for use in tm_fit */
int category; /**< (Optional) site category in MSA or -1 */
int order; /**< Order of model */
double alpha; /**< For gamma or discrete-gamma rate
variation; set to 0 for constant rate */
int nratecats; /**< Number of rate categories, discrete gamma */
double lnL; /**< Log likelihood from training */
struct tp_struct *tree_posteriors;
/**< (Optional) associated
TreePosteriors object */
int use_conditionals; /**< (Optional) compute likelihood using
conditional probabilities at each
column; only relevant when order >
0 */
MarkovMatrix ***P; /**< Probability matrices for edges,
indexed by node id and rate category */
double *rK, *freqK; /**< Rate constants and frequencies */
List **rate_matrix_param_row, /**< Rate matrix parameter row */
**rate_matrix_param_col; /**< Rate matrix parameter column */
int root_leaf_id; /**< Indicates id of leaf node to be
Interpreted as the root. Must be
a child of the actual root of the
tree. Ordinarily will have a null
value (-1), but if non-negative, the
distance to its parent will be
constrained to be zero. */
int allow_gaps; /**< If TRUE, gaps are not allowed to be
taken into account for a model */
int allow_but_penalize_gaps; /**< If TRUE, gaps are allowed but are
penalized NOTE: not used */
int inform_reqd; /**< If TRUE, only "informative" sites
will be given non-zero probability */
int estimate_backgd; /**< Estimate background frequencies as free
parameters in the optimization */
blen_estim_type estimate_branchlens;
/**< If value is TM_BRANCHLENS_ALL, then
estimate all branch lengths; if
value is TM_SCALE_ONLY, then
estimate only a scale factor; and
if value is TM_BRANCHLENS_NONE, do
not estimate branch lengths */
double scale; /**< Current scale factor */
double scale_sub; /**< Scale factor for subtree, when
estimating separate scale factors
for subtree and supertree */
double selection;
int *in_subtree; /**< Array indicating whether each
branch is in designated subtree */
scale_bound_type scale_sub_bound;
/**< Bound on scale of subtree */
TreeNode *subtree_root; /**< Node defining subtree */
int *ignore_branch; /**< If ignore_branch is non-NULL and
ignore_branch[i] == TRUE, then
branch i will be ignored (treated
as infinitely long) in likelihood
calculations */
int empirical_rates; /**< Indicates "empirical"
(non-parametric) model for
rate-variation */
int site_model; /* TRUE if this is a Nielsen-Yang site model; indicates parameterization for site categories */
int estimate_ratemat; /**< Indicates whether rate-matrix
parameters should be estimated */
AltSubstMod ***alt_subst_mods_ptr; /**< Pointer to alt_subst_mod for
each branch and each rate category */
List *alt_subst_mods; /**< List of relevant AltSubstMods for
this tree. alt_subst_mods_ptr above are
usually pointers to elements in this
list */
Vector *all_params; /**< All parameters relevant to this model */
// Vector *lowbound, *upbound;
int *param_map; /**< Map of parameter indices used by
phyloFit. If param_map[i]==-1, then
parameter i is held constant. Otherwise,
if param_map[i]==j, then it is the j'th
element in the vector of parameters which
are optimized */
/* These are the indices in all_params that show where scale, branchlens, rateMatrix,
backgd_freqs, and rate variation parameters are stored*/
int scale_idx, /**< Starting index of scale parameters in parameter vector*/
bl_idx, /**< Starting index of branch length parameters in parameter vector */
ratematrix_idx, /**< Starting index of rate matrix info in parameter vector */
backgd_idx, /**< Starting index of background frequency info in parameter vector */
ratevar_idx, /**< Starting index of rate variation parameters in parameter vector */
selection_idx; /**< Starting index of selection parameters in parameter vector */
List *bound_arg; /**< Used by phyloFit, this is a copy of the
command-line argument(s) which specify
boundaries on parameters */
String *noopt_arg; /**< Used by phyloFit, this is a copy of the
command-line argument which specifies
which parameters to hold constant */
int eqfreq_sym; /**< Use symmetrical equilibrium frequencies? */
int scale_during_opt; /**< Whether to scale rate matrix during optimization.
Normally 0, but 1 if TM_BRANCHLENS_NONE, or
if TM_SCALE and alt_subst_mods!=NULL */
int **iupac_inv_map; /**< Inverse map for IUPAC ambiguity characters */
};
typedef struct tm_struct TreeModel;
/** \name Tree Model allocation/cleanup functions
\{ */
/** Create new tree model object.*/
TreeModel *tm_new(TreeNode *tree, /**< Phylogenetic tree */
MarkovMatrix *rate_matrix, /**< Matrix of transition/transversion rates */
Vector *backgd_freqs, /**< Background frequencies */
subst_mod_type subst_mod, /**< Substitution Model i.e. REV (reversable)*/
char *alphabet, /**< List of possible characters in the sequence data i.e. 'ATGC' */
int nratecats, /**< Number of rate categories */
double alpha, /**< Alpha parameter */
List *rate_consts, /**< List defining rate constants using
non-parametric mixture model for rates instead of gamma dist. */
int root_leaf_id /**< ID number of the root node */
);
/** Re-initialize a tree model with an altered substitution model,
number of rate categories, alpha, set of rate consts, or set of
rate weights.
@param tm Tree Model to re-initialize
@param subst_mod New substitution model
@param new_nratecats New number of rate categories
@param new_alpha New alpha parameter (ignored if new_nratecats == 1)
@param new_rate_consts New list of explicit rate
constants. If non-NULL, implies use of empirical rates.
@param new_rate_weights New list of explicit rate weights
(mixing proportions). Will be normalized automatically. If
new_rate_consts != NULL and new_rate_weights == NULL, weights
will be initialized to approx gamma distrib. defined by alpha
*/
void tm_reinit(TreeModel *tm, subst_mod_type subst_mod,
int new_nratecats, double new_alpha,
List *new_rate_consts, List *new_rate_weights);
/** Set up lists that allow each rate matrix parameter to be mapped to
the rows and columns in which it appears.
@param tm Tree Model that contains rate matrix
*/
void tm_init_rmp(TreeModel *tm);
/** Free rate matrix parameter
@param tm Tree Model that contains rate matrix parameter */
void tm_free_rmp(TreeModel *tm);
/** Free tree model
@param tm Tree Model to free */
void tm_free(TreeModel *tm);
/** \} */
/** \name Tree Model file access functions
\{ */
/** Read tree model from file.
@param F File descriptor of file containing tree model
@param discard_likelihood Whether to discard the log likelihood saved in the file
@result Newly allocated TreeModel from file
@note Simple format specifies: Background
rates, rate matrix, and tree (topology and branch lengths) */
TreeModel *tm_new_from_file(FILE *F, int discard_likelihood);
/** Tree Model to save to file
@param F File descriptor to save Tree Model to
@param tm Tree model to save to file
*/
void tm_print(FILE *F, TreeModel *tm);
/** \} */
/** Create a copy of TreeModel with an alternative substitution model.
@param tm Tree Model to copy from
@param altmod_str Alternate substitution model */
AltSubstMod* tm_add_alt_mod(TreeModel *tm, String *altmod_str);
/** Tree Model to save to file
@param F File descriptor to save Tree Model to
@param tm Tree model to save to file
*/
void tm_print(FILE *F, TreeModel *tm);
/** Copy a Tree Model to an existing tree model
@param dest Destination to copy tree model to
@param src Source to copy tree model from
*/
void tm_cpy(TreeModel *dest, TreeModel *src);
/** Copy a tree model into a new TreeModel object.
@param src Tree Model source to copy contents from
@result Newly allocated TreeModel containing data from src */
TreeModel *tm_create_copy(TreeModel *src);
/** \name Tree Model substitution matrix functions
\{ */
/** Setup the substitution matrices on a Tree Model.
@param tm Tree Model to setup substitution matrix for
*/
void tm_set_subst_matrices(TreeModel *tm);
/** Setup the substitution matrices on a Tree Model with custom probability matrix and branch length.
@param P Probability matrix to use
@param t Branch length to use
*/
void tm_set_subst_matrix(TreeModel *tm, MarkovMatrix *P, double t);
/** Create an alternate substitution model with new background frequencies and probabilities.
@param subst_mod Existing Substitution Model to base new model on
@param backgd_freqs Background frequencies for new model
@param rate_matrix Markov Matrix defining substitution probabilities
@result Newly created, alternate substitution method
*/
AltSubstMod* tm_new_alt_subst_mod(subst_mod_type subst_mod,
Vector *backgd_freqs,
MarkovMatrix *rate_matrix);
/** Free an alternate substitution model.
@param am Alternate substitution model to free
*/
void tm_free_alt_subst_mod(AltSubstMod *am);
/** \} \name Tree Model scale functions
\{ */
/** Scales rate matrix and Optionally parameters, branch lengths, substitution matrices.
@param tm Tree Model containing rate matrix to scale
@param params (Optional) Parameters to scale
@param scale_blens Whether to scale branch lengths as well
@param reset_subst_matrices Whether to reset substitution matrices as well
@warning Works with lineage-specific models (although this is not necessarily the proper way to calculate scale factor!)
*/
void tm_scale_model(TreeModel *tm, Vector *params, int scale_blens,
int reset_subst_matrices);
/** Allocate (if necessary) and initialize backgd frequencies based on observed frequencies in MSA.
If model is symmetric (SSREV) then makes backgd frequencies symmetric.
If mod->backgd_freqs is already allocated, it is not reallocated and assumed to be the correct size.
@param mod A Tree model object. mod->backgd_freqs will be set by this function.
@param msa An MSA object
@param cat The category of the msa to use to compute the backgd
*/
void tm_init_backgd(TreeModel *mod, MSA *msa, int cat);
/** Modifies equilibrium frequency of a model in such a way that
reversibility is maintained.
@param tm Tree Model containing equilibrium frequency to change
@param newfreqs New equilibrium frequencies for tree model
*/
void tm_mod_freqs(TreeModel *tm, Vector *newfreqs);
/** Scale evolutionary rate (only effects branch lengths)
@param tm Tree Model containing branch lengths to scale
@param scale_const Constant to scale by
@param reset_subst_mats Whether to reset substitution matrices
@note to scale rate matrix and more see tm_scale_model
@see tm_scale_model
*/
void tm_scale_branchlens(TreeModel *tm, double scale_const, int reset_subst_mats);
/** Scale the rate matrix of the specified tree model such that the
expected rate of substitution at equilibrium is 1. This is the
standard convention; it allows the unit of measurement of branch
lengths to be expected substitutions/site.
@param mod Tree Model to scale
@result Scaling factor
@note Scaling factor can be used to adjust
branch lengths if the rate matrix has not been scaled throughout the
fitting procedure. */
double tm_scale_rate_matrix(TreeModel *mod);
/** Scale a parameter vector according to a specified rate matrix scale
factor. Branch length params are multiplied by the specified factor
and rate matrix params by its inverse.
@param mod Tree Model
@param params Parameters for scaling
@param scale_factor Factor to scale parameters by
*/
void tm_scale_params(TreeModel *mod, Vector *params, double scale_factor);
/** \} */
/** Fit a tree model to data using BFGS (multidimensional optimization algorithm)
@param mod Tree Model containing desired tree topology to fit, substitution model, and (if appropriate) background frequencies
@param params Initial values for optimization procedure
@param cat MSA category
@param precision Precision describing BFGS convergence criteria
@param logf File descriptor of log file
@param quiet Whether to report progress to stderr
@param error_file If non-NULL, write estimate, variance, and 95%
confidence interval for each parameter to this file.
@returns 0 on success, 1 on failure
*/
int tm_fit(TreeModel *mod, MSA *msa, Vector *params, int cat,
opt_precision_type precision, FILE *logf, int quiet,
FILE *error_file);
/** Fit several tree models (which share parameters) to data using BFGS
@param mod Array of tree models
@param nmod Length of mod array
@param msa Array of msas. There should be one for each mod, or a single msa with nmod categories. In that case mod[i] will apply to category i+1 (i in 0,...,nmod-1).
@param nmsa Length of msa array.
@param precision Precision describing BFGS convergence criteria
@param logf log file
@param quiet Whether to report progress to stderr
@returns 0 on success, 1 on failure
*/
int tm_fit_multi(TreeModel **mod, int nmod, MSA **msa, int nmsa,
opt_precision_type precision,
FILE *logf, int quiet);
/** Set specified TreeModel according to specified parameter vector.
Exact behavior depends on substitution model.
@param mod Tree Model to adjust parameter vector for
@param params Parameter vector values
@param idx_offset (Optional) Index offset for cases in which vectors of parameters are
nested within larger vectors of parameters (set to -1 if not
needed) */
void tm_unpack_params(TreeModel *mod, Vector *params, int idx_offset);
/** \name Tree Model parameter initialization
\{ */
/** Initializes all branch lengths and rate matrix parameters.
Initializes branch lengths to designated constant; initializes
rate-matrix params using specified kappa; kappa ignored for JC69;
REV and UNREST initialized as if HKY. Initializes alpha as
specified, if dgamma. In the case of empirical rates, uniform
weights are used for initialization.
@param mod Tree Model to initialize parameters for
@param branchlen Constant to use as initial branch length
@param kappa Rate matrix initialization parameter
@param alpha Initial value for alpha (if dgamma)
@result Parameter vector
*/
Vector *tm_params_init(TreeModel *mod, double branchlen, double kappa,
double alpha);
/** Initializes all branch lengths and rate matrix parameters using random values.
@param mod Tree Model to initialize parameters for
@result Parameter vector
*/
Vector *tm_params_init_random(TreeModel *mod);
/** Functions to initialize a parameter vector from an existing tree model
@param mod Tree Model used to generate parameter vector
@result Parameter vector
*/
Vector *tm_params_new_init_from_model(TreeModel *mod);
/** Functions to initialize a parameter vector from an existing tree model
@param[in] mod Tree Model used to generate parameter vector
@param[out] params Parameter vector
@param[in] setup_params If TRUE, calls tm_setup_params to set scale_idx, bl_idx, backgd_idx, etc to determine which parameters are stored where in the parameter vector. Otherwise assume this has already been done.
*/
void tm_params_init_from_model(TreeModel *mod, Vector *params, int setup_params);
/** Initialize branch lengths to average number of mutations under parsimony, and apply Jukes Cantor correction.
@param params (Optional) If params is NULL, sets the branch lengths in mod->tree directly. Otherwise sets the parameter vector
@param mod Tree Model to init branch lengths for
@param cat Category associated with tree model
@result Parsimony Score
*/
double tm_params_init_branchlens_parsimony(Vector *params,
TreeModel *mod, MSA *msa, int cat);
/** Setup parameter indexes (mod->scale_idx, mod->ratevar_idx, etc.) and initialize optimization vector.
Assigns parameter indices to mod->scale_idx,
mod->bl_idx, mod->ratevar_idx, mod->ratematrix_idx which point
to indices in mod->all_params where corresponding parameters are
stored. (as well as pointers in altmod structures).
Then, it determines which parameters are to be optimized, and allocates
and assigns values to mod->param_map, which maps indices in all_params
to indices in the optimization vector, so that param_map[i]=j implies
that all_params[i] is stored in opt_vec[j]. Only parameters which
are to be optimized are stored in the optimization vector. Parameters
which are held constant have param_map[i]=-1.
@param mod Tree Model
@param offset The first position to use for parameter indices (usually 0).
@result The number of parameters in this model
*/
int tm_setup_params(TreeModel *mod, int offset);
/** \} \name Tree Model get counts
\{ */
/** Return number of parameters for specified TreeModel
@param mod Tree Model
@note Based on number of taxa and substitution model
@result Nubmer of parameters for specified tree model
*/
int tm_get_nparams(TreeModel *mod);
/** Get the number of equilibrium frequency parameters.
@param mod Tree Model
@result Number of equilibrium frequencies for specified model
*/
int tm_get_neqfreqparams(TreeModel *mod);
/** Get the number of rate variable parameters.
@param mod Tree Model
@result Number of rate variable parameters
*/
int tm_get_nratevarparams(TreeModel *mod);
/** Return the number of leaves in a Tree
@param mod Tree Model to count leafs for
@result Number of leaves in tree model
*/
int tm_get_nleaf(TreeModel *mod);
/** Return the number of branch length parameters
@param mod Tree Model
@result Number of branch length parameters
*/
int tm_get_nbranchlenparams(TreeModel *mod);
/** \} \name Tree Model check reversibility
\{ */
/** Get whether the model being used is reversible
@param tm Tree model to test
@result 1 if reversible, 0 otherwise
*/
int tm_is_reversible(TreeModel *tm);
/** Get whether a node within a tree model is reversible
@param tm Tree model to test
@param node Node with alternate substitution model to test
@result 1 if reversible, 0 otherwise
*/
int tm_node_is_reversible(TreeModel *tm, TreeNode *node);
/** Get whether a substitution model is reversible
@param subst_mod Substitution Model of type subst_mod_type
@result 1 if reversible, 0 otherwise
*/
int subst_mod_is_reversible(int subst_mod);
/** \} \name Tree Model generate MSA
\{ */
/** Generates an alignment according to set of Tree Models and a
Markov matrix defining how to translate among them.
@pre Call srandom externally
@pre TreeModels must appear in same order as the states of the Markov matrix.
@param ncolumns Number of columns in MSA
@param hmm (Optional) Markov matrix of probabilities if NULL, single tree model assumed
@param classmods Array of tree models with different classes, one per HMM state (or 1 total if HMM is NULL)
@param labels (Optional) Used to record state (model) responsible for generating each site; pass NULL if hmm is NULL
@result Multiple Alignment generated from HMM and Tree Model
@note Only appropriate for order 0 models
*/
MSA *tm_generate_msa(int ncolumns, HMM *hmm,
TreeModel **classmods, int *labels);
/** Generates a random alignment using a list of scales and subtree scales.
@pre Call srandom externally
@param nsitesList List of int with each entry indicating number of sites per scale
@param scaleLst List of double with each entry indicating a scale
@param subtreeScaleLst List of double with each entry indicating scale of a subtree
@param model Tree Model
@param subtreeName (Optional) Name of node identifying subtree root
@result Multiple Alignment generated using a list of scales and subtree scales
@note If subtreeName is NULL subtree scales must be 1
*/
MSA *tm_generate_msa_scaleLst(List *nsitesList, List *scaleLst,
List *subtreeScaleLst, TreeModel *model,
char *subtreeName);
/** Generates an alignment according to a Tree Model, subtree, and and probability of subtree switching.
@pre Call srandom externally.
@param ncolumns Number of columns in MSA
@param mod Tree Model
@param subtreeMod Tree Model of the subtree
@param subtree Name of node identifying subtree root
@param subtreeSwitchProb Probability that a node switches its state of being in or out of the subtree
@result Multiple Alignment generated using a Tree Model, subtree, and probability of subtree switching
*/
MSA *tm_generate_msa_random_subtree(int ncolumns, TreeModel *mod,
TreeModel *subtreeMod,
char *subtree, double subtreeSwitchProb);
/** \} */
/** Given a codon model, create and return the induced amino acid model
@param codon_mod Codon model
@param Amino Acid model
*/
TreeModel *tm_induced_aa(TreeModel *codon_mod);
/** Build index of leaf ids to sequence indices in given alignment.
@param mod Tree Model containing leaves
@param msa Multiple Alignment containing sequence data
@note Leaves not present in the alignment will be ignored.
@note its not required that there's a leaf for every sequence in the
alignment.
*/
void tm_build_seq_idx(TreeModel *mod, MSA *msa);
/** Prune away leaves in tree that don't correspond to sequences in a
given alignment.
@param[in,out] mod Tree Model to prune
@param[in] msa Multiple Alignment; all leaves whose names are not in msa->names will be pruned away
@param[out] names Will contain names of deleted leaves on return. Must be pre-allocated
@warning Root of tree (value of mod->tree) may change.
*/
void tm_prune(TreeModel *mod, MSA *msa, List *names);
/** Extrapolate tree model and prune leaves not represented in
alignment.
@param[in] mod Tree Model to prune
@param[in] extrapolate_tree Super tree of mod->tree to scale
@param[in] msa Multiple Alignment; all leaves whose names are not in msa->names will be pruned away
@param[out] pruned_names Will contain names of deleted leaves on return. Must be pre-allocated
@result scale factor
@see tr_scale_by_subtree */
double tm_extrapolate_and_prune(TreeModel *mod, TreeNode *extrapolate_tree,
MSA *msa, List *pruned_names);
/** Reset TreeModel with new or altered tree.
@param[in,out] mod Tree model to modify
@param[in] newtree New tree to model
*/
void tm_reset_tree(TreeModel *mod, TreeNode *newtree);
/** Set branches to be ignored in likelihood calculation and parameter
estimation.
@param mod Tree Model containing branches to be ignored
@param ignore_branches List of Strings indicating nodes in the tree, and the branches leading to those nodes.
*/
void tm_set_ignore_branches(TreeModel *mod, List *ignore_branches);
/** For each parameter, report estimate, variance, and 95% confidence
interval, printed to given filename, one parameter per line.
@param mod Tree Model to determine error for
@param msa Multiple Alignment
@param params Vector of parameters to test on model
@param cat Whether using categories
@param error_fname Filename to save to
@param appendToFile Whether to append to file or overwrite
*/
void tm_free_alt_subst_mods(TreeModel *tm);
void tm_variance(FILE *outfile, TreeModel *mod, MSA *msa, Vector *params, int cat);
/** Calculate Lower/Upper bounds for each parameter or if not empty, get them from parsing mod->bound_arg
@pre *lower_bounds and *upper_bounds have not been allocated
@pre tm_setup_params has already been called
@param lower_bounds Unallocated vector pointer
@param upper_bounds Unallocated vector pointer
@param npar Number of parameters
@param mod Tree Model to estimate bounds for
@param allocate_default If TRUE, allocate space for lower_bounds and upper_bounds even if the boundaries are negative and positive infinity, respectively. If FALSE, set lower_bounds and upper_bounds to NULL to indicate these "default" boundaries.
*/
void tm_new_boundaries(Vector **lower_bounds,
Vector **upper_bounds,
int npar, TreeModel *mod,
int allocate_default);
/** Set lower/upper boundaries for each parameter describing a tree model
@param lower_bounds A vector describing the lower bound for each parameter
@param upper_bounds A vector describing the upper bound for each parameter
@param mod A tree model object
*/
void tm_set_boundaries(Vector *lower_bounds, Vector *upper_bounds,
TreeModel *mod);
struct likelihood_wrapper_struct {
MSA *align;
TreeModel *mod;
void (*unpackFunction)(Vector *params, TreeModel *mod);
double (*likelihoodFunction)(Vector *params, void *data);
};
void tm_site_model_set_ml_weights(TreeModel *mod, Vector *params,
double *counts);
void tm_setup_site_model(TreeModel *mod, const char *foreground, int bgc,
int alt_hypothesis, double selNeg, double selPlus,
double initBgc, double *initWeights);
List *tm_setup_bgc_model_hmm(TreeModel *mod, const char *foreground, double selNeg,
double selPos, double initBgc, double *initWeights);
#endif
|