File: tree_likelihoods.h

package info (click to toggle)
phast 1.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 12,412 kB
  • sloc: ansic: 54,180; makefile: 354; sh: 337; perl: 321
file content (186 lines) | stat: -rw-r--r-- 8,579 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/***************************************************************************
 * PHAST: PHylogenetic Analysis with Space/Time models
 * Copyright (c) 2002-2005 University of California, 2006-2010 Cornell 
 * University.  All rights reserved.
 *
 * This source code is distributed under a BSD-style license.  See the
 * file LICENSE.txt for details.
 ***************************************************************************/

/** @file tree_likelihoods.h
    Computation of likelihoods for columns of a given multiple
    alignment, according to a given tree model.
    @ingroup phylo
 */

#ifndef TREE_LIK_H
#define TREE_LIK_H

#include <tree_model.h>
#include <msa.h>
#include <math.h>
#include <misc.h>

/** Container for quantities related to the posterior probability of a
    tree model with respect to an alignment.

    Each array is indexed, as appropriate, by:
      - rate category;
      - base in the alignment (indexed according to a model's
        inv_states; with higher order models, these are actually
        tuples of bases);
      - node or edge in the tree (indexed by node id; a node is
        associated with the edge that connects it to its parent);
      - column tuple in a "sufficient statistics" representation of
        the alignment (all quantities will be the same for all
        instances of a column tuple). */
struct tp_struct {
  /** Posterior probability of each base given a node, a column tuple,
      and a rate category.
        - First index is rate category
        - Second is base
        - Third is node
        - Fourth is column tuple */
  double ****base_probs;

  /** Posterior probability of a substitution of each base for each
      other, given a branch, column tuple, and rate category.
        - First index is rate category
        - Second is original base
        - Third is replacement base
        - Fourth is branch
        - Fifth is column tuple */
  double *****subst_probs;

  /** Expected number of substitutions for each branch x column tuple,
      given a rate category (conditioned on rate category in case
      posterior probabilities of rate categories depend on an HMM or
      similar).
        - First index is rate category
        - Second is branch
        - Third is column tuple */
  double ***expected_nsubst;

  /** Total expected number of substitutions of each type along each
      branch for each rate category, summed over all column tuples
      (considering the number of instances of each tuple).  These are
      the sufficient statistics for computing the likelihood of a tree
      model.  Note that they are based on *joint* probabilities with
      rate categories, rather than being conditioned on rate
      categories (the posterior probability of each rate at each site
      is incorporated).
        - First index is rate category
        - Second is original base
        - Third is replacement base
        - Fourth is branch */
  double ****expected_nsubst_tot;

  /** Expected number of substitutions of each type along each branch
      for each rate category, for each column tuple.
        - First index is rate category
        - Second is branch
        - Third is tuple
        - Fourth is original base
        - Fifth is replacement base */
  double *****expected_nsubst_col;

  /** Posterior probability of each rate category for each column
      tuple.
        - First index is rate category
        - Second is column tuple */
  double **rcat_probs;

  /** Expected number of sites in each rate category. */
  double *rcat_expected_nsites;
};

typedef struct tp_struct TreePosteriors;
                                /* the matching incomplete type is
                                   declared in tree_model.h (per the
                                   original note; that header is not
                                   shown here) */

#define NULL_LOG_LIKELIHOOD 1   /**< Sentinel for "no value" when dealing
                                   with log likelihoods; chosen positive
                                   because real log likelihoods should
                                   always be <= 0.  FIXME? */

/* NOTE: declared here, but does not appear to be implemented anywhere;
   a call would presumably fail at link time -- verify before use. */
void tl_dump_matrices(TreeModel *mod, double **inside_vals, 
                      double **outside_vals, double **posterior_probs);

/** Compute the log likelihood of a tree model with respect to an
   alignment, optionally retaining column-by-column likelihoods and/or
   posterior probabilities.
   @param[in] mod Tree Model to compute likelihood for
   @param[in] msa Multiple Alignment containing data related to tree model
   @param[out] col_scores (Optional) Log likelihood score per column
   @param[out] tuple_scores (Optional) Log likelihood score per tuple
   @param[in] cat Whether to use categories
   @param[out] post (Optional) Computed posterior probabilities; if NULL, no
   posterior probabilities (or related quantities) will be computed.
   If non-NULL, each of its attributes must either be NULL or
   previously allocated to the required size.
   @result Log likelihood of entire tree model specified
*/
double tl_compute_log_likelihood(TreeModel *mod, MSA *msa, 
                                 double *col_scores, 
				 double *tuple_scores, 
				 int cat,
                                 TreePosteriors *post);

/** Create a new TreePosteriors object.
    @param mod Tree Model for which the posterior probabilities are calculated
    @param msa Multiple Alignment
    @param do_bases Whether to allocate space for base probabilities
    @param do_substs Whether to allocate space for substitution probabilities
    @param do_expected_nsubst Whether to allocate space for expected number of substitutions matrix
    @param do_expected_nsubst_tot Whether to allocate space for total expected number of substitutions
    @param do_expected_nsubst_col Whether to allocate space for expected number of substitutions per column
    @param do_rate_cats Whether to allocate space for rate categories
    @param do_rate_cats_exp Whether to allocate space for expected rate categories
    @result Newly allocated TreePosteriors object
*/
TreePosteriors *tl_new_tree_posteriors(TreeModel *mod, MSA *msa, int do_bases, 
                                       int do_substs, int do_expected_nsubst, 
                                       int do_expected_nsubst_tot,
				       int do_expected_nsubst_col,
                                       int do_rate_cats, int do_rate_cats_exp);

/** Free a TreePosteriors object.
   @param mod Tree Model for which the posteriors were calculated
   @param msa Multiple Alignment
   @param tp TreePosteriors object to free
 */
void tl_free_tree_posteriors(TreeModel *mod, MSA *msa, TreePosteriors *tp);

/** Compute the expected (posterior) complete log likelihood of a tree
   model based on a TreePosteriors object.
   @param[in] mod Tree Model
   @param[in] post Pre-calculated posterior probabilities
   @note Equilibrium frequencies are not considered (presumably the
   reason for "partial" in the name -- confirm against the implementation)
   @result Log likelihood of tree
*/
double tl_compute_partial_ll_suff_stats(TreeModel *mod, TreePosteriors *post);

/* NOTE: no implementation could be found for this declaration; a call
   would presumably fail at link time -- verify before use. */
double tl_compute_ll_suff_stats(TreeModel *mod, MSA *msa, TreePosteriors *post);

/** Given an alphabet, a tuple size, and a vector of equilibrium
   frequencies, create a new vector of marginal equilibrium
   frequencies describing the space of "meta-tuples", which contain
   actual characters *or* missing data characters.
   Each meta-tuple is given an equilibrium frequency equal to the sum
   of the frequencies of all "matching" ordinary tuples.
   Missing data characters are assumed to be gap characters or Ns.
   @param alphabet List of possible characters
   @param tuple_size Size of tuples
   @param eq_freqs Equilibrium frequencies
   @result New vector of marginal equilibrium frequencies
*/
Vector *get_marginal_eq_freqs (char *alphabet, int tuple_size, 
                                   Vector *eq_freqs);


#endif