1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
|
/* This is an example HY-PHY Batch File.
It reads a 4 taxa dataset "data/hiv.nuc", performs
an HKY85 ML analysis on the data using the tree from the file.
Having finished that, the code simulates 100 replicates of the data
using MLE of the parameters, conducts an HKY ML analysis on each of the
replicates and then tabulates the distribution of resulting ln-likelihoods.
Sergei L. Kosakovsky Pond and Spencer V. Muse
December 1999.
*/
/* 1. Read in the data and store the result in a DataSet variable.*/
DataSet nucleotideSequences = ReadDataFile ("data/hiv.nuc");
/* 2. Filter the data, specifying that all of the data is to be used
and that it is to be treated as nucleotides.*/
DataSetFilter filteredData = CreateFilter (nucleotideSequences,1);
/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will
store the vector of frequencies. */
HarvestFrequencies (observedFreqs, filteredData, 1, 1, 1);
/* 4. Define the KHY substitution matrix. '*' is defined to be -(sum of off-diag row elements) */
HKY85RateMatrix =
{{*,trvs,trst,trvs}
{trvs,*,trvs,trst}
{trst,trvs,*,trvs}
{trvs,trst,trvs,*}};
/*5. Define the HKY85 model, by combining the substitution matrix with the vector of observed (equilibrium)
frequencies. */
Model HKY85 = (HKY85RateMatrix, observedFreqs);
/*6. Now we can define the tree variable, using the tree string read from the data file,
and, by default, assigning the last defined model (HKY85) to all tree branches. */
Tree givenTree = DATAFILE_TREE;
/*7. Since all the likelihood function ingredients (data, tree, equilibrium frequencies)
have been defined we are ready to construct the likelihood function. */
LikelihoodFunction theLnLik = (filteredData, givenTree);
/*8. Maximize the likelihood function, storing parameter values in the matrix paramValues */
Optimize (paramValues, theLnLik);
/*9. Print the tree with optimal branch lengths to the console. */
fprintf (stdout, "\n----ORIGINAL DATA----\n",theLnLik,"\n\n----SIMULATIONS----\n\n");
/*10. Now we set up the simulation loop.
First, we create another copy of the tree which will
serve as the tree for simulated data sets */
Tree simulatedTree = DATAFILE_TREE;
/*12. By default, the random generator is reset every time the program is run.
The value of the seed is stored in RANDOM_SEED.
If you wish to use a particular seed, say the repeat a simulation,
set ASSIGNED_SEED to the value that you want to use.
*/
fprintf (stdout, "\nUsing the seed:\n", Format(RANDOM_SEED,10,0));
/*12. This is a formatting stepm which sets print width for all numbers to 10
and prints the table header */
PRINT_DIGITS = 10;
fprintf (stdout, "\n|----------|----------|\n| Simul. # | Ln lik |\n|----------|----------|");
for (simCounter = 1; simCounter<=10; simCounter = simCounter+1)
{
/*13. Simulate a data set of the same size as the original set using
the MLE of all the parameters */
DataSet simulatedData = SimulateDataSet (theLnLik);
/*14. Repeat the same steps as for the original data to obtain simulated
ln-likelihood */
DataSetFilter filteredSimData = CreateFilter (simulatedData,1);
/*15. Collect observed nucleotide frequencies from the filtered data. observedFreqs will
store the vector of frequencies. */
HarvestFrequencies (simFreqs, filteredSimData, 1, 1, 1);
LikelihoodFunction simLik = (filteredSimData, simulatedTree);
Optimize (simParamValues, simLik);
/* print out the log-likelihood of the simulation */
fprintf (stdout, "\n|", simCounter,"|", simParamValues[1][0],"|");
}
fprintf (stdout, "\n|----------|----------|");
|