1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
/* This is an example HY-PHY Batch File.
It reads a 13 taxa dataset "data/hiv.nuc", performs
an HKY85 with gamma rate heterogeneity ML analysis on the data using the tree from the file.
Having finished that, the code simulates a data set with rate heterogeneity and
demonstrates how to access simulated rate distribution at sites.
Sergei L. Kosakovsky Pond and Spencer V. Muse
May 2002.
*/
/* 1. Read in the data and store the result in a DataSet variable.*/
DataSet nucleotideSequences = ReadDataFile ("data/hiv.nuc");
/* 2. Filter the data, specifying that all of the data is to be used
and that it is to be treated as nucleotides.*/
DataSetFilter filteredData = CreateFilter (nucleotideSequences,1);
/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will
store the vector of frequencies. */
HarvestFrequencies (observedFreqs, filteredData, 1, 1, 1);
/* 4. Define the HKY+Gamma substitution matrix. '*' is defined to be -(sum of off-diag row elements).
*/
global alpha = .5;
alpha:>0.01;alpha:<100;
category c = (4, EQUAL, MEAN,
GammaDist(_x_,alpha,alpha),
CGammaDist(_x_,alpha,alpha),
0 ,
1e25,
CGammaDist(_x_,alpha+1,alpha)
);
global kappa = 1; /* transversion/transition ratio */
HKY85RateMatrix =
{{*,c*t*kappa,c*t,c*t*kappa}
{c*t*kappa,*,c*t*kappa,c*t}
{c*t,c*t*kappa,*,c*t*kappa}
{c*t*kappa,c*t,c*t*kappa,*}};
/*5. Define the HKY85 model, by combining the substitution matrix with the vector of observed (equilibrium)
frequencies. */
Model HKY85 = (HKY85RateMatrix, observedFreqs);
/*6. Now we can define the tree variable, using the tree string read from the data file,
and, by default, assigning the last defined model (HKY85) to all tree branches. */
Tree givenTree = DATAFILE_TREE;
/*7. Since all the likelihood function ingredients (data, tree, equilibrium frequencies)
have been defined we are ready to construct the likelihood function. */
LikelihoodFunction theLnLik = (filteredData, givenTree);
/*8. Maximize the likelihood function, storing parameter values in the matrix paramValues */
Optimize (paramValues, theLnLik);
/*9. Print the tree with optimal branch lengths to the console. */
fprintf (stdout, "\n----ORIGINAL DATA----\n",theLnLik);
/*10. Now we set up the simulation loop.
First, we create another copy of the tree which will
serve as the tree for simulated data sets */
Tree simulatedTree = DATAFILE_TREE;
/*12. By default, the random generator is reset every time the program is run.
The value of the seed is stored in RANDOM_SEED.
If you wish to use a particular seed, say the repeat a simulation,
call the function SetParameter (RANDOM_SEED,value,0).
*/
/*SetParameter (RANDOM_SEED,12345,0);*/
fprintf (stdout, "\nUsing the seed:\n", Format(RANDOM_SEED,10,0));
/*12. Simulating the dataset and storing category (c) variable
values in the row vector ratesAtSites, and the name of the category
variable in the column vector namesOfRateVars. This is useful
if several category variables are present so that we know which
row of ratesAtSites the values of a particular variable go.*/
DataSet simulatedData = SimulateDataSet (theLnLik,"",ratesAtSites,namesOfRateVars);
/*13. We are just going to print out the rates and report their mean
which should be close to 1 and the variance, which should be close to 1/alpha */
fprintf (stdout, "\nCategory variable name: ", namesOfRateVars[0],
"\n Site Rate\n");
sum = 0;
sum2 = 0;
for (k=0; k<Columns (ratesAtSites); k=k+1)
{
term = ratesAtSites[k];
fprintf (stdout, Format (k,10,0), " ", Format (term,10,5),"\n");
sum = sum+term;
sum2 = sum2+term*term;
}
fprintf (stdout, "\nMean = ", sum/Columns (ratesAtSites),
"\nVar = ",(sum2-sum^2/Columns (ratesAtSites))/(Columns(ratesAtSites)-1)," (1/alpha) = ", 1/alpha, "\n");
|