File: PBootstrapHetRates.bf

package info (click to toggle)
hyphy 2.5.69%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,728 kB
  • sloc: cpp: 81,964; xml: 467; lisp: 341; python: 166; javascript: 117; sh: 106; makefile: 87; ansic: 86
file content (169 lines) | stat: -rw-r--r-- 4,028 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/* This is an example HY-PHY Batch File.

   It reads a 13 taxa dataset "data/hiv.nuc", performs
   an HKY85 with gamma rate heterogeneity ML analysis on the data using the tree from the file.
   Having finished that, the code simulates a data set with rate heterogeneity and
   demonstrates how to access simulated rate distribution at sites.

   Sergei L. Kosakovsky Pond and Spencer V. Muse 

   May 2002. 

*/



/* 1. Read in the data and store the result in a DataSet variable.*/

DataSet 		nucleotideSequences = ReadDataFile ("data/hiv.nuc");

   

/* 2. Filter the data, specifying that all of the data is to be used
	  and that it is to be treated as nucleotides.*/

DataSetFilter	filteredData = CreateFilter (nucleotideSequences,1);



/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will
	  store the vector of frequencies. */

HarvestFrequencies (observedFreqs, filteredData, 1, 1, 1);



/* 4. Define the HKY+Gamma substitution matrix. '*' is defined to be -(sum of off-diag row elements).
*/


global alpha = .5;
alpha:>0.01;alpha:<100;
category c = (4, EQUAL, MEAN, 
				GammaDist(_x_,alpha,alpha), 
				CGammaDist(_x_,alpha,alpha), 
				0 , 
		  	    1e25,
		  	    CGammaDist(_x_,alpha+1,alpha)
		  	 );
		  	 
global		kappa = 1; /* transversion/transition ratio */

HKY85RateMatrix = 

		{{*,c*t*kappa,c*t,c*t*kappa}

		 {c*t*kappa,*,c*t*kappa,c*t}

		 {c*t,c*t*kappa,*,c*t*kappa}

		 {c*t*kappa,c*t,c*t*kappa,*}};

		 

/*5.  Define the HKY85 model, by combining the substitution matrix with the vector of observed (equilibrium)

	  frequencies. */

Model HKY85	 = (HKY85RateMatrix, observedFreqs);



/*6.  Now we can define the tree variable, using the tree string read from the data file,

	  and, by default, assigning the last defined model (HKY85) to all tree branches. */

	  

Tree	givenTree = DATAFILE_TREE;



/*7.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies)

	  have been defined we are ready to construct the likelihood function. */

	  

LikelihoodFunction  theLnLik = (filteredData, givenTree);



/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues */



Optimize (paramValues, theLnLik);



/*9.  Print the tree with optimal branch lengths to the console. */



fprintf  (stdout, "\n----ORIGINAL DATA----\n",theLnLik);

		 

/*10. Now we set up the simulation loop.

	  First, we create another copy of the tree which will

	  serve as the tree for simulated data sets */

	  

Tree	simulatedTree = DATAFILE_TREE;





/*12. By default, the random generator is reset every time the program is run.

	  The value of the seed is stored in RANDOM_SEED.

	  If you wish to use a particular seed, say the repeat a simulation,

	  call the function SetParameter (RANDOM_SEED,value,0).

*/


/*SetParameter (RANDOM_SEED,12345,0);*/

fprintf (stdout, "\nUsing the seed:\n", Format(RANDOM_SEED,10,0));


/*12. Simulating the dataset and storing category (c) variable
	  values in the row vector ratesAtSites, and the name of the category
	  variable in the column vector namesOfRateVars. This is useful
	  if several category variables are present so that we know which 
	  row of ratesAtSites the values of a particular variable go.*/


DataSet		simulatedData = SimulateDataSet (theLnLik,"",ratesAtSites,namesOfRateVars);

/*13. We are just going to print out the rates and report their mean
which should be close to 1 and the variance, which should be close to 1/alpha */

fprintf (stdout, "\nCategory variable name: ", namesOfRateVars[0],
                 "\n      Site       Rate\n");
                 
sum  = 0;
sum2 = 0;

for (k=0; k<Columns (ratesAtSites); k=k+1)
{
	term = ratesAtSites[k];
	fprintf (stdout, Format (k,10,0), " ", Format (term,10,5),"\n");
	sum = sum+term;
	sum2 = sum2+term*term;
}	

fprintf (stdout, "\nMean = ", sum/Columns (ratesAtSites),
				 "\nVar  = ",(sum2-sum^2/Columns (ratesAtSites))/(Columns(ratesAtSites)-1)," (1/alpha) = ", 1/alpha, "\n");