File: ParametricBootstrap.bf

package info (click to toggle)
hyphy 2.5.69%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,728 kB
  • sloc: cpp: 81,964; xml: 467; lisp: 341; python: 166; javascript: 117; sh: 106; makefile: 87; ansic: 86
file content (206 lines) | stat: -rw-r--r-- 3,819 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* This is an example HY-PHY Batch File.



   It reads a 4 taxa dataset "data/hiv.nuc", performs

   an HKY85 ML analysis on the data using the tree from the file.

   Having finished that, the code simulates 100 replicates of the data

   using MLE of the parameters, conducts an HKY ML analysis on each of the 

   replicates and then tabulates the distribution of resulting ln-likelihoods.

   

   

   Sergei L. Kosakovsky Pond and Spencer V. Muse 

   December 1999. 

*/



/* 1. Read in the data and store the result in a DataSet variable.*/



DataSet 		nucleotideSequences = ReadDataFile ("data/hiv.nuc");

   

/* 2. Filter the data, specifying that all of the data is to be used

	  and that it is to be treated as nucleotides.*/

	  

DataSetFilter	filteredData = CreateFilter (nucleotideSequences,1);



/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will

	  store the vector of frequencies. */



HarvestFrequencies (observedFreqs, filteredData, 1, 1, 1);



/* 4. Define the KHY substitution matrix. '*' is defined to be -(sum of off-diag row elements) */



HKY85RateMatrix = 

		{{*,trvs,trst,trvs}

		 {trvs,*,trvs,trst}

		 {trst,trvs,*,trvs}

		 {trvs,trst,trvs,*}};

		 

/*5.  Define the HKY85 model, by combining the substitution matrix with the vector of observed (equilibrium)

	  frequencies. */

	  

Model HKY85	 = (HKY85RateMatrix, observedFreqs);



/*6.  Now we can define the tree variable, using the tree string read from the data file,

	  and, by default, assigning the last defined model (HKY85) to all tree branches. */

	  

Tree	givenTree = DATAFILE_TREE;



/*7.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies)

	  have been defined we are ready to construct the likelihood function. */

	  

LikelihoodFunction  theLnLik = (filteredData, givenTree);



/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues */



Optimize (paramValues, theLnLik);



/*9.  Print the tree with optimal branch lengths to the console. */



fprintf  (stdout, "\n----ORIGINAL DATA----\n",theLnLik,"\n\n----SIMULATIONS----\n\n");

		 

/*10. Now we set up the simulation loop.

	  First, we create another copy of the tree which will

	  serve as the tree for simulated data sets */

	  

Tree	simulatedTree = DATAFILE_TREE;





/*12. By default, the random generator is reset every time the program is run.

	  The value of the seed is stored in RANDOM_SEED.

	  If you wish to use a particular seed, say the repeat a simulation,

	  set ASSIGNED_SEED to the value that you want to use.

*/


fprintf (stdout, "\nUsing the seed:\n", Format(RANDOM_SEED,10,0));


/*12. This is a formatting stepm which sets print width for all numbers to 10

	  and prints the table header */



PRINT_DIGITS = 10;



fprintf (stdout, "\n|----------|----------|\n| Simul. # |  Ln lik  |\n|----------|----------|");



for (simCounter = 1; simCounter<=10; simCounter = simCounter+1)

{

	/*13. Simulate a data set of the same size as the original set using

		  the MLE of all the parameters */

	DataSet		simulatedData = SimulateDataSet (theLnLik);

	/*14. Repeat the same steps as for the original data to obtain simulated 

		  ln-likelihood */

		  

	DataSetFilter	filteredSimData = CreateFilter (simulatedData,1);



	/*15. Collect observed nucleotide frequencies from the filtered data. observedFreqs will

		  store the vector of frequencies. */



	HarvestFrequencies (simFreqs, filteredSimData, 1, 1, 1);

	LikelihoodFunction simLik = (filteredSimData, simulatedTree);

	Optimize (simParamValues, simLik);

	/* print out the log-likelihood of the simulation */

	fprintf (stdout, "\n|", simCounter,"|", simParamValues[1][0],"|");



}



fprintf (stdout, "\n|----------|----------|");