File: ExonIntron.bf

package info (click to toggle)
hyphy 2.5.69%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,728 kB
  • sloc: cpp: 81,964; xml: 467; lisp: 341; python: 166; javascript: 117; sh: 106; makefile: 87; ansic: 86
file content (144 lines) | stat: -rw-r--r-- 3,527 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* This is an example HY-PHY Batch File.



   It reads in a '#' nucleotide dataset data/ubiq.flt and estimates

   maximum ln-likelihood based on a (hypothetic) partition of the

   data into 2 exons and 1 intron. MG94 codon model is applied to

   coding regions, whereas HKY85 is used for non-coding regions.

   

   Output is printed out as a Newick Style tree with branch lengths

   representing the number of expected substitutions per branch (which

   is the default setting for nucleotide models w/o rate variation).

   

   

   Sergei L. Kosakovsky Pond and Spencer V. Muse 

   December 1999. 

*/



/* 1. Read in the data and store the result in a DataSet variable.*/



DataSet 		sequences = ReadDataFile ("data/ubiq.flt");

   

/* 2. Filter the data. We create two coding partitions, 1-99 and 199-end, treated as codons in std genetic code; and one non-coding partition, 100-198.

*/

	  

DataSetFilter	exon1 = CreateFilter (sequences,3,"0-98", "","TAA,TAG,TGA");

DataSetFilter	intron = CreateFilter(sequences,1,"99-197");

DataSetFilter	exon2 = CreateFilter (sequences,3,siteIndex>197,"","TAA,TAG,TGA");



/* 5. Collect observed nucleotide frequencies from the filtered data. observedFreq will store the vector of frequencies. Since MG94 used nucleotide frequencies, we collect those. */



HarvestFrequencies (observedFreq1, exon1, 1, 1, 1);

HarvestFrequencies (observedFreq2, exon2, 1, 1, 1);

HarvestFrequencies (intronFreqs, intron, 1, 1, 1);



/* 3. Include the model definition file. The file defines the model 2 MG94 models (one for each exon) and the function BuildCodonFrequencies which estimates equilibrium codon frequencies given nucleotide frequencies. The reason each exon needs its own version of MG94 is that observed nucleotide frequencies are intergrated into the rate matrix in a fairly irregular way, so different obs. freqs. for each exon give rise two 2 rate matrices. The way the models defined the _require_ the nucl. freqs to be stored in variables observedFreq1 and observedFreq2. */



#include "Models/MG94x2.mdl";



/*7.  Build codon frequencies for each exon*/



codonFreqs1 = BuildCodonFrequencies (observedFreq1);

codonFreqs2 = BuildCodonFrequencies (observedFreq2);



/*8.	Define the models for each exon. */

Model MG94model1 = (MG94_1,codonFreqs1,0);

Model MG94model2 = (MG94_2,codonFreqs2,0);



/* 4  define the HKY85 model */



HKY85RateMatrix = 

		{{*,trvs,trst,trvs}

		 {trvs,*,trvs,trst}

		 {trst,trvs,*,trvs}

		 {trvs,trst,trvs,*}};

		 

Model HKY85	 = (HKY85RateMatrix, intronFreqs);





/*6. 	We will need a separate tree for each partition. */



UseModel (MG94model1);	  
Tree	exon1Tree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));
UseModel (MG94model2);	  
Tree	exon2Tree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));
UseModel (HKY85);	  
Tree	intronTree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));



/*9.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies) have been defined we are ready to construct the likelihood function. */

	  

LikelihoodFunction  theLnLik = (exon1, exon1Tree,
				        		exon2, exon2Tree,	                         
				        		intron,intronTree);

/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues.
	  This will take a few minutes! */

Optimize (paramValues, theLnLik);

/*9.  Print the tree with optimal branch lengths to the console. */

fprintf  (stdout, theLnLik);