File: simulateWithDependence.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (142 lines) | stat: -rw-r--r-- 4,698 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include "simulateWithDependence.h"

/*
This code receives a tree file and simulates sequences accordingly using:
simulateTree st1(treeIn, *_sp, alph);
st1.generate_seq(num_pos_with_same_k);
which were written by another beloved group member.

Its feature is to simulate co-evolution between pairs of positions of binary data. Basic logic:
1. the basic concept is to use the regular independent model with 4 states to code a dependent model with 2 states.
   thus, all possible pairs of dada: 00, 01, 10, 11 are coded into A, C, G, T
2. dependency between possitions can be described as a tendency to have the same character (that is: 00 or 11). with this model we can accelerate
   the rate of evolution when an "unstable" state occures (rate increases when 01 (C) or 10 (G))

For more details, please see http://copap.tau.ac.il/benchmark.php and
Ofir Cohen, Haim Ashkenazy, Eli Levy Karin, David Burstein and Tal Pupko (2013)
CoPAP: Co-evolution of Presence-Absence Patterns.
Nucleic Acids Research 2013; doi: 10.1093/nar/gkt471

Eli Levy Karin, 2013

----------------- usage example: ------------------
#include "simulateWithDependence.h"

using namespace sim_with_dep;

int main(int argc, char** argv) {
	
	string treeFile = argv[1];

	double exit_code;

	exit_code = simulate_with_dependence (treeFile, 0.5, 14, 500, 500, 0, 1, 0.893195, 1, 4);
	
	return 0;
}
-------------- end usage example: ---------------

*/
namespace sim_with_dep 
{

	double simulate_with_dependence (string treeFile, double PI_1, double init_k, int total_positions, int num_pos_with_same_k, double k_increase, int is_gamma, double alpha, double beta, int num_cat)
	{

		//read Newick format tree
		tree treeIn(treeFile);
	
		//four states alphabet A C G T (will later be rplaced to 00,01,10,11)
		alphabet* alph = new nucleotide;

		sequenceContainer SC_all; //this will contain all positions
	
		//parameters:
		double PI_0 = 1-PI_1;
		double k = init_k; //will be increased with each iteration

		//parameters:
		int jump_size = total_positions / num_pos_with_same_k;

		for(int i=0; i<jump_size; i++)
		{
			Vdouble freqs; //stationary probabilities PI_00, PI_01, PI_10, PI_11
			double TOTAL = k*PI_1*PI_1 + 2*PI_0*PI_1 + k*PI_0*PI_0;
			freqs.push_back(k*PI_0*PI_0 / TOTAL); //PI_00 = k*PI_0*PI_0 / TOTAL
			freqs.push_back(PI_0*PI_1 / TOTAL); //PI_01 = PI_0*PI_1 / TOTAL
			freqs.push_back(PI_0*PI_1 / TOTAL); //PI_10 = PI_0*PI_1 / TOTAL
			freqs.push_back(k*PI_1*PI_1 / TOTAL); //PI_11 = k*PI_1*PI_1 / TOTAL
	
			//Q matrix (partial values - the rest are calculated by gtrModel using freqs and these values)
			MDOUBLE a2c = PI_1; // --> c2a = freqs[a]*a2c/freqs[c] --> c2a = ((k*PI_0*PI_0 / TOTAL)*PI_1)/(PI_0*PI_1 / TOTAL) = k*PI_0
			MDOUBLE a2g = PI_1;
			MDOUBLE a2t = 0;
			MDOUBLE c2g = 0;
			MDOUBLE c2t = k*PI_1;
			MDOUBLE g2t = k*PI_1;

			//starting the evolutionary model
			distribution *currDist = NULL;
			if(is_gamma == 1)
			{
				currDist = new generalGammaDistribution(alpha,beta,num_cat); // ---> in the future we might want to turn these into param
			}
			else
			{
				currDist = new uniDistribution; // no among site rate variation
			}
		
			replacementModel *probMod = NULL;
			pijAccelerator *pijAcc = NULL;

			probMod = new gtrModel(freqs,a2c,a2g,a2t,c2g,c2t,g2t);
			pijAcc = new trivialAccelerator(probMod);
			stochasticProcess* _sp = new stochasticProcess(currDist, pijAcc);

			//simulate:
			simulateTree st1(treeIn, *_sp, alph);
			st1.generate_seq(num_pos_with_same_k); //simulate num_pos_with_same_k positions with the current k

			if(i == 0)
			{
				SC_all = st1.toSeqDataWithoutInternalNodes(); //first time
			}
			else
			{
				sequenceContainer SC = st1.toSeqDataWithoutInternalNodes(); //concatenate new positions to the ones you have
				SC_all.concatenate(SC);
			}
		
			delete currDist;
			delete probMod;
			delete pijAcc;
			delete _sp;

			k = k + k_increase; //k = 1 , 1.05 , 1.1 , ... , 5.5
		}

		//prepare out file name:
		std::stringstream sstm;
		if(is_gamma == 1)
		{
			sstm << treeFile << ".gammaRateNoInv.PI_1=" << PI_1 << ".init_k=" << init_k << ".k_group_size=" << num_pos_with_same_k << ".k_increase=" << k_increase << ".fas";
		}
		else
		{
			sstm << treeFile << ".NoRate.PI_1=" << PI_1 << ".init_k=" << init_k << ".k_group_size=" << num_pos_with_same_k << ".k_increase=" << k_increase << ".fas";
		}
		std::string seqOutputFile = sstm.str();

		//write out:
		ofstream seq_sim(seqOutputFile.c_str());
		fastaFormat::write(seq_sim,SC_all);
		seq_sim.close();
	
	
		delete alph;
		
		return 0;
	
	}

};