File: SModel.hs

package info (click to toggle)
bali-phy 3.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 10,608 kB
  • sloc: cpp: 67,094; xml: 4,074; perl: 3,715; haskell: 1,861; yacc: 1,067; python: 555; lex: 528; sh: 259; makefile: 20
file content (226 lines) | stat: -rw-r--r-- 10,818 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
module SModel (module SModel,
               module SModel.Doublets,
               module SModel.Codons,
               module SModel.ReversibleMarkov,
               module SModel.Likelihood) where 
import Distributions
import Alphabet
import Tree
import Parameters

import SModel.Nucleotides
import SModel.Doublets
import SModel.Codons
import SModel.ReversibleMarkov
import SModel.Likelihood

builtin builtin_average_frequency 1 "average_frequency" "SModel"
builtin builtin_empirical 2 "empirical" "SModel"
builtin pam 1 "pam" "SModel"
builtin jtt 1 "jtt" "SModel"
builtin wag 1 "wag" "SModel"
builtin builtin_wag_frequencies 1 "wag_frequencies" "SModel"
builtin builtin_lg_frequencies 1 "lg_frequencies" "SModel"
builtin lg 1 "lg" "SModel"
builtin builtin_weighted_frequency_matrix 2 "weighted_frequency_matrix" "SModel"
builtin builtin_frequency_matrix 1 "frequency_matrix" "SModel"
builtin mut_sel_q 2 "mut_sel_q" "SModel"
builtin mut_sel_pi 2 "mut_sel_pi" "SModel"

data F81 = F81 a b c d
data MixtureModel = MixtureModel [(Double,a)]
-- Currently we are weirdly duplicating the mixture probabilities for each component.
-- Probably the actual data-type is something like [(Double,\Int->a)] or [(Double,[a])] where all the [a] should have the same length.
-- This would be a branch-dependent mixture
data MixtureModels = MixtureModels [MixtureModel a]

scaleMM x (MixtureModel dist            ) = MixtureModel [(p, scale x m) | (p, m) <- dist]

mixMM fs ms = MixtureModel $ mix fs [m | MixtureModel m <- ms]
scale_MMs rs ms = [scaleMM r m | (r,m) <- zip' rs ms]
scaled_mixture ms rs fs = mixMM fs (scale_MMs rs ms)

multiParameter model_fn values = MixtureModel [ (f*p, m) |(p,x) <- values, let dist = case model_fn x of MixtureModel d -> d, (f,m) <- dist]
multiParameter_unit model_fn values = multiParameter (\x -> unit_mixture $ model_fn x) values

multi_rate m d = multiParameter (\x->scaleMM x m) d

average_frequency (MixtureModel ms) = list_from_vector $ builtin_average_frequency $ weighted_frequency_matrix $ MixtureModel ms

extend_mixture (MixtureModel ms) (p,x) = MixtureModel $ mix [p, 1.0-p] [certainly x, ms]

plus_inv mm p_inv = extend_mixture mm (p_inv, scale 0.0 $ f81 pi a)
    where a  = getAlphabet mm
          pi = average_frequency mm

multi_rate_unif_bins base dist n_bins = multi_rate base $ uniformDiscretize dist n_bins

rate (ReversibleMarkov a s q pi l t r) = r
rate (MixtureModel d) = average (fmap2 rate d)

branchTransitionP (MixtureModel l) t = let r = rate (MixtureModel l)
                                       in map (\x -> qExp (scale (t/r) (snd x))) l

-- In theory we could take just (a,q) since we could compute smap from a (if states are simple) and pi from q.
nBaseModels (MixtureModel l) = length l
nBaseModels (MixtureModels (m:ms)) = nBaseModels m

baseModel (MixtureModel l) i = snd (l !! i)

stateLetters (ReversibleMarkov _ smap _ _ _ _ _) = smap
stateLetters (F81 _ smap _ _ ) = smap
stateLetters (MixtureModel l) = stateLetters (baseModel (MixtureModel l) 0)
stateLetters (MixtureModels (m:ms)) = stateLetters m

nStates m = sizeOfVectorUnsigned (stateLetters m)
  
getAlphabet (ReversibleMarkov a _ _ _ _ _ _) = a
getAlphabet (F81 a _ _ _) = a
getAlphabet (MixtureModel l) = getAlphabet (baseModel (MixtureModel l) 0)
getAlphabet (MixtureModels (m:ms)) = getAlphabet m

frequencies (ReversibleMarkov _ _ _ pi _ _ _) = pi
frequencies (F81 _ _ _ pi) = pi

componentFrequencies (MixtureModel d)       i = frequencies (baseModel (MixtureModel d) i)
componentFrequencies (MixtureModels (m:ms)) i = componentFrequencies m i

distribution (MixtureModel l) = map fst l
distribution (MixtureModels (m:ms))                  = distribution m

getNthMixture (MixtureModels l) i = l !! i

unwrapMM (MixtureModel dd) = dd

mixMixtureModels l dd = MixtureModel (mix l (map unwrapMM dd))

weighted_frequency_matrix (MixtureModel d) = let model = MixtureModel d
                                                 dist = list_to_vector $ distribution model
                                                 freqs = list_to_vector $ map (componentFrequencies model) [0..nBaseModels model-1]
                                             in builtin_weighted_frequency_matrix dist freqs
weighted_frequency_matrix (MixtureModels (m:ms)) = weighted_frequency_matrix m

frequency_matrix (MixtureModel d) = let model = MixtureModel d
                                    in  builtin_frequency_matrix $ list_to_vector $ map (componentFrequencies model) [0..nBaseModels model-1]

frequency_matrix (MixtureModels (m:ms)) = frequency_matrix m

--
m1a_omega_dist f1 w1 = [(f1,w1), (1.0-f1,1.0)]

m2a_omega_dist f1 w1 posP posW = extendDiscreteDistribution (m1a_omega_dist f1 w1) posP posW

m2a_test_omega_dist f1 w1 posP posW 0 = m2a_omega_dist f1 w1 posP 1.0
m2a_test_omega_dist f1 w1 posP posW _ = m2a_omega_dist f1 w1 posP posW

m3_omega_dist ps omegas = zip' ps omegas

m3p_omega_dist ps omegas posP posW = extendDiscreteDistribution (m3_omega_dist ps omegas) posP posW

m3_test_omega_dist ps omegas posP posW 0 = m3p_omega_dist ps omegas posP 1.0
m3_test_omega_dist ps omegas posP posW _ = m3p_omega_dist ps omegas posP posW

-- The M7 is just a beta distribution
-- gamma' = var(x)/(mu*(1-mu)) = 1/(a+b+1) = 1/(n+1)
m7_omega_dist mu gamma n_bins = uniformDiscretize (beta a b) n_bins where cap = min (mu/(1.0+mu)) ((1.0-mu)/(2.0-mu))
                                                                          gamma' = gamma*cap
                                                                          n = (1.0/gamma')-1.0
                                                                          a = n*mu
                                                                          b = n*(1.0 - mu)

-- The M8 is a beta distribution, where a fraction posP of sites have omega posW
m8_omega_dist mu gamma n_bins posP posW = extendDiscreteDistribution (m7_omega_dist mu gamma n_bins) posP posW

m8a_omega_dist mu gamma n_bins posP = m8_omega_dist mu gamma n_bins posP 1.0

m8a_test_omega_dist mu gamma n_bins posP posW 0 = m8_omega_dist mu gamma n_bins posP 1.0
m8a_test_omega_dist mu gamma n_bins posP posW _ = m8_omega_dist mu gamma n_bins posP posW

--  w1 <- uniform 0.0 1.0
--  [f1, f2] <- dirichlet' 2 1.0
m1a model_func w1 f1 = multiParameter_unit model_func (m1a_omega_dist f1 w1)

m2a model_func w1 f1 posP posW = multiParameter_unit model_func (m2a_omega_dist f1 w1 posP posW)

m2a_test model_func w1 f1 posP posW posSelection = multiParameter_unit model_func (m2a_test_omega_dist f1 w1 posP posW posSelection)

m3 model_func ps omegas = multiParameter_unit model_func (m3_omega_dist ps omegas)

m3_test model_func ps omegas posP posW posSelection = multiParameter_unit model_func (m3_test_omega_dist ps omegas posP posW posSelection)

m7 model_func mu gamma n_bins =  multiParameter_unit model_func (m7_omega_dist mu gamma n_bins)

m8 model_func mu gamma n_bins posP posW = multiParameter_unit model_func (m8_omega_dist mu gamma n_bins posP posW)

m8a model_func mu gamma n_bins posP = multiParameter_unit model_func (m8a_omega_dist mu gamma n_bins posP)

m8a_test model_func mu gamma n_bins posP posW posSelection = multiParameter_unit model_func (m8a_test_omega_dist mu gamma n_bins posP posW posSelection)

-- OK, so if I change this from [Mixture Omega] to Mixture [Omega] or Mixture (\Int -> Omega), how do I apply the function model_func to all the omegas?
branch_site model_func fs ws posP posW = MixtureModels [bg_mixture,fg_mixture]
-- background omega distribution -- where the last omega is 1.0 (neutral)
    where bg_dist = zip fs (ws ++ [1.0])
-- accelerated omega distribution -- posW for all categories
          accel_dist = zip fs (repeat posW)
-- background branches always use the background omega distribution              
          bg_mixture = multiParameter_unit model_func (mix [1.0-posP, posP] [bg_dist, bg_dist])
-- foreground branches use the foreground omega distribution with probability posP
          fg_mixture = multiParameter_unit model_func (mix [1.0-posP, posP] [bg_dist, accel_dist])

branch_site_test model_func fs ws posP posW posSelection = branch_site model_func fs ws posP posW'
    where posW' = if (posSelection == 1) then posW else 1.0

mut_sel w' (ReversibleMarkov a smap q0 pi0 _ _ _) = reversible_markov a smap q pi where
    w = list_to_vector w'
    q = mut_sel_q q0 w
    pi = mut_sel_pi pi0 w

mut_sel' w' q0 = mut_sel w q0 where
    w = get_ordered_elements (alphabet_letters a) w' "fitnesses"
    a = getAlphabet q0

fMutSel codon_a codon_w omega nuc_model = nuc_model & x3 codon_a & dNdS omega & mut_sel codon_w

fMutSel' codon_a codon_ws' omega nuc_model = fMutSel codon_a codon_ws omega nuc_model
    where codon_ws = get_ordered_elements (alphabet_letters codon_a) codon_ws' "fitnesses"

-- \#1->let {w' = listAray' #1} in \#2 #3->fMutSel #0 codon_w #2 #3
-- The whole body of the function is let-floated up in round 2, and w' is eliminated.
fMutSel0 a w omega nuc_q  = fMutSel a codon_w omega nuc_q
    where codon_w = [w'!aa| codon <- codons,let aa = translate a codon]
          w' = listArray' w
          codons = take n_letters [0..]
          n_letters = alphabetSize a
--replicate n_letters (1.0/intToDouble n_letters)

fMutSel0' codon_a amino_ws' omega nuc_model = fMutSel0 codon_a amino_ws omega nuc_model
                                               where amino_ws = get_ordered_elements (alphabet_letters amino_a) amino_ws' "fitnesses"
                                                     amino_a = getAminoAcids codon_a

-- Issue: bad mixing on fMutSel model

gamma_rates_dist alpha = gamma alpha (1.0/alpha)

gamma_rates base alpha n = multi_rate_unif_bins base (gamma_rates_dist alpha) n

log_normal_rates_dist sigmaOverMu = log_normal lmu lsigma where x = log(1.0+sigmaOverMu^2)
                                                                lmu = -0.5*x
                                                                lsigma = sqrt x

log_normal_rates base sigmaOverMu n = multi_rate_unif_bins base (log_normal_rates_dist sigmaOverMu) n

--dp base rates fraction = multi_rate base dist where dist = zip fraction rates
free_rates base rates fraction = scaled_mixture (replicate (length fraction) base) rates fraction

branch_transition_p t smodel branch_cat_list ds b = list_to_vector $ branchTransitionP (getNthMixture smodel (branch_cat_list!!b)) (ds!b)

transition_p_index t smodel branch_cat_list ds = mkArray (numBranches t) (branch_transition_p t smodel branch_cat_list ds)

unit_mixture m = MixtureModel (certainly m)

mmm m = MixtureModels [m]

empirical a filename = builtin_empirical a (listToString filename)

wag_frequencies a = zip (alphabet_letters a) (list_from_vector $ builtin_wag_frequencies a)
lg_frequencies a = zip (alphabet_letters a) (list_from_vector $ builtin_lg_frequencies a)