File: Markov.hs

package info (click to toggle)
bali-phy 4.0~beta16%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,192 kB
  • sloc: cpp: 119,288; xml: 13,482; haskell: 9,722; python: 2,930; yacc: 1,329; perl: 1,169; lex: 904; sh: 343; makefile: 26
file content (117 lines) | stat: -rw-r--r-- 4,754 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
module SModel.Markov (module SModel.Markov, module SModel.Frequency, module Data.Matrix, getEqFreqs) where

import qualified Markov
import           Markov (CTMC(..))
import           SModel.Simple
import           SModel.Rate
import           SModel.Frequency
import           Bio.Alphabet
import           Data.Matrix
import           Tree
import           SModel.EigenExp

foreign import bpcall "SModel:get_equilibrium_rate" get_equilibrium_rate :: Alphabet -> EVector Int -> Matrix Double -> EVector Double -> Double

-- This takes the rate matrix q and adds:
-- * pi -> a cached version of the equilibrium frequencies
-- * t -> a scaling factor
-- * r -> a cached version of the rate at equilibrium
--
-- * the observation model
--   + a -> an alphabet
--   + smap -> mapping from markov states -> alphabet states

-- Now, all of these except (a,smap) are just caches for things that we can compute from q.
-- It is the (a,smap) parts that make this into a SUBSTITUTION model.

-- What the LIKELIHOOD wants is just:
-- * component probabilities
--   - number of components = component_probabilities.size()
-- * mapping from states -> observed letters
--   - number of states = smap(m).size()
-- * for each branch, the probability of transitioning from i->j in component m.
--   That is, transition_p(tree, branch, mixture) = matrix(i,j)
-- * for each component, the root frequencies -> root_frequencies(m)
-- I guess this is a "LikelihoodMixtureModel"
-- Ideally, it would also specify if it is reversible
--   that is, all transition_p matricies in component m satisfy detailed balance with respect to the root frequences in component m
--

-- Fields are: alphabet, smap, q, and cached rate.

data Markov = Markov Alphabet (EVector Int) Markov.Markov Double

-- This is used both for observations, and also to determine which states are the same for computing rates.
instance HasSMap Markov where
    getSMap (Markov _ s _ _) = s

instance CTMC Markov where
    qExp (Markov _ _ m _) = qExp m
    getStartFreqs (Markov _ _ m _) = getStartFreqs m
    getEqFreqs (Markov _ _ m _) = getEqFreqs m
    getQ (Markov _ _ m  _) = getQ m

simple_smap a = list_to_vector [0..(alphabetSize a)-1]

-- In theory we could take just (a,q) since we could compute smap from a (if states are simple) and pi from q.
markov a smap q pi = Markov a smap rm rate where
    rm = Markov.markov q pi
    rate = get_equilibrium_rate a smap (getQ rm) pi

-- In theory we could take just (a,q) since we could compute smap from a (if states are simple) and pi from q.
markov' a smap q = Markov a smap rm rate where
    rm = Markov.markov' q
    rate = get_equilibrium_rate a smap (getQ rm) (getEqFreqs rm)

instance HasAlphabet Markov where
    getAlphabet (Markov a _ _ _) = a

instance HasBranchLengths t => SimpleSModel t Markov where
    branch_transition_p (SModelOnTree tree smodel factor) b = [qExp $ scale (factor * branchLength tree b / r) smodel]
        where r = rate smodel
    distribution _ = [1.0]
    nBaseModels _ = 1
    stateLetters (SModelOnTree _ rm _) = getSMap rm
    componentFrequencies (SModelOnTree _ smodel _) i = [getStartFreqs smodel]!!i

instance Scalable Markov where
    scale x (Markov a s rm r) = Markov a s (scale x rm) (x*r)

instance RateModel Markov where
    rate (Markov _ _ _ r) = r

-- A markov model needs a map from state -> letter in order to have a rate!
-- For codon models, we basically use smap = id for nucleotides (then divide by three)
--   and amino acids.
-- If we had a covarion model on codons, then we'd need to first collaps the state to
-- a codon, and then collapse the codons to either (i) amino acids or (ii) codons, and then divide by three.

instance Show Markov where
    show q = show $ getQ q

nonEq a rates pi = markov a smap q pi
    where smap = simple_smap a
          n = length $ getLetters a
          q = Markov.non_rev_from_list n rates

pairNames ls = [l1 ++ l2 | (l1,l2) <- ls]

allOrderedPairs l = [(x,y) | x <- l, y <- l, x /= y]

orderedLetterPairNames a = pairNames $ allOrderedPairs (getLetters a)

nonEq' a rates' pi' = nonEq a rs pi
    where lPairs = allOrderedPairs (getLetters a)
          rs = if length lPairs == length rates' then
                   [ Markov.getElement rates' (l1++l2) | (l1,l2) <- lPairs]
               else
                   error $ "Expected "++show (length lPairs)++" rates but got "++ show (length rates')++"!"
          pi = list_to_vector $ frequencies_from_dict a pi'

labelledEqFrequencies m = zip (getLetters a) frequencies
    where frequencies = list_from_vector $ getEqFreqs m
          a = getAlphabet m

labelledStartFrequencies m = zip (getLetters a) frequencies
    where frequencies = list_from_vector $ getStartFreqs m
          a = getAlphabet m