1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
|
module SModel.Markov (module SModel.Markov, module SModel.Frequency, module Data.Matrix, getEqFreqs) where
import qualified Markov
import Markov (CTMC(..))
import SModel.Simple
import SModel.Rate
import SModel.Frequency
import Bio.Alphabet
import Data.Matrix
import Tree
import SModel.EigenExp
foreign import bpcall "SModel:get_equilibrium_rate" get_equilibrium_rate :: Alphabet -> EVector Int -> Matrix Double -> EVector Double -> Double
-- This takes the rate matrix q and adds:
-- * pi -> a cached version of the equilibrium frequencies
-- * t -> a scaling factor
-- * r -> a cached version of the rate at equilibrium
--
-- * the observation model
-- + a -> an alphabet
-- + smap -> mapping from markov states -> alphabet states
-- Now, all of these except (a,smap) are just caches for things that we can compute from q.
-- It is the (a,smap) parts that make this into a SUBSTITUTION model.
-- What the LIKELIHOOD wants is just:
-- * component probabilities
-- - number of components = component_probabilities.size()
-- * mapping from states -> observed letters
-- - number of states = smap(m).size()
-- * for each branch, the probability of transitioning from i->j in component m.
-- That is, transition_p(tree, branch, mixture) = matrix(i,j)
-- * for each component, the root frequencies -> root_frequencies(m)
-- I guess this is a "LikelihoodMixtureModel"
-- Ideally, it would also specify if it is reversible
-- that is, all transition_p matricies in component m satisfy detailed balance with respect to the root frequences in component m
--
-- Fields are: alphabet, smap, q, and cached rate.
data Markov = Markov Alphabet (EVector Int) Markov.Markov Double
-- This is used both for observations, and also to determine which states are the same for computing rates.
instance HasSMap Markov where
getSMap (Markov _ s _ _) = s
instance CTMC Markov where
qExp (Markov _ _ m _) = qExp m
getStartFreqs (Markov _ _ m _) = getStartFreqs m
getEqFreqs (Markov _ _ m _) = getEqFreqs m
getQ (Markov _ _ m _) = getQ m
simple_smap a = list_to_vector [0..(alphabetSize a)-1]
-- In theory we could take just (a,q) since we could compute smap from a (if states are simple) and pi from q.
markov a smap q pi = Markov a smap rm rate where
rm = Markov.markov q pi
rate = get_equilibrium_rate a smap (getQ rm) pi
-- In theory we could take just (a,q) since we could compute smap from a (if states are simple) and pi from q.
markov' a smap q = Markov a smap rm rate where
rm = Markov.markov' q
rate = get_equilibrium_rate a smap (getQ rm) (getEqFreqs rm)
instance HasAlphabet Markov where
getAlphabet (Markov a _ _ _) = a
instance HasBranchLengths t => SimpleSModel t Markov where
branch_transition_p (SModelOnTree tree smodel factor) b = [qExp $ scale (factor * branchLength tree b / r) smodel]
where r = rate smodel
distribution _ = [1.0]
nBaseModels _ = 1
stateLetters (SModelOnTree _ rm _) = getSMap rm
componentFrequencies (SModelOnTree _ smodel _) i = [getStartFreqs smodel]!!i
instance Scalable Markov where
scale x (Markov a s rm r) = Markov a s (scale x rm) (x*r)
instance RateModel Markov where
rate (Markov _ _ _ r) = r
-- A markov model needs a map from state -> letter in order to have a rate!
-- For codon models, we basically use smap = id for nucleotides (then divide by three)
-- and amino acids.
-- If we had a covarion model on codons, then we'd need to first collaps the state to
-- a codon, and then collapse the codons to either (i) amino acids or (ii) codons, and then divide by three.
instance Show Markov where
show q = show $ getQ q
nonEq a rates pi = markov a smap q pi
where smap = simple_smap a
n = length $ getLetters a
q = Markov.non_rev_from_list n rates
pairNames ls = [l1 ++ l2 | (l1,l2) <- ls]
allOrderedPairs l = [(x,y) | x <- l, y <- l, x /= y]
orderedLetterPairNames a = pairNames $ allOrderedPairs (getLetters a)
nonEq' a rates' pi' = nonEq a rs pi
where lPairs = allOrderedPairs (getLetters a)
rs = if length lPairs == length rates' then
[ Markov.getElement rates' (l1++l2) | (l1,l2) <- lPairs]
else
error $ "Expected "++show (length lPairs)++" rates but got "++ show (length rates')++"!"
pi = list_to_vector $ frequencies_from_dict a pi'
labelledEqFrequencies m = zip (getLetters a) frequencies
where frequencies = list_from_vector $ getEqFreqs m
a = getAlphabet m
labelledStartFrequencies m = zip (getLetters a) frequencies
where frequencies = list_from_vector $ getStartFreqs m
a = getAlphabet m
|