File: Changepoints.hs

package info (click to toggle)
bali-phy 4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 15,392 kB
  • sloc: cpp: 120,442; xml: 13,966; haskell: 9,975; python: 2,936; yacc: 1,328; perl: 1,169; lex: 912; sh: 343; makefile: 26
file content (113 lines) | stat: -rw-r--r-- 4,504 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
module Probability.Distribution.Changepoints where
-- Multiple Changepoint Model

import Probability.Random
import Probability.Distribution.Uniform
import Probability.Distribution.Poisson
import Probability.Distribution.Transform (logLaplace)
import Probability.Distribution.List (iid2)

data Changepoints d = Changepoints Double (Double,Double) d

instance Dist d => Dist (Changepoints d) where
    type Result (Changepoints d) = [(Result d, Double, Double)]
    dist_name _ = "Changepoints"

instance IOSampleable d => IOSampleable (Changepoints d) where
    sampleIO (Changepoints lambda (start,end) dist) = do
      n <- sampleIO $ poisson lambda
      ps' <- sortOn snd <$> (sampleIO $ iid2 (n+1) dist (uniform start end))
      let ((y1,x1):ps) = ps'
          f x = start + (x-x1)*(end-start)/(end-x1)
          points = toChangePoints y1 start end $ [ (y, f x) | (y,x) <- ps]
      return points

{- We need to pick the point corresponding to the left/starting value in
   an exchangeable way, like we are picking from a set.  Currently, we
   pick the left-most point, and then need to rescale the remaining points.

   We could pick an integer index at random from the sorted list.  This
   should have the right distribution, since it has a 1/n chance of being
   the last added point, the second-to-the-last-added point, etc.  So
   everything should still be uniformly space. BUT, the integer index
   could go down to probability 0 if the size of the set decreases.
   - could we make (uniform_int 0 n) resample if n goes down, and keep the same
     value otherwise?

   We could pick a point at random from the set if we could operate on sets
   and look at the previous value.  If the input set changes, then we could
   keep the same choice if it remains in the set, and choose another entry
   uniformly at random otherwise.
 -}

instance Sampleable d => Sampleable (Changepoints d) where
    sample (Changepoints lambda (start,end) dist) = do
      n <- sample $ poisson lambda
      ps' <- sortOn snd <$> (sample $ iid2 (n+1) dist (uniform start end))
      let ((y1,x1):ps) = ps'
          f x = start + (x-x1)*(end-start)/(end-x1)
          points = toChangePoints y1 start end $ [ (y, f x) | (y,x) <- ps]
      return points
    
changepoints lambda (start,end) dist = Changepoints lambda (start,end) dist

toChangePoints value start end [] = [(value,start,end)]
toChangePoints value start end ((value2,x2):ps) = (value,start,x2):toChangePoints value2 x2 end ps

------------------------------------------------------------------------
{-
 ok, so pos * x1 + (1-pos) * x2 = 1, and x2/x1 = ratio.  x2 = ratio * x1
 And x1/x2 > 0.
 pos * x1 + (1-pos) * ratio * x1 = 1

x1 = 1 / (pos + (1-pos) * ratio)
s2 = ratio / (pos + (1-pos) * ratio)
-}    

data Changepoints2 d = Changepoints2 Double (Double,Double) d

instance (Dist d, Result d ~ Double) => Dist (Changepoints2 d) where
    type Result (Changepoints2 d) = [(Result d, Double, Double)]
    dist_name _ = "Changepoints2"

instance (Sampleable d, Result d ~ Double) => Sampleable (Changepoints2 d) where
    sample (Changepoints2 lambda (start,end) dist) = do
      x <- sample $ dist
      rtree <- getChangePoints lambda
      return $ flattenChangePoints x start (end - start) rtree

changepoints2 lambda (start,end) dist = Changepoints2 lambda (start,end) dist

type Position  = Double
type RatioType = Double
type Length    = Double

data RegionTree = Constant | Split2 Position RatioType RegionTree RegionTree

flattenChangePoints x left length Constant = [(x, left, left+length)]

flattenChangePoints x left length (Split2 pos ratio region1 region2) =
    flattenChangePoints x1 left length1 region1 ++ flattenChangePoints x2 (left+length1) length2 region2
        where 
          length1 = pos*length
          length2 = length - length1
          x1 = x / (pos + (1-pos) * ratio)
          x2 = ratio * x1


-- OK, so if there L nucleotides, and L-1 places to change,
-- we could check that Poisson(lambda * (L-1)) > 0
-- and we are not done, then we have now have M-1 and N-2 places to change

getChangePoints lambda = do
  count <- sample $ poisson $ lambda  -- Or should this be lambda * length - 1?
  if count == 0 then
      return $ Constant
  else do
    ratio <- sample $ logLaplace 0 1
    pos <- sample $ uniform 0 1
    let lambda1 = pos * lambda
        lambda2 = (1-pos) * lambda
    region1 <- getChangePoints lambda1
    region2 <- getChangePoints lambda2
    return $ Split2 pos ratio region1 region2