File: Matrix.hs

package info (click to toggle)
bali-phy 4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 15,392 kB
  • sloc: cpp: 120,442; xml: 13,966; haskell: 9,975; python: 2,936; yacc: 1,328; perl: 1,169; lex: 912; sh: 343; makefile: 26
file content (152 lines) | stat: -rw-r--r-- 7,546 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
module Bio.Alignment.Matrix where

import Bio.Sequence
import Bio.Alphabet
import Data.BitVector
import Data.Text (Text)
import Data.Maybe (fromMaybe)
import qualified Data.Text as Text

import Tree
import qualified Data.IntSet as IntSet
import qualified Data.IntMap as IntMap
import Data.IntMap (IntMap)
import Control.Monad (liftM2)
import Data.Maybe (catMaybes)
import Data.Foldable (toList)

data AlignmentMatrix

foreign import bpcall "Alignment:alignment_length" alignment_length :: AlignmentMatrix -> Int

foreign import bpcall "Alignment:load_alignment" builtin_load_alignment :: Alphabet -> CPPString -> IO AlignmentMatrix

load_alignment :: Alphabet -> String -> IO AlignmentMatrix
load_alignment alphabet filename = builtin_load_alignment alphabet (list_to_string filename)

foreign import bpcall "Alignment:alignment_from_sequences" builtin_alignment_from_sequences :: Alphabet -> EVector (EPair CPPString CPPString) -> AlignmentMatrix

alignment_from_sequences :: Alphabet -> [Sequence] -> AlignmentMatrix
alignment_from_sequences a seqs = builtin_alignment_from_sequences a (toVector $ fmap (\(n, s) -> c_pair (Text.toCppString n) (Text.toCppString s)) seqs)


foreign import bpcall "Alignment:sequence_names" builtin_sequence_names :: AlignmentMatrix -> EVector CPPString
sequence_names :: AlignmentMatrix -> [Text]
sequence_names a = map Text.fromCppString $ vectorToList $ builtin_sequence_names a

foreign import bpcall "Alignment:sequences_from_alignment" builtin_indices_from_alignment :: AlignmentMatrix -> EVector (EVector Int)
indices_from_alignment :: AlignmentMatrix -> [ EVector Int ]
indices_from_alignment a = vectorToList $ builtin_indices_from_alignment a

-- use isequences instead of "sequences" since we aren't using C++ Box<sequence> here.
isequences_from_alignment a = zip (sequence_names a) (indices_from_alignment a)


foreign import bpcall "Alignment:reorder_alignment" builtin_reorder_alignment :: EVector CPPString -> AlignmentMatrix -> AlignmentMatrix
reorder_alignment :: [String] -> AlignmentMatrix -> AlignmentMatrix
reorder_alignment names a = builtin_reorder_alignment names' a where names' = toVector $ map pack_cpp_string names

foreign import bpcall "Bits:alignment_row_to_presence_bitvector" builtin_alignment_row_to_bitvector :: AlignmentMatrix -> Int -> CBitVector
alignment_row_to_bitvector a row = BitVector $ builtin_alignment_row_to_bitvector a row

{-
  The idea behind getConnectedStates:

  1. First we compute the characters behind each branch with any characters at the source node.

    - We use Maybe BitVector, so that we don't assume that nodes with no data are known
      to be all deleted.  Instead, use represent such nodes with Nothing, indicating that we don't
      know what's behind them.

    - We don't treat leaf nodes differently.  Instead, we combine data at the source node of
      a branch with data on previous branches.  On a leaf branch, then typically there's
      data at the node, and there are 0 incoming branches.  On an internal node, then typically
      there is no data at the node, but data on incoming branches.

    - If we have data at an internal node, then we COULD have a situation
      where we have 1 --> 0 <-- 1.  In this case, what should we do?

      This presumes that the character is deleted at the node, but present on both
      children, which is not allowed.  So, we assume this can't happen and just OR the bits.
-}

charactersBehind :: IsTree t => IntMap (Maybe BitVector) -> t -> IntMap (Maybe BitVector)
charactersBehind sequence_masks tree = let
    branches = tree & getEdgesSet

    combine (Just x) (Just y) = Just (x .|. y)
    combine (Just x) Nothing  = Just x
    combine Nothing (Just y)  = Just y
    combine Nothing Nothing   = Nothing

    charBehind'' = branches & IntMap.fromSet charBehind
    charBehind' e = charBehind'' IntMap.! e
    charBehind e = let nodeMask = sequence_masks IntMap.! sourceNode tree e
                       edgeMasks = fmap charBehind' (edgesBeforeEdge tree e)
                   in foldr combine nodeMask edgeMasks
    in charBehind''


{-
  2. Then determine which characters are present at each node:

  - If there are NO incoming branches with data AND there's no data at the node, then
    there's no data on the whole tree, and we don't know how long the vector should be,
    so we error out.

  - If there's only one incoming branch with data behind it, then we just use the bitmask
    for that branch.  This is supposed to handle cases like a degree-1 root node with no data.

    This is actually quadratic in the number of incoming branches...
    An alternative algorithm would be to create an EVector Int and increment the integer at
     each location for each bitmap with a 1.  Then we set each bit to 1 if (i>=2) and 0 otherwise.
     But this loses the bitwise speedup if there are only three incoming edges, which is pretty common.

  - If there are two or more incoming branches with data behind them, then a character is
    presentat the node if its present behind at least two incoming branches.
-}


{- Issues:
   (a) This is quadritic in the node degree.
       We could fix this by COUNTING the number of bits at a node and recording a 1 if there are 2 or more.
       greaterThanBits 1 $ foldr1 addBits (zeroBits L) ms
   (b) Its not clear how to handle observed data at internal nodes.
       Suppose we have observed data at all nodes.  And suppose that we have "-" -> "N" -> "-".
       What should we do?
       Currently what we do is to treat an observed - as - even if there are Ns on both sides.
 -}
getConnectedStates :: IsTree t => IntMap (Maybe (EVector Int)) -> t -> IntMap BitVector
getConnectedStates sequences tree =
    let nodes = tree & getNodesSet

        sequence_masks = fmap (fmap bitmaskFromSequence') sequences

        node_masks = nodes & IntMap.fromSet mask_for_node

        charBehind = charactersBehind sequence_masks tree
        charBehind' e = charBehind IntMap.! e

        pairs l = [(x,y) | (x:ys) <- tails l, y <- ys]

        mask_for_node node = case sequence_masks IntMap.! node of
                               Just mask -> mask
                               Nothing -> let edgesin = edgesTowardNode tree node
                                              inputs = catMaybes $ toList $ fmap charBehind' edgesin
                                          in case inputs of [] -> error $ "get_connected_states: No sequences at or behind node " ++ show node
                                                            [m] -> m
                                                            ms -> foldr1 (.|.) [m1 .&. m2 | (m1,m2) <- pairs ms]
     in node_masks

minimally_connect_characters leaf_sequences tree all_sequences = nodes & IntMap.fromSet sequenceForNode
    where sequenceForNode n = maskSequence (node_masks IntMap.! n) (all_sequences IntMap.! n)
          node_masks = getConnectedStates leaf_sequences tree
          nodes = tree & getNodesSet

{- Here we create fake sequences at internal nodes that are entirely composed of Ns, with no gaps. -}
addAllMissingAncestors observedSequences tree = fromMaybe missingSequence <$> observedSequences
    where missingSequence = toVector $ replicate alignmentLength missingCharIndex
          alignmentLength = fromMaybe (error msg) $ allSame $ observedSequenceLengths
          msg = "minimally_connect_characters': not all observed sequences are the same length!"
          observedSequenceLengths = vector_size <$> (catMaybes $ IntMap.elems observedSequences)