1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
|
module Bio.Alignment (module Bio.Alignment,
module Bio.Sequence,
module Bio.Alignment.Matrix,
module Bio.Alignment.Pairwise,
module Bio.Alignment.Class) where
import Tree
import Data.BitVector
import Data.Foldable
import Data.Array
import Data.Maybe (catMaybes, fromMaybe)
import Parameters
import Foreign.Vector
import Foreign.Pair
import Bio.Sequence
import Bio.Alphabet -- for Alphabet type
import Bio.Alignment.Matrix
import Bio.Alignment.Pairwise
import Bio.Alignment.Class
import Data.IntSet (IntSet)
import qualified Data.IntSet as IntSet
import Data.IntMap (IntMap)
import qualified Data.IntMap as IntMap
import qualified Data.Text as Text
import Data.Text (Text)
import Foreign.IntMap (EIntMap)
import qualified Foreign.IntMap as FIM
import qualified Data.Map as Map
import Data.Map (Map)
data VectorPairIntInt -- ancestral sequences with (int letter, int category) for each site.
foreign import bpcall "Alignment:leaf_sequence_counts" builtin_leaf_sequence_counts :: AlignmentMatrix -> Int -> EVector Int -> EVector (EVector Int)
branch_hmms (model,_) tree scale = getUEdgesSet tree & IntMap.fromSet (model $ fmap (scale *) $ branchLengthsSet tree)
seqlength as tree node = pairwise_alignment_length1 (as IntMap.! b) where
b = head $ edgesOutOfNode tree node
{-
pairwise_alignments_from_matrix a tree = [ pairwise_alignment_from_bits bits1 bits2 | b <- [0..2*numBranches tree-1],
let bits1 = bits ! sourceNode tree b,
let bits2 = bits ! targetNode tree b]
where bits = minimally_connect_characters a tree
-}
{- NOTE: How should we handle forests?
Currently we store the node sequence lengths directory only for nodes with observed data.
We could alternatively store the node sequence lengths only for nodes not in a pairwise alignment.
-}
-- We can't just do forall t.AlignmentOnTree t, because then any constraints on t will be on existential variables, resulting in ambiguity.
data AlignmentOnTree t = AlignmentOnTree t Int (IntMap Int) (IntMap PairwiseAlignment)
pairwise_alignments (AlignmentOnTree _ _ _ as) = as
instance IsGraph t => Alignment (AlignmentOnTree t) where
alignmentLength a@(AlignmentOnTree t _ _ as) = (sequenceLength a node0) + sum [numInsert (as IntMap.! b) | b <- allEdgesFromNode t node0]
where node0 = head $ getNodes t
numSequences (AlignmentOnTree _ n _ _) = n
sequenceLength (AlignmentOnTree t n ls as) node =
case IntMap.lookup node ls of Just length -> length
Nothing -> case edgesOutOfNode t node of
(b:_) -> pairwise_alignment_length1 (as IntMap.! b)
_ -> error "sequenceLength: no length for degree-0 node!"
mkSequenceLengthsMap a@(AlignmentOnTree tree _ _ _) = getNodesSet tree & IntMap.fromSet (\node -> sequenceLength a node)
type EAlignment = EVector (EPair CPPString (EVector Int))
toEAlignment a = toVector [ c_pair (Text.toCppString label) indices | (label, indices) <- a ]
fromEAlignment ea = map ( (\(x,y) -> (Text.fromCppString x,y)) . pair_from_c) $ vectorToList ea
-- Current a' is an alignment, but counts and mapping are EVector
-- AlignmentMatrix -> ETuple (AlignmentMatrix, EVector Int, EVector Int)
foreign import bpcall "Alignment:compress_alignment" builtin_compress_alignment :: EAlignment ->
EPair EAlignment (EPair (EVector Int) (EVector Int))
compress_alignment a = (compressed, counts, mapping)
where tmp123 = builtin_compress_alignment (toEAlignment a)
(compressed', tmp23) = pair_from_c tmp123
(counts, mapping) = pair_from_c tmp23
compressed = fromEAlignment compressed'
-- Current a' is an alignment, but counts and mapping are EVector
-- AlignmentMatrix -> ETuple (AlignmentMatrix, EVector Int, EVector Int)
foreign import bpcall "Alignment:compress_alignment_var_nonvar" builtin_compress_alignment_var_nonvar :: EAlignment -> Alphabet ->
EPair EAlignment (EVector Int)
compress_alignment_var_nonvar a alphabet = (compressed, counts)
where tmp12 = builtin_compress_alignment_var_nonvar (toEAlignment a) alphabet
(compressed', counts) = pair_from_c tmp12
compressed = fromEAlignment compressed'
totalNumIndels (AlignmentOnTree t _ _ as) = sum [numIndels (as IntMap.! b) | b <- allEdgesFromNode t node0]
where node0 = head $ getNodes t
totalLengthIndels (AlignmentOnTree t _ _ as) = sum [lengthIndels (as IntMap.! b) | b <- allEdgesFromNode t node0]
where node0 = head $ getNodes t
-- Alignment -> Int -> EVector Int -> [EVector Int]
leaf_sequence_counts a n counts = vectorToList $ builtin_leaf_sequence_counts a n counts
foreign import bpcall "Alignment:ancestral_sequence_alignment" builtin_ancestral_sequence_alignment :: AlignmentMatrix -> EVector VectorPairIntInt -> EVector Int -> AlignmentMatrix
ancestral_sequence_alignment tree a0 states smap = builtin_ancestral_sequence_alignment a0 states' smap
where states' = toVector [ states IntMap.! node | node <- sort $ getNodes tree]
-- Extract (component,state) -> state.
foreign import bpcall "Alignment:" extractStates :: VectorPairIntInt -> EVector Int
{-
OK, so what do we use the original alignment matrix for?
* the alphabet
* the original sequences.
Right now, I think the node names still correspond to the input alignment names.
But when the node names stop being [0..n_sequences-1], we will need a way to
1. construct the alignment from a list of Sequence objects
2. get a list (leafNodes ++ internalNodes)
3. for each node, get the leaf label
4. use smap to project the state to an alphabet letter.
5. turn the list of Sequence objects into an alignment.
Maybe we want a map from String -> EVector Int.
That would lose the comments on the fastas though.
-}
foreign import bpcall "Alignment:select_alignment_columns" builtin_select_alignment_columns :: AlignmentMatrix -> EVector Int -> AlignmentMatrix
select_alignment_columns alignment sites = builtin_select_alignment_columns alignment (toVector sites)
foreign import bpcall "Alignment:select_alignment_pairs" builtin_select_alignment_pairs :: AlignmentMatrix -> EVector (EPair Int Int) -> Alphabet -> AlignmentMatrix
select_alignment_pairs alignment sites doublets = builtin_select_alignment_pairs alignment sites' doublets
where sites' = toVector $ map (\(x,y) -> c_pair x y) sites
alignmentOnTreeFromSequences tree (Aligned sequences) = AlignmentOnTree tree numSequences lengths pairwiseAs
where -- observedSequences :: IntMap (Maybe (EVector Int))
observedSequences = labelToNodeMap tree $ getSequences sequences
allSequences = minimally_connect_characters observedSequences tree (addAllMissingAncestors observedSequences tree)
numSequences = length $ getSequences sequences
lengths = getNodesSet tree & IntMap.fromSet (\node -> vector_size $ stripGaps $ allSequences IntMap.! node)
bits = fmap bitmaskFromSequence' allSequences
alignmentForBranch b = pairwise_alignment_from_bits (bits IntMap.! source) (bits IntMap.! target)
where source = sourceNode tree b
target = targetNode tree b
pairwiseAs = getEdgesSet tree & IntMap.fromSet alignmentForBranch
find_sequence label sequences = find (\s -> fst s == label) sequences
-- Get a map from labeled nodes to Just v, and unlabeled nodes to Nothing.
-- Complain if a labeled node doesn't have a corresponding entry in the Map.
labelToNodeMap :: (IsGraph t, Eq (LabelType t)) => t -> [(LabelType t, v)] -> IntMap (Maybe v)
labelToNodeMap tree things = getNodesSet tree & IntMap.fromSet objectForNode where
objectForNode node = case getLabel tree node of
Nothing -> Nothing
Just label ->
case lookup label things of
Just object -> Just object
Nothing -> error $ "No object for node with a certain label"
getSequencesOnTree :: (IsGraph t, LabelType t ~ Text) => [Sequence] -> t -> IntMap (Maybe Sequence)
getSequencesOnTree sequence_data tree = getNodesSet tree & IntMap.fromSet sequence_for_node where
sequence_for_node node = case getLabel tree node of
Nothing -> Nothing
Just label ->
case find_sequence label sequence_data of
Just sequence -> Just sequence
Nothing -> error $ "No such sequence " ++ Text.unpack label
getLabelled :: IsGraph t => t -> (LabelType t -> a -> b) -> IntMap a -> [b]
getLabelled tree f things = catMaybes $ fmap go $ IntMap.toList things where
go (node, thing) = case getLabel tree node of
Just label -> Just $ f label thing
Nothing -> Nothing
sequencesFromTree tree isequences = [(label, isequence) | n <- leafNodes tree ++ internalNodes tree,
let label = addAncestralLabel n (getLabels tree),
let isequence = isequences IntMap.! n]
foreign import bpcall "Vector:showObject" showVectorPairIntInt :: VectorPairIntInt -> CPPString
instance Show VectorPairIntInt where
show = unpack_cpp_string . showVectorPairIntInt
-- Ideally we'd like to do
-- type NodeAlignment = EPair Int BranchAlignments
-- but this creates recursive type synonyms, which are not allowed.
-- We hack around this by removing the definition of NodeAlignment.
-- Then NodeAlignment needs a foreign import to construct it.
-- This constructs an all-insert alignment.
data NodeAlignment -- NodeAlignment SourceNode Int (EVector BranchAlignment)
foreign import bpcall "Alignment:" mkNodeAlignment :: Int -> Int -> EVector BranchAlignment -> NodeAlignment
data BranchAlignment -- BranchAlignment TargetNode PairwiseAlignment (EVector BranchAlignment)
foreign import bpcall "Alignment:" mkBranchAlignment :: Int -> PairwiseAlignment -> EVector BranchAlignment -> BranchAlignment
exportAlignmentOnTree :: IsTree t => AlignmentOnTree t -> NodeAlignment
exportAlignmentOnTree a@(AlignmentOnTree tree _ _ as) = mkNodeAlignment root (sequenceLength a root) (branchAlignments $ edgesOutOfNodeArray tree root)
where root = head $ getNodes tree
branchAlignments edges = toVector [ mkBranchAlignment (targetNode tree e) (as IntMap.! e) (branchAlignments $ edgesAfterEdgeArray tree e) | e <- toList $ edges]
foreign import bpcall "Alignment:" substituteLetters :: EVector Int -> EVector Int -> EVector Int
foreign import bpcall "Alignment:" constructPositionSequencesRaw :: NodeAlignment -> EIntMap (EVector Int)
constructPositionSequences a = FIM.importIntMap $ constructPositionSequencesRaw $ exportAlignmentOnTree a
alignedSequences alignment@(AlignmentOnTree tree _ _ _) sequences = getNodesSet tree & IntMap.fromSet stateSequenceForNode
where positionSequences = constructPositionSequences alignment
stateSequenceForNode n = substituteLetters (sequences IntMap.! n) (positionSequences IntMap.! n)
class ToFasta a where
toFasta :: a -> Text
instance ToFasta CharacterData where
toFasta (CharacterData a sequences) = fastaSeqs [(label,sequenceToText a sequence) | (label,sequence) <- sequences]
instance ToFasta UnalignedCharacterData where
toFasta (Unaligned d) = toFasta d
instance ToFasta AlignedCharacterData where
toFasta (Aligned d) = toFasta d
align alignment (Unaligned (CharacterData alphabet seqs)) = Aligned (CharacterData alphabet alignedSeqs)
where AlignmentOnTree tree _ _ _ = alignment
seqsOnTree = fromMaybe (error "No label") <$> labelToNodeMap tree seqs
alignedSeqsOnTree = alignedSequences alignment seqsOnTree
alignedSeqs = getLabelled tree (,) alignedSeqsOnTree
instance Alignment AlignedCharacterData where
alignmentLength (Aligned (CharacterData _ seqs)) = vector_size $ snd $ head seqs
numSequences (Aligned (CharacterData _ seqs)) = length seqs
sequenceLength (Aligned (CharacterData _ seqs)) index = vector_size $ stripGaps $ snd $ (seqs !! index)
-- In both cases we normalize the oldest taxon to have time 0.
-- Would we ever want to specify the present as being older that the oldest taxon?
data TimeDirection = Forward | Backward
foreign import bpcall "Alignment:" getTaxonAgesRaw :: EVector CPPString -> CPPString -> Int -> EVector Double
getTaxonAges labels regex direction = zip labels (toList $ getTaxonAgesRaw cppLabels cppRegex cppDirection)
where cppLabels = toVector (map Text.toCppString labels)
cppRegex = pack_cpp_string regex
convert Forward = 0
convert Backward = 1
cppDirection = convert direction
|