1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
|
module Tree.Newick where
-- For the format, see comments at http://evolution.genetics.washington.edu/phylip/newick_doc.html
-- http://evolution.genetics.washington.edu/phylip/newicktree.html
--
-- See the package BioBase.Newick at https://hackage.haskell.org/package/BiobaseNewick
--
-- See Data.Tree
import Tree
import Parse
import Data.Text (Text)
import qualified Data.Text as T
import Data.Char
import Data.Foldable
import Data.Maybe
import Data.List (stripPrefix)
import Data.Unique.Id
import qualified Data.IntMap as IntMap
import qualified Data.IntSet as IntSet
import Data.Array
import Data.Text.Display
-- We need to handle adding (i) root (ii) labels (iii) branch lengths.
-- Can we do this more generically?
class IsTree t => WriteNewickNode t where
node_info :: t -> Int -> Text
branch_info :: t -> (Maybe Int) -> Text
node_info _ _ = T.empty
branch_info _ _ = T.empty
get_node_label t node = T.append (node_info t node) (attributesText $ getNodeAttributes t node)
get_branch_label t branch@(Just edge) = let text = (branch_info t branch) `T.append` (attributesText $ getEdgeAttributes t edge)
in if T.null text then text else T.singleton ':' `T.append` text
get_branch_label t Nothing = T.empty
instance Display l => WriteNewickNode (Tree l) where
node_info tree node = case getLabel tree node of
Just label -> quoteLabel (display label)
Nothing -> T.empty
instance WriteNewickNode t => WriteNewickNode (WithRoots t) where
node_info (WithRoots tree _ _) = node_info tree
branch_info (WithRoots tree _ _) = branch_info tree
foreign import bpcall "Text:" quoteLabelRaw :: CPPString -> CPPString
quoteLabel l = T.fromCppString $ quoteLabelRaw $ T.toCppString l
instance WriteNewickNode t => WriteNewickNode (WithBranchLengths t) where
node_info (WithBranchLengths tree lengths) node = node_info tree node
branch_info blt (Just b) = T.doubleToText (branchLength blt b)
branch_info _ Nothing = T.empty
instance (HasRoot t, WriteNewickNode t) => WriteNewickNode (WithNodeTimes t) where
node_info (WithNodeTimes tree _) node = node_info tree node
branch_info nht (Just b) = T.doubleToText (branchLength nht b)
branch_info _ Nothing = T.empty
instance (HasBranchLengths t, WriteNewickNode t) => WriteNewickNode (WithBranchRates t) where
node_info (WithBranchRates tree _) node = node_info tree node
branch_info rtt (Just b) = T.doubleToText (branchLength rtt b)
branch_info _ Nothing = T.empty
writeNewick_rooted tree = let nodes = writeNewick_node tree (root tree)
attributes = attributesText $ getAttributes tree
in if T.null attributes
then nodes
else T.concat [attributes,T.singleton ' ',nodes]
writeNewick tree = writeNewick_rooted (makeRooted tree)
intercalate2 t ts = foldl1 (\x y -> x `T.append` t `T.append` y) ts
writeNewick_node tree node = write_branches_and_node tree (edgesOutOfNode tree node) node Nothing `T.snoc` ';' where
write_branches_and_node tree branches node branch = write_branches tree branches `T.append` get_node_label tree node `T.append` get_branch_label tree branch
write_branches tree branches | null branches = T.empty
write_branches tree branches | otherwise = (T.singleton '(') `T.append` text `T.append` (T.singleton ')') where
text = intercalate2 (T.singleton ',') $ fmap (write_branch tree) $ branches
write_branch tree branch = write_branches_and_node tree (edgesAfterEdge tree branch) (targetNode tree branch) (Just branch)
split c "" = []
split c s = case break (== c) s of
(l, s') -> [l] ++ case s' of [] -> []
_:s'' -> lines s''
makeAttributes comments = Attributes $ concatMap go comments where
go comment = if take 5 comment == "&NHX:"
then fmap go' (split ':' (drop 5 comment))
else fmap go' (split ',' comment)
go' comment = case break (== '=') comment of
(key,[]) -> (T.pack key,Nothing)
(key,_:value) -> (T.pack key, Just $ T.pack value)
data NewickNode = NewickNode [NewickNode] (Maybe Text) Attributes (Maybe Double) Attributes
data NewickTree = NewickTree Attributes NewickNode
comment = do
char '['
result <- many $ satisfy (/= ']')
char ']'
if (head result == '&') then
return (Just (tail result))
else
return Nothing
newickSpaces = fmap makeAttributes $ fmap catMaybes $ many $ (comment <|> (satisfy isSpace >> return Nothing))
-- '' escapes to ' inside a quoted string
quoted_char = (string "''" >> return '\'')
<|>
satisfy (\c -> (isPrint c) && (c /= '\'') )
-- unquoted strings can't contain punctuation, and _ changes to space
unquoted_char = (char '_' >> return ' ')
<|>
satisfy (\c -> isPrint c && not (c `elem` " ()[]':;,"))
-- lex: quoted label
quoted_label = do string "'"
label <- many quoted_char
string "'"
return label
-- lex: unquoted label
unquoted_label = some unquoted_char
-- lex: label
node_label = fmap T.pack $ quoted_label <|> unquoted_label
-- I don't want to REQUIRE a branch length
branch_length_p = do
string ":"
attributes1 <- newickSpaces
result <- optionMaybe (token parse_double)
attributes2 <- newickSpaces
return (result, attributes1 +:+ attributes2)
subtree = do nodeAttributes1 <- newickSpaces
children <- option [] descendant_list
nodeAttributes2 <- newickSpaces
node_label <- optionMaybe node_label
nodeAttributes3 <- newickSpaces
(branchLength, branchAttributes) <- (branch_length_p <|> return (Nothing, Attributes []))
return (NewickNode children
node_label
(nodeAttributes1 +:+ nodeAttributes2 +:+ nodeAttributes3)
branchLength
branchAttributes)
descendant_list = do
string "("
children <- sepBy1 subtree (string ",")
string ")"
return children
treeParser = do comments <- newickSpaces
node <- subtree
string ";"
spaces
return $ NewickTree comments node
print_newick (NewickTree comments node) = show comments ++ " " ++ print_newick_sub node ++ ";"
replace from to [] = []
replace from to string = case stripPrefix from string of
Just rest -> to ++ replace from to rest
Nothing -> head string : replace from to (tail string)
quoteName :: String -> String
quoteName name = if any (\l -> elem l "_()[]':;,") name then
"'"++(replace "'" "''" name)++"'"
else
replace " " "_" name
print_newick_sub (NewickNode children node nodeAttributes branch branchAttributes) =
children_string ++ node_string ++ (show nodeAttributes) ++ branch_string ++ (show branchAttributes)
where
children_string | null children = ""
| otherwise = "("++ intercalate "," (map print_newick_sub children) ++ ")"
node_string = case node of Just name -> quoteName (T.unpack name)
Nothing -> ""
branch_string = case branch of Just length -> ":" ++ show length
Nothing -> ""
parse_newick text = runParser treeParser text
-- edge comments come from the child
data Info = Info { i_nodes :: [Node],
i_edges :: [Edge],
i_labels :: [(Int,Maybe Text)],
i_lengths :: [(Int,Maybe Double)],
i_nodeAttributes :: [(Int,Attributes)],
i_edgeAttributes :: [(Int,Attributes)]
}
combineInfo (Info ns1 es1 ls1 bls1 ncs1 ecs1) (Info ns2 es2 ls2 bls2 ncs2 ecs2) =
Info (ns1++ns2) (es1++es2) (ls1++ls2) (bls1++bls2) (ncs1++ncs2) (ecs1++ecs2)
getEdge ids node@(NewickNode _ _ _ branchLength branchAttributes) nodeId = (edgeId, edgeInfo `combineInfo` childInfo) where
edgeId = (hashedId $ idFromSupply ids)+1
reverseEdgeId = reverseEdge edgeId
(ids',childIds) = splitIdSupply ids
edgeInfo = Info [] [eToChild,eFromChild] [] [(edgeId,branchLength),(reverseEdgeId,branchLength)] [] [(edgeId,branchAttributes),(reverseEdgeId,branchAttributes)]
eToChild = Edge nodeId targetId edgeId
eFromChild = Edge targetId nodeId reverseEdgeId
(targetId, childInfo) = getNode childIds node (Just reverseEdgeId)
getNode ids (NewickNode children nodeLabel nodeAttributes _ _) parentEdge = (nodeId,foldr combineInfo nodeInfo childInfo)
where nodeInfo = Info [node] [] [(nodeId,nodeLabel)] [] [(nodeId,nodeAttributes)] []
node = Node nodeId outEdges
nodeId = hashedId $ idFromSupply ids
outEdges = IntSet.fromList edgeIds
edgeIds = case parentEdge of Just e -> (e:childEdgeIds) ; Nothing -> childEdgeIds
(childEdgeIds, childInfo) = unzip [getEdge childIds childNewick nodeId
| (childNewick, childIds) <- zip children (splitIdSupplyL ids)]
newickToTree (NewickTree treeAttributes node) = do
ids <- initIdSupply 'n'
let (rootId, info) = getNode ids node Nothing
nodes = IntMap.fromList [(nodeName node, node) | node <- i_nodes info]
edges = IntMap.fromList [(edgeName edge, edge) | edge <- i_edges info]
-- These SHOULD have all the nodes / edges... but maybe we should use (getNodesSet tree & fromSet _) to make sure.
labels = IntMap.fromList (i_labels info) -- this is IntMap (Maybe Double)
lengths = IntMap.fromList (i_lengths info)
nodeAttributes = IntMap.fromList (i_nodeAttributes info)
edgeAttributes = IntMap.fromList (i_edgeAttributes info)
tree = Tree $ Forest $ Graph nodes edges labels nodeAttributes edgeAttributes treeAttributes
rooted_tree = addRoot rootId tree
return (rooted_tree, lengths)
newickToBranchLengthTree newick = do
(tree, lengths) <- newickToTree newick
let lengths2 = fmap (fromMaybe 0) lengths
return (WithBranchLengths tree lengths2)
readTreeTopology filename = do
text <- readFile filename
(topology, lengths) <- newickToTree (parse_newick text)
return topology
readBranchLengthTree filename = do
text <- readFile filename
newickToBranchLengthTree (parse_newick text)
-- Perhaps we should try and strip off the WithBranchLengths from a general tree...
-- We would need a type family like Unrooted to walk all the modifiers.
readTimeTree filename = do
text <- readFile filename
(tree, lengths) <- newickToTree (parse_newick text)
let lengths2 = fmap (fromMaybe 0) lengths
nodeTimes = getNodesSet tree & IntMap.fromSet nodeTime
nodeTime node = case branchToParent tree node of
Nothing -> 0
Just b -> let p = targetNode tree b
in (nodeTimes IntMap.! p) + (lengths2 IntMap.! b)
present = maximum nodeTimes
nodeAges = fmap (\t -> present - t) nodeTimes
return tree
|