File: CoreTypes.hs

package info (click to toggle)
phybin 0.3-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 576 kB
  • sloc: haskell: 2,141; sh: 559; makefile: 74
file content (360 lines) | stat: -rw-r--r-- 13,553 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
{-# LANGUAGE NamedFieldPuns, BangPatterns #-}
{-# LANGUAGE OverloadedStrings, TypeSynonymInstances, FlexibleInstances #-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
{-# OPTIONS_GHC -fwarn-unused-imports #-}

module Bio.Phylogeny.PhyBin.CoreTypes
       (
         -- * Tree and tree decoration types
         NewickTree(..), 
         DefDecor, StandardDecor(..), AnnotatedTree, FullTree(..),
         ClustMode(..), TreeName, NumTaxa(..),
         
         -- * Tree operations
         displayDefaultTree, displayStrippedTree, 
         treeSize, numLeaves, liftFT,
         get_dec, set_dec, get_children, 
         map_labels, all_labels, foldIsomorphicTrees,

         -- * Utilities specific to StandardDecor:
         avg_branchlen, get_bootstraps,

         -- * Command line config options
         PhyBinConfig(..), default_phybin_config,
         WhichRFMode(..),

         -- * General helpers
         Label, LabelTable,
         
         -- * Experimenting with abstracting decoration operations
         HasBranchLen(..)
       )
       where

import Prelude hiding ((<>))
import qualified Data.Map as M
import qualified Data.Set as S
import Data.Foldable (Foldable(..))
import Data.Maybe (maybeToList)
import Data.Monoid (mappend, mconcat)
import Text.PrettyPrint.HughesPJClass hiding (char, Style)

import qualified Data.Clustering.Hierarchical as C

#define NO_ATOMS
#ifndef NO_ATOMS
import StringTable.Atom
#endif

----------------------------------------------------------------------------------------------------
-- Type definitions
----------------------------------------------------------------------------------------------------

type BranchLen = Double

-- | Even though the Newick format allows it, here we ignore interior node
--   labels. (They are not commonly used.)
--
--   Note that these trees are rooted.  The `normalize` function ensures that a
--   single, canonical rooted representation is chosen.
data NewickTree a = 
   NTLeaf     a {-# UNPACK #-} !Label
 | NTInterior a  [NewickTree a]
 deriving (Show, Eq, Ord)

-- TODO: Ordering maybe shouldn't need to touch the metadata.  At least on the fast
-- path.

{-
-- [2010.09.22] Disabling:
instance NFData Atom where
  rnf a = rnf (fromAtom a :: Int)

instance NFData a => NFData (NewickTree a) where
  rnf (NTLeaf l n)      = rnf (l,n)
  rnf (NTInterior l ls) = rnf (l,ls)
-}

instance Functor NewickTree where 
   fmap fn (NTLeaf dec x)      = NTLeaf (fn dec) x 
   fmap fn (NTInterior dec ls) = NTInterior (fn dec) (map (fmap fn) ls)

instance Foldable NewickTree where
  foldMap f (NTLeaf dec x) = f dec
  foldMap f (NTInterior dec ls) = mappend (f dec) $
                                  mconcat (map (foldMap f) ls)

instance Foldable FullTree where
  foldMap f (FullTree _ _ tr) = foldMap f tr

instance Functor FullTree where
  fmap f (FullTree n l tr) = FullTree n l $ fmap f tr

instance Pretty dec => Pretty (NewickTree dec) where 
 -- I'm using displayDefaultTree for the "prettiest" printing and
 -- replacing pPrint with a whitespace-improved version of show:
 pPrint (NTLeaf dec name)   = "NTLeaf"     <+> pPrint dec <+> text (show name)
 pPrint (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls

instance Pretty a => Pretty (FullTree a) where
  pPrint (FullTree name mp tr) = 
    "FullTree " <+> text name <+> loop tr
   where
    loop (NTLeaf dec ind)    = "NTLeaf"     <+> pPrint dec <+> text (mp M.! ind)
    loop (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls

instance (Pretty k, Pretty v) => Pretty (M.Map k v) where
  pPrint mp = pPrint (M.toList mp)

-- | Display a tree WITH the bootstrap and branch lengths.
--   This prints in NEWICK format.
displayDefaultTree :: FullTree DefDecor -> Doc
displayDefaultTree orig = loop tr <> ";"
  where
    (FullTree _ mp tr) = orig -- normalize orig
    loop (NTLeaf (Nothing,_) name)     = text (mp M.! name)
    loop (NTLeaf _ _)                  = error "WEIRD -- why did a leaf node have a bootstrap value?"
    loop (NTInterior (bootstrap,_) ls) = 
       case bootstrap of
         Nothing -> base
         Just val -> base <> text ":[" <> text (show val) <> text "]"
      where base = parens$ sep$ map_but_last (<>text",") $ map loop ls

-- | The same, except with no bootstrap or branch lengths.  Any tree annotations
-- ignored.
displayStrippedTree :: FullTree a -> Doc
displayStrippedTree orig = loop tr <> ";"
  where
    (FullTree _ mp tr) = orig -- normalize orig
    loop (NTLeaf _ name) = text (mp M.! name)
    loop (NTInterior _ ls) = parens$ sep$ map_but_last (<>text",") $ map loop ls

----------------------------------------------------------------------------------------------------
-- Labels
----------------------------------------------------------------------------------------------------

-- | Labels are inexpensive unique integers.  The table is necessary for converting them back.
type Label = Int

-- | Map labels back onto meaningful names.
type LabelTable = M.Map Label String

----------------------------------------------------------------------------------------------------
-- Tree metadata (decorators)
----------------------------------------------------------------------------------------------------


-- | The barebones default decorator for NewickTrees contains BOOTSTRAP and
-- BRANCHLENGTH.  The bootstrap values, if present, will range in [0..100]
type DefDecor = (Maybe Int, BranchLen)

-- | Additionally includes some scratch data that is used by the binning algorithm.
type AnnotatedTree = NewickTree StandardDecor

-- | The standard decoration includes everything in `DefDecor` plus
--   some extra cached data:
-- 
--  (1) branch length from parent to "this" node
--  (2) bootstrap values for the node
-- 
--  (3) subtree weights for future use
--      (defined as number of LEAVES, not counting intermediate nodes)
--  (4) sorted lists of labels for symmetry breaking
data StandardDecor = StandardDecor {
  branchLen     :: BranchLen,
  bootStrap     :: Maybe Int,

  -- The rest of these are used by the computations below.  These are
  -- cached (memoized) values that could be recomputed:
  ----------------------------------------
  subtreeWeight :: Int,
  sortedLabels  :: [Label]
 }
 deriving (Show,Read,Eq,Ord)

class HasBranchLen a where
  getBranchLen :: a -> BranchLen

instance HasBranchLen StandardDecor where
  getBranchLen = branchLen

-- This one is kind of sloppy:
instance HasBranchLen DefDecor where
  getBranchLen = snd

-- | A common type of tree contains the standard decorator and also a table for
-- restoring the human-readable node names.
data FullTree a =
  FullTree { treename   :: TreeName
           , labelTable :: LabelTable
           , nwtree     :: NewickTree a 
           }
 deriving (Show, Ord, Eq)

liftFT :: (NewickTree t -> NewickTree a) -> FullTree t -> FullTree a
liftFT fn (FullTree nm labs x) = FullTree nm labs (fn x)

type TreeName = String

instance Pretty StandardDecor where 
 pPrint (StandardDecor bl bs wt ls) = parens$
    "StandardDecor" <+> hsep [pPrint bl, pPrint bs
--                             , pPrint wt, pPrint ls
                             ]


----------------------------------------------------------------------------------------------------
-- * Configuring and running the command line tool.
----------------------------------------------------------------------------------------------------

-- | Due to the number of configuration options for the driver, we pack them into a record.
data PhyBinConfig = 
  PBC { verbose :: Bool
      , num_taxa :: NumTaxa
      , name_hack :: String -> String
      , output_dir :: String
      , inputs :: [String]
      , do_graph :: Bool
      , do_draw :: Bool
      , clust_mode  :: ClustMode
      , highlights  :: [FilePath] -- [FullTree ()]
      , show_trees_in_dendro :: Bool
      , show_interior_consensus :: Bool
      , rfmode :: WhichRFMode
      , preprune_labels :: Maybe [String]
      , print_rfmatrix :: Bool
      , dist_thresh :: Maybe Int
      , branch_collapse_thresh :: Maybe Double -- ^ Branches less than this length are collapsed.
      , bootstrap_collapse_thresh :: Maybe Int
        -- ^ BootStrap values less than this result in the intermediate node being collapsed.        
      }

-- | Supported modes for computing RFDistance.
data WhichRFMode = HashRF | TolerantNaive  
  deriving (Show, Eq, Ord)

-- | How many taxa should we expect in the incoming dataset?
data NumTaxa = Expected Int  -- ^ Supplied by the user.  Committed.
             | Unknown       -- ^ In the future we may automatically pick a behavior.  Now this one is usually an error.
             | Variable      -- ^ Explicitly ignore this setting in favor of comparing all trees
                             --   (even if some are missing taxa).  This only works with certain modes.
  deriving (Show, Read, Eq)
               
-- | The default phybin configuration.
default_phybin_config :: PhyBinConfig
default_phybin_config = 
 PBC { verbose = False
--      , num_taxa = error "must be able to determine the number of taxa expected in the dataset.  (Supply it manually with -n.)"
      , num_taxa  = Unknown
      , name_hack = id -- Default, no transformation of leaf-labels
      , output_dir = "./phybin_out/"
      , inputs = []
      , do_graph = False
      , do_draw = False
--      , clust_mode = BinThem
      , clust_mode = ClusterThem C.UPGMA
      , rfmode = HashRF
      , preprune_labels = Nothing
      , highlights     = []
      , show_trees_in_dendro = False
      , show_interior_consensus = False
      , print_rfmatrix = False
      , dist_thresh = Nothing
      , branch_collapse_thresh = Nothing
      , bootstrap_collapse_thresh = Nothing
     }

data ClustMode = BinThem | ClusterThem { linkage :: C.Linkage }

----------------------------------------------------------------------------------------------------
-- * Simple utility functions for the core types:
----------------------------------------------------------------------------------------------------

-- | How many nodes (leaves and interior) are contained in a NewickTree?
treeSize :: NewickTree a -> Int
treeSize (NTLeaf _ _) = 1
treeSize (NTInterior _ ls) = 1 + sum (map treeSize ls)

-- | This counts only leaf nodes, which should include all taxa.
numLeaves :: NewickTree a -> Int
numLeaves (NTLeaf _ _) = 1
numLeaves (NTInterior _ ls) = sum (map numLeaves ls)


map_but_last :: (a -> a) -> [a] -> [a]
map_but_last _ [] = []
map_but_last _ [h] = [h]
map_but_last fn (h:t) = fn h : map_but_last fn t



get_dec :: NewickTree t -> t
get_dec (NTLeaf     dec _) = dec
get_dec (NTInterior dec _) = dec

-- Set all the decorations to a constant:
set_dec :: b -> NewickTree a -> NewickTree b
set_dec d = fmap (const d)
--set_dec d (NTLeaf _ x) = NTLeaf d x
--set_dec d (NTInterior _ ls) = NTInterior d $ map (set_dec d) ls

get_children :: NewickTree t -> [NewickTree t]
get_children (NTLeaf _ _) = []
get_children (NTInterior _ ls) = ls


-- | Average branch length across all branches in all all trees.
avg_branchlen :: HasBranchLen a => [NewickTree a] -> Double
avg_branchlen origls = fst total / snd total
  where
   total = sum_ls $ map sum_tree origls
   sum_ls ls = (sum$ map fst ls, sum$ map snd ls)
   sum_tree (NTLeaf dec _) | getBranchLen dec == 0  = (0,0)
                           | otherwise              = (abs (getBranchLen dec),1)
   sum_tree (NTInterior dec ls) = 
       let branchLen = getBranchLen dec
           (x,y)     = sum_ls$ map sum_tree ls in
       if branchLen == 0 then (x, y) else ((abs branchLen) + x, 1+y)

-- | Retrieve all the bootstraps values actually present in a tree.
get_bootstraps :: NewickTree StandardDecor -> [Int]
get_bootstraps (NTLeaf (StandardDecor{bootStrap}) _) = maybeToList bootStrap
get_bootstraps (NTInterior (StandardDecor{bootStrap}) ls) =
  maybeToList bootStrap ++ concatMap get_bootstraps ls

-- | Apply a function to all the *labels* (leaf names) in a tree.
map_labels :: (Label -> Label) -> NewickTree a -> NewickTree a
map_labels fn (NTLeaf     dec lbl) = NTLeaf dec $ fn lbl
map_labels fn (NTInterior dec ls)  = NTInterior dec$ map (map_labels fn) ls
 
-- | Return all the labels contained in the tree.
all_labels :: NewickTree t -> [Label]
all_labels (NTLeaf     _ lbl) = [lbl]
all_labels (NTInterior _ ls)  = concat$ map all_labels ls



-- | This function allows one to collapse multiple trees while looking
-- only at the "horizontal slice" of all the annotations *at a given
-- position* in the tree.
--
-- "Isomorphic" must apply both to the shape and the name labels or it
-- is an error to apply this function.
foldIsomorphicTrees :: ([a] -> b) -> [NewickTree a] -> NewickTree b
foldIsomorphicTrees _ [] = error "foldIsomorphicTrees: empty list of input trees"
foldIsomorphicTrees fn ls@(hd:_) = fmap fn horiztrees
 where
   -- Preserve the input order:
   horiztrees = Prelude.foldr consTrees (fmap (const []) hd) ls
   -- We use the tree datatype itself as the intermediate data
   -- structure.  This is VERY allocation-expensive, it would be
   -- possible to trade compute for allocation here:
   consTrees a b = case (a,b) of
    (NTLeaf dec nm1, NTLeaf decls nm2) | nm1 /= nm2 -> error$"foldIsomorphicTrees: mismatched names: "++show (nm1,nm2)
                                       | otherwise ->
     NTLeaf (dec : decls) nm1
    (NTInterior dec ls1, NTInterior decls ls2) ->
     NTInterior (dec:decls) $ zipWith consTrees ls1 ls2
    _ -> error "foldIsomorphicTrees: difference in tree shapes"