File: benchmark_newick_tree_parser.py

package info (click to toggle)
python-dendropy 4.2.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 68,392 kB
  • ctags: 3,947
  • sloc: python: 41,840; xml: 1,400; makefile: 15
file content (118 lines) | stat: -rw-r--r-- 3,970 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Benchmarking tree parsing.
"""

import sys
import os
import timeit
import argparse
from dendropy.utility import messaging
from dendropy.test.support import pathmap

import dendropy

# Default benchmark target set: names of Newick tree files, resolved to
# full paths through ``pathmap.tree_source_path`` in ``main`` when no
# target files are given on the command line.
TREE_FILENAMES = [
    "APG_Angiosperms.newick",
    "GEBA.tree.newick",
    "feb032009.trees.newick",
    "Bininda-emonds_2007_mammals.newick",
    "Jetz_et_al_2012_Aves.sample.tree.newick",
    "Smith_2001_angiosperms.newick",
        ]

def tree_parsing_fn_factory(src_paths, verbose=False):
    """
    Build a zero-argument callable suitable for passing to ``timeit.Timer``.

    When invoked, the returned callable creates a fresh
    ``dendropy.TreeList`` and reads every path in ``src_paths`` into it
    using the "newick" schema. The populated list is discarded; the
    callable returns ``None``.

    Parameters
    ----------
    src_paths : iterable of str
        Paths of the tree files to read, in order.
    verbose : bool
        If True, each path is written to standard error before it is read.
    """
    def f():
        tree_list = dendropy.TreeList()
        for path in src_paths:
            if verbose:
                progress_line = "  .. {}\n".format(path)
                sys.stderr.write(progress_line)
            tree_list.read_from_path(path, "newick")
    return f

def main():
    """
    Command-line entry point: time Newick tree parsing and report results.

    Each target file is parsed ``--repeat`` times (a single parse per
    timing run) and the best (minimum) time is kept. If no target files
    are given with ``-f``/``--target-file``, the files named in
    ``TREE_FILENAMES`` are used. Progress is reported through a console
    messenger; the final results table is written to standard output,
    column-aligned by default or tab-delimited with ``--delimited-output``.

    Exits via ``parser.error`` (status 2) if ``--repeat`` is less than 1.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-f", "--target-file",
            type=str,
            dest="target_files",
            default=[],
            action="append",
            help="""Path to file to be parsed; option may be specified multiple times for multiple files. If not specified, default target set will be used.""")
    parser.add_argument("-r", "--repeat",
            type=int,
            default=10,
            help="Repeat each parse this number of times (default=%(default)s).")
    # Both spellings are accepted as aliases of one option so that the flag
    # is listed only once in the help output.
    parser.add_argument("--delimited-output", "--delimited_output",
            dest="delimited_output",
            action="store_true",
            default=False,
            help="Output in tab-delimited instead of aligned format")
    args = parser.parse_args()

    # With fewer than one repetition ``Timer.repeat`` returns an empty list
    # and ``min`` would fail with an opaque ValueError; reject it up front.
    if args.repeat < 1:
        parser.error("--repeat must be at least 1 (got {})".format(args.repeat))

    messenger = messaging.ConsoleMessenger(name="-benchmark")

    # Each source is a (type label, name as given, resolved path) triple.
    sources = []
    if args.target_files:
        for f in args.target_files:
            ff = os.path.expanduser(os.path.expandvars(f))
            sources.append(("User", f, ff))
    else:
        messenger.info("No sources specified: adding default benchmark target set")
        for f in TREE_FILENAMES:
            sources.append(("Default", f, pathmap.tree_source_path(f)))

    # Each result is a (type label, name, best time in seconds) triple.
    results = []
    for src_type, src_name, src_path in sources:
        messenger.info("Processing: '{}'".format(src_name))
        t = timeit.Timer(tree_parsing_fn_factory([src_path]))
        # One parse per timing run; keep the fastest run.
        result = min(t.repeat(args.repeat, 1))
        messenger.info("Best time (of {} repetitions): {:.10f} seconds".format(args.repeat, result))
        results.append((src_type, src_name, result))

    messenger.info("Benchmarking complete: all files processed")

    if args.delimited_output:
        result_template = "{}\t{}\t{:.10f}\n"
        header_template = "{}\t{}\t{}\n"
    else:
        # Pad the first two columns to their widest entry. The header uses
        # four spaces after column 1 to match the "[", "]" and two spaces
        # that surround it in each result row.
        max_len1 = max(len(src[0]) for src in sources)
        max_len2 = max(len(src[1]) for src in sources)
        col1 = "{{:{}}}".format(max_len1)
        col2 = "{{:{}}}".format(max_len2)
        result_template = "[" + col1 + "]  " + col2 + "  {:.10f}\n"
        header_template = col1 + "    " + col2 + "  {}\n"
    sys.stdout.write(header_template.format("Type", "File", "Seconds"))
    for src_type, src_name, result in results:
        sys.stdout.write(result_template.format(src_type, src_name, result))

# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()