File: benchmark.py

package info (click to toggle)
python-scandir 1.10.0-4
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 224 kB
  • sloc: ansic: 1,596; python: 1,129; makefile: 4
file content (192 lines) | stat: -rw-r--r-- 6,454 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""Simple benchmark to compare the speed of scandir.walk() with os.walk()."""

import optparse
import os
import stat
import sys
import timeit

import warnings
with warnings.catch_warnings(record=True):
    import scandir

# Shape of the directory tree generated by create_tree() when no tree_dir is
# given on the command line.
# Number of directory levels, counting the top-level directory itself.
DEPTH = 4
# Subdirectories created in each directory (except at the deepest level).
NUM_DIRS = 5
# Files created in every directory of the tree.
NUM_FILES = 50


def os_walk_pre_35(top, topdown=True, onerror=None, followlinks=False):
    """Walk a directory tree using os.listdir(), like os.walk() before 3.5.

    Yields (dirpath, dirnames, filenames) tuples for top and every directory
    below it. With topdown true a directory is yielded before its
    subdirectories, and the caller may edit dirnames in place to prune the
    walk; with topdown false it is yielded after them. If listing a
    directory raises OSError, onerror (when given) is called with the
    exception and that directory is skipped. Symlinked directories are only
    descended into when followlinks is true.
    """
    path_join = os.path.join
    is_dir = os.path.isdir
    is_link = os.path.islink

    try:
        entries = os.listdir(top)
    except OSError as exc:
        if onerror is not None:
            onerror(exc)
        return

    subdirs = []
    files = []
    for entry in entries:
        bucket = subdirs if is_dir(path_join(top, entry)) else files
        bucket.append(entry)

    # The same list objects are yielded and then iterated, so in-place edits
    # made by a top-down caller affect which subdirectories are visited.
    current = (top, subdirs, files)
    if topdown:
        yield current
    for subdir in subdirs:
        child = path_join(top, subdir)
        if not followlinks and is_link(child):
            continue
        for item in os_walk_pre_35(child, topdown, onerror, followlinks):
            yield item
    if not topdown:
        yield current


def create_tree(path, depth=DEPTH):
    """Build a benchmark directory tree rooted at path.

    The directory at path is created (it must not already exist) and filled
    with NUM_FILES small files. While depth is greater than 1, NUM_DIRS
    subdirectories are created as well, each built recursively with
    depth - 1.
    """
    os.mkdir(path)

    for index in range(NUM_FILES):
        file_path = os.path.join(path, 'file{0:03}.txt'.format(index))
        with open(file_path, 'wb') as handle:
            handle.write(b'foo')

    # Only levels above the deepest one get subdirectories.
    if depth > 1:
        for index in range(NUM_DIRS):
            subdir = os.path.join(path, 'dir{0:03}'.format(index))
            create_tree(subdir, depth - 1)


def get_tree_size(path):
    """Sum the sizes of the files under path using scandir.scandir().

    Symlinks are skipped entirely and directories are descended into
    recursively. An OSError raised while scanning a directory ends the scan
    of that directory; whatever was counted up to that point is returned.
    """
    total = 0
    try:
        for item in scandir.scandir(path):
            if item.is_symlink():
                continue
            if item.is_dir():
                total += get_tree_size(os.path.join(path, item.name))
            else:
                total += item.stat().st_size
    except OSError:
        # Best effort: unreadable directories or entries are ignored.
        pass
    return total


def benchmark(path, get_size=False):
    """Time os.walk() against the scandir-based walk on path and print results.

    With get_size false, both sides simply iterate their walk (os.walk() and
    scandir.walk()). With get_size true, the os.walk() side totals file sizes
    via os.lstat() while the scandir side calls get_tree_size(); the two
    totals are then printed together with whether they match. Each side is
    timed three times with a single run per timing, and the fastest time of
    each is reported along with their ratio.
    """
    totals = {}

    def os_walk_sizes():
        total = 0
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                info = os.lstat(os.path.join(dirpath, filename))
                if stat.S_ISLNK(info.st_mode):
                    continue
                total += info.st_size
        totals['os_walk'] = total

    def scandir_sizes():
        totals['scandir_walk'] = get_tree_size(path)

    def os_walk_plain():
        for _dirpath, _dirnames, _filenames in os.walk(path):
            pass

    def scandir_walk_plain():
        for _dirpath, _dirnames, _filenames in scandir.walk(path):
            pass

    if get_size:
        run_os, run_scandir = os_walk_sizes, scandir_sizes
    else:
        run_os, run_scandir = os_walk_plain, scandir_walk_plain

    # Run this once first to cache things, so we're not benchmarking I/O
    print("Priming the system's cache...")
    run_scandir()

    # Keep the fastest of the repeats for each side to discard high outliers.
    # The lists start with the same large sentinel the running minimum used.
    repeats = 3
    os_timings = [1000000]
    scandir_timings = [1000000]
    for attempt in range(1, repeats + 1):
        print('Benchmarking walks on {0}, repeat {1}/{2}...'.format(
            path, attempt, repeats))
        os_timings.append(timeit.timeit(run_os, number=1))
        scandir_timings.append(timeit.timeit(run_scandir, number=1))
    os_walk_time = min(os_timings)
    scandir_walk_time = min(scandir_timings)

    if get_size:
        matches = totals['os_walk'] == totals['scandir_walk']
        equality = 'equal' if matches else 'NOT EQUAL!'
        print('os.walk size {0}, scandir.walk size {1} -- {2}'.format(
            totals['os_walk'], totals['scandir_walk'], equality))

    speedup = os_walk_time / scandir_walk_time
    print('os.walk took {0:.3f}s, scandir.walk took {1:.3f}s -- {2:.1f}x as fast'.format(
          os_walk_time, scandir_walk_time, speedup))


if __name__ == '__main__':
    # Command-line entry point: parse options, make sure a tree exists to
    # walk, pick the scandir implementation to test, then run the benchmark.
    usage = """Usage: benchmark.py [-h] [tree_dir]

Create a large directory tree named "benchtree" (relative to this script) and
benchmark os.walk() versus scandir.walk(). If tree_dir is specified, benchmark
using it instead of creating a tree."""
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-s', '--size', action='store_true',
                      help='get size of directory tree while walking')
    parser.add_option('-c', '--scandir', type='choice',
                      choices=['best', 'generic', 'c', 'python', 'os'],
                      default='best',
                      help='version of scandir() to use, default "%default"')
    options, args = parser.parse_args()

    if not args:
        # No directory given: use (and create on first run) the default tree
        # next to this script.
        tree_dir = os.path.join(os.path.dirname(__file__), 'benchtree')
        if not os.path.exists(tree_dir):
            print('Creating tree at {0}: depth={1}, num_dirs={2}, num_files={3}'.format(
                tree_dir, DEPTH, NUM_DIRS, NUM_FILES))
            create_tree(tree_dir)
    else:
        tree_dir = args[0]

    # Resolve the requested implementation. 'best' (the default) switches to
    # os.scandir when it exists and otherwise keeps the module's own choice.
    has_os_scandir = hasattr(os, 'scandir')
    requested = options.scandir
    problem = None
    implementation = scandir.scandir
    if requested == 'generic':
        implementation = scandir.scandir_generic
    elif requested == 'c':
        implementation = scandir.scandir_c
        if implementation is None:
            problem = "ERROR: Compiled C version of scandir not found!"
    elif requested == 'python':
        implementation = scandir.scandir_python
        if implementation is None:
            problem = "ERROR: Python version of scandir not found!"
    elif requested == 'os':
        if has_os_scandir:
            implementation = os.scandir
        else:
            problem = "ERROR: Python 3.5's os.scandir() not found!"
    elif has_os_scandir:
        implementation = os.scandir

    if problem is not None:
        print(problem)
        sys.exit(1)
    scandir.scandir = implementation

    # Report which implementation ended up active.
    active = scandir.scandir
    if active == getattr(os, 'scandir', None):
        print("Using Python 3.5's builtin os.scandir()")
    elif active == scandir.scandir_c:
        print('Using fast C version of scandir')
    elif active == scandir.scandir_python:
        print('Using slower ctypes version of scandir')
    elif active == scandir.scandir_generic:
        print('Using very slow generic version of scandir')
    else:
        print('ERROR: Unsure which version of scandir we are using!')
        sys.exit(1)

    # When os.scandir exists, swap in the listdir-based walk as the baseline.
    if has_os_scandir:
        os.walk = os_walk_pre_35
        baseline = 'Comparing against pre-Python 3.5 version of os.walk()'
    else:
        baseline = 'Comparing against builtin version of os.walk()'
    print(baseline)

    benchmark(tree_dir, get_size=options.size)