File: common.py

# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Helpers for benchmarks of DataLad"""

import os
import os.path as op
import sys
import tarfile
import tempfile
import timeit
from glob import glob

from datalad.api import (
    Dataset,
    create_test_dataset,
)
from datalad.utils import (
    get_tempfile_kwargs,
    getpwd,
    rmtree,
)

############
# Monkey patches

# Robust is_interactive.  Should not be needed since 0.11.4
# https://github.com/datalad/datalad/pull/3268
def _is_stream_tty(stream):
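    """Robustly report whether `stream` is attached to a tty (False if closed)"""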
    try:
        # TODO: check on Windows whether a hasattr check would work correctly
        # and add value:
        return stream.isatty()
    except ValueError as exc:
        # Who knows why it is a ValueError, but let's try to be specific:
        # if there is a problem with I/O, treat it as non-interactive, otherwise reraise
        if "I/O" in str(exc):
            return False
        raise


def is_interactive():
    """Return True if all in/outs are tty"""
    return all(_is_stream_tty(s) for s in (sys.stdin, sys.stdout, sys.stderr))


class SuprocBenchmarks(object):
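    """Base class for benchmarks that spawn subprocesses

    Provides a subprocess-aware wall-clock timer, one-time monkey patching of
    older datalad versions, bookkeeping of temporary paths to remove, and
    consistent logging.
    """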
    # manually set a number since otherwise it takes way too long!
    # see https://github.com/spacetelescope/asv/issues/497
    #number = 3
    # although it seems to work ok with a timer which accounts for subprocesses

    # custom timer so we account for subprocess times
    timer = timeit.default_timer

    _monkey_patched = False

    def __init__(self):
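        """Monkey patch datalad < 0.12.1 (once) and prepare cleanup bookkeeping"""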
        if not self._monkey_patched:
            # monkey patch things if needed
            # ASV started to close one of the std streams at some point,
            # which caused our is_interactive to fail, so we need to provide
            # a more robust version
            from datalad.support.external_versions import external_versions

            # compare to 0.12.1 since the returned version is "loose" and
            # so fails to correctly identify an rc as preceding the .0 release
            if external_versions['datalad'] < '0.12.1':
                from datalad import utils
                from datalad.api import ls
                utils.is_interactive = is_interactive
                ls.is_interactive = is_interactive
            SuprocBenchmarks._monkey_patched = True
        self.remove_paths = []

    def _cleanup(self):
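        """Remove all paths accumulated in self.remove_paths (if they still exist)"""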
        if not self.remove_paths:
            return  # Nothing TODO
        self.log("Cleaning up %d paths", len(self.remove_paths))
        while self.remove_paths:
            path = self.remove_paths.pop()
            if op.lexists(path):
                rmtree(path)

    def teardown(self):
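        """Remove temporary paths created during the benchmark"""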
        self._cleanup()

    def __del__(self):
        # We will at least try
        try:
            self._cleanup()
        except:
            pass

    def log(self, msg, *args):
        """Consistent benchmarks logging"""
        print("BM: "+ str(msg % tuple(args)))


class SampleSuperDatasetBenchmarks(SuprocBenchmarks):
    """
    Setup a sample hierarchy of datasets to be used
    """

    timeout = 3600
    # need to ensure that we are working in a different repository now;
    # see https://github.com/datalad/datalad/issues/1512
    # this might not be sufficient due to side effects between tests,
    # which could get us into the same situation
    ds_count = 0

    # Creating in the CWD so things get removed when ASV is done
    #  https://asv.readthedocs.io/en/stable/writing_benchmarks.html
    # that is where benchmarks are run and cleaned up afterwards

    dsname = 'testds1'
    tarfile = 'testds1.tar'

    def setup_cache(self):
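        """Create the sample dataset hierarchy and pack it into a tarball

        The tarball is reused by setup() so the (slow) hierarchy creation
        happens only once.
        """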
        ds_path = create_test_dataset(
            self.dsname
            , spec='2/-2/-2'
            , seed=0
        )[0]
        self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark.
        # Store the full path since apparently setup is not run in that directory
        self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
        with tarfile.open(self.tarfile, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree(self.dsname, ro=False, chmod_files=False)
            tar.add(self.dsname, recursive=True)
        rmtree(self.dsname)

    def setup(self):
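        """Extract the cached tarball into a fresh temporary directory

        The extracted superdataset is renamed to a unique path (via the
        ds_count counter) and exposed as self.ds / self.repo.
        """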
        self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*'))

        tempdir = tempfile.mkdtemp(
            **get_tempfile_kwargs({}, prefix="bm")
        )
        self.remove_paths.append(tempdir)
        with tarfile.open(self.tarfile) as tar:
            # note: not a concern for CVE-2007-4559 since we are the ones mastering
            # content for the tar here. See https://github.com/datalad/datalad/pull/7104
            # for more information.
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = op.join(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        self.repo = self.ds.repo
        self.log("Finished setup for %s", tempdir)