File: generate-churning-bundle.py

package info (click to toggle)
mercurial 7.2-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 46,124 kB
  • sloc: python: 214,491; ansic: 56,606; tcl: 3,715; sh: 1,879; lisp: 1,483; cpp: 864; makefile: 792; javascript: 649; xml: 36
file content (357 lines) | stat: -rwxr-xr-x 9,887 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#!/usr/bin/env python3
#
# generate-churning-bundle - generate a bundle for a "large" branchy repository
#
# Copyright 2018 Octobus, contact@octobus.net
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
#
# This script generates a repository suitable for testing delta computation
# strategies.
#
# The repository updates a single "large" file many times. One fixed part
# of the file always gets updated while the rest of the lines get updated over
# time. These updates happen over many topological branches, some getting
# merged back.
#
# --lazy     will skip generating the file if one already exists with the
#            right content.
# --validate makes sure the generated bundle has the expected content.


import hashlib
import os
import shutil
import subprocess
import sys
import tempfile

import mercurial.context
import mercurial.repo.factory
import mercurial.ui

# file name of the bundle produced by this script
BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
#
# NB_CHANGESET:     number of changesets created on top of the initial one
# PERIOD_MERGING:   a changeset whose index is a multiple of this value gets
#                   a second parent
# PERIOD_BRANCHING: a changeset whose index is a multiple of this value starts
#                   from an older changeset instead of the previous one
# MOVE_BACK_MIN,
# MOVE_BACK_RANGE:  a branching changeset walks back
#                   MOVE_BACK_MIN + (index % MOVE_BACK_RANGE) first-parent
#                   steps to find its parent
NB_CHANGESET = 5000
PERIOD_MERGING = 8
PERIOD_BRANCHING = 7
MOVE_BACK_MIN = 3
MOVE_BACK_RANGE = 5

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes,
# and a set of other lines changes too.
#
# FILENAME:            name of the file committed in every changeset
# NB_LINES:            number of lines generated for the file
# ALWAYS_CHANGE_LINES: lines whose index is not greater than this value are
#                      rewritten at every revision
# OTHER_CHANGES:       modulus selecting which of the remaining lines are
#                      rewritten at a given revision
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
NB_LINES = 10500
ALWAYS_CHANGE_LINES = 50
OTHER_CHANGES = 300


def build_graph():
    """build the changeset graph as a {revision: (p1, p2)} mapping

    Revision 0 is the root and has no parent. Each later revision is based on
    the previous one, except that:

    - every PERIOD_BRANCHING revisions, the first parent is found by walking
      back a few first-parent steps, which starts a new topological branch,
    - every PERIOD_MERGING revisions, the lowest numbered open head is used
      as second parent.

    A missing parent is recorded as None.
    """
    open_heads = {0}
    dag = {0: (None, None)}
    for rev in range(1, NB_CHANGESET + 1):
        first = rev - 1
        second = None
        if rev % PERIOD_BRANCHING == 0:
            # follow the first-parent chain backward to pick the branch point
            steps = MOVE_BACK_MIN + rev % MOVE_BACK_RANGE
            for _ in range(steps):
                first = dag.get(first, (first,))[0]
        if rev % PERIOD_MERGING == 0:
            # picked before the parents are removed from the open heads
            second = min(open_heads)
        open_heads.discard(first)
        open_heads.discard(second)
        open_heads.add(rev)
        dag[rev] = (first, second)
    return dag


# {revision: (p1, p2)} mapping for the whole repository, computed at import time
GRAPH = build_graph()


def nextcontent(previous_content):
    """derive the next piece of content from the previous one

    Takes bytes and returns the hexadecimal MD5 digest of these bytes, as
    ASCII encoded bytes (32 characters, no trailing newline).
    """
    hasher = hashlib.md5(previous_content)
    hexdigest = hasher.hexdigest()
    return hexdigest.encode('ascii')


# Make sure some of the revisions change the same lines, so that the delta
# folding has overwritten lines to play with.
#
# Line number `idx` uses the divisor SHIFT_GROUP[idx % len(SHIFT_GROUP)]: the
# iteration index is integer-divided by that divisor before it is used to
# select the changing lines, so a given line keeps the same "changed or not"
# status for that many consecutive iterations.

SHIFT_GROUP = [
    17,
    13,
    11,
    7,
    17,
    13,
    5,
    17,
    11,
    7,
    13,
    17,
    3,
    11,
    17,
    13,
    5,
    17,
    7,
    11,
    13,
    17,
]


def filecontent(iteridx, oldcontent):
    """yield the NB_LINES lines of the file for iteration <iteridx>

    <oldcontent> is the list of lines of the previous content, or None when
    there is none, in which case every line is generated.

    Otherwise the lines whose index is not greater than ALWAYS_CHANGE_LINES
    are always regenerated, and any other line is regenerated only when its
    index, shifted by the iteration index divided by the line's SHIFT_GROUP
    divisor, is a multiple of OTHER_CHANGES. All remaining lines are taken
    unchanged from <oldcontent>.

    Regenerated lines form a chain: the first one holds the decimal iteration
    index (nothing for iteration 0) and each following one holds the
    `nextcontent` of the previous one. A newline is appended to each of them.
    """
    # seed of the chain of regenerated lines; iteration 0 starts empty
    seed = b'' if iteridx == 0 else b"%d" % iteridx
    nb_groups = len(SHIFT_GROUP)

    for lineno in range(NB_LINES):
        reuse_old = False
        if oldcontent is not None and lineno > ALWAYS_CHANGE_LINES:
            shift = iteridx // SHIFT_GROUP[lineno % nb_groups]
            reuse_old = (lineno - shift) % OTHER_CHANGES != 0

        if reuse_old:
            line = oldcontent[lineno]
        else:
            line = seed + b'\n'
            seed = nextcontent(seed)
        yield line


def merge_content(base, left, right):
    """yield the lines resulting from a three-way merge of two contents

    <base>, <left> and <right> are iterables of lines, walked in lockstep
    (iteration stops with the shortest one). For each position:

    - no side changed the line: the base line is kept,
    - only one side changed the line: that side's line is used,
    - both sides changed the line: a brand new line is produced from the two
      of them with `nextcontent` (yielded as is, nothing is appended to it).

    Similar to what the manifest would do.
    """
    for ancestor, ours, theirs in zip(base, left, right):
        ours_changed = ancestor != ours
        theirs_changed = ancestor != theirs
        if ours_changed and theirs_changed:
            # both sides touched the line: build a new one out of them
            yield nextcontent(ours + theirs)
        elif ours_changed:
            yield ours
        elif theirs_changed:
            yield theirs
        else:
            yield ancestor


def ancestors(graph, rev):
    """return the set of ancestors of revision <rev>, <rev> included

    <graph> maps each revision to an iterable of its parents, where None
    stands for a missing parent.
    """
    visited = {rev}
    pending = [rev]
    while pending:
        node = pending.pop()
        for parent in graph[node]:
            if parent is not None and parent not in visited:
                visited.add(parent)
                pending.append(parent)
    return visited


def gca(graph, left, right):
    """find the greatest common ancestor of <left> and <right>

    The answer is the highest numbered revision that is an ancestor of both
    revisions (each revision counts as its own ancestor).

    Note that the algorithm is naive and quadratic when run on all merges,
    however this should not be much of an issue given the current scale.
    """
    shared = ancestors(graph, left).intersection(ancestors(graph, right))
    return max(shared)


def make_one_content_fn(idx, base, left, right):
    """build a function that builds the content of revision <idx> on demand

    <base>, <left> and <right> are either None or zero-argument callables
    returning a content (a list of lines):

    - <left> is None: the content is generated without any previous content,
    - <base> is None: the content is generated on top of the one of <left>,
    - otherwise: the three contents are merged first and the new content is
      generated on top of the merge result.

    The dependencies are kept as references to make sure they are not
    garbage-collected until we use them. Once we have computed the current
    content, we make sure to drop their references to allow them to be
    garbage collected.

    The returned function takes no argument. It computes the content on its
    first call and returns that same list again on later calls.
    """

    # The arguments are captured as default values, so this function object
    # is the only thing holding on to them once the local names are deleted.
    def content_fn(idx=idx, base=base, left=left, right=right):
        if left is None:
            new = filecontent(idx, None)
        elif base is None:
            new = filecontent(idx, left())
        else:
            merged = merge_content(base(), left(), right())
            new = filecontent(idx, list(merged))
        return list(new)

    del idx
    del base
    del left
    del right

    # cached result of the first call
    value = None
    # Keep the computing function in a list, so that `final_fn` can take it
    # out (and let go of it) after its only use.
    cf = [content_fn]
    del content_fn

    def final_fn():
        nonlocal value
        if value is None:
            content_fn = cf.pop()
            value = list(content_fn())
            # drop the last reference to the function and to its dependencies
            del content_fn
        return value

    return final_fn


def build_content_graph(graph):
    """produce the file content of all revisions

    Returns a {revision: function} mapping where each function returns the
    content of its revision. The content is generated on demand and cached.
    Clean up the dictionary as you use it to reduce memory usage.

    <graph> must list every revision after its parents.
    """
    producers = {}
    for rev, parents in graph.items():
        first, second = parents
        base_fn = left_fn = right_fn = None
        if first is not None:
            left_fn = producers[first]
        if first is not None and second is not None:
            # a merge also needs the content of the common ancestor
            right_fn = producers[second]
            base_fn = producers[gca(graph, first, second)]
        producers[rev] = make_one_content_fn(rev, base_fn, left_fn, right_fn)
    return producers


# {revision: function returning the file content of that revision}; built at
# import time, the contents themselves are only computed when requested
CONTENT = build_content_graph(GRAPH)


def hg(command, *args):
    """call a mercurial command with appropriate config and arguments

    The command runs quietly through `chg` when CHGHG is set in the
    environment and through `hg` otherwise, with an empty HGRCPATH.

    Returns the value of `subprocess.check_call`, which raises
    `subprocess.CalledProcessError` when the command fails.
    """
    env = os.environ.copy()
    binary = 'chg' if 'CHGHG' in env else 'hg'

    if command == 'commit':
        # reproducible commit metadata
        options = ['--date', '0 0', '--user', 'test']
    elif command == 'merge':
        # avoid conflicts by picking the local variant
        options = ['--tool', ':merge-local']
    else:
        options = []

    env['HGRCPATH'] = ''
    argv = [binary, '--quiet', command, *options, *args]
    return subprocess.check_call(argv, env=env)


def write_repo(path):
    """commit every revision of GRAPH in the repository located at <path>

    The changesets are built as in-memory contexts and committed one after
    the other, under the repository lock and within a single transaction.
    Each content is removed from CONTENT when it gets committed.
    """
    ui = mercurial.ui.ui.load()
    repo = mercurial.repo.factory.repository(ui, path=path.encode('utf-8'))
    # revision number -> node of the matching commit
    nodes = {None: repo.nodeconstants.nullid}
    with repo.lock(), repo.transaction(b'bundle-generation'):
        for rev, (p1, p2) in GRAPH.items():
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (rev, NB_CHANGESET))

            def file_fn(repo, memctx, path):
                # the content is taken out of CONTENT so it can be freed
                lines = CONTENT.pop(rev)()
                return mercurial.context.memfilectx(
                    repo,
                    memctx,
                    path,
                    data=b''.join(lines),
                )

            if rev:
                message = b'commit #%d' % rev
            else:
                message = b'initial commit'

            ctx = mercurial.context.memctx(
                repo,
                (nodes[p1], nodes[p2]),
                message,
                [FILENAME.encode('ascii')],
                file_fn,
                user=b"test",
                date=(0, 0),
            )
            nodes[rev] = repo.commitctx(ctx)


def compute_md5(target):
    """return the hexadecimal MD5 digest of the file at <target>

    The file is hashed chunk by chunk, so the whole bundle never has to be
    held in memory at once.

    Raises OSError when the file cannot be opened or read.
    """
    chunk_size = 1 << 20
    digest = hashlib.md5()
    with open(target, 'rb') as bundle:
        # the empty read marking the end of the file stops the iteration
        for chunk in iter(lambda: bundle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def write_md5(target, md5):
    """record the hexadecimal digest <md5> in the file '<target>.md5'

    The digest is written as ASCII, followed by a newline. Any previous
    content of the file is replaced.
    """
    checksum_path = target + '.md5'
    with open(checksum_path, 'wb') as out:
        out.write(b'%s\n' % md5.encode('ascii'))


def read_md5(target):
    """return the digest recorded in the file '<target>.md5'

    The content of the file is returned as a string, stripped from the
    surrounding whitespace.
    """
    checksum_path = target + '.md5'
    with open(checksum_path, 'rb') as src:
        raw = src.read()
    return raw.strip().decode('ascii')


def up_to_date_target(target):
    """return True if <target> already exists with the right content

    The content is right when the MD5 digest of <target> is equal to the one
    recorded in '<target>.md5'. A missing or unreadable file, be it the target
    or its checksum, means the target is not up to date.
    """
    try:
        found = compute_md5(target)
        expected = read_md5(target)
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError: the checksum file does not hold ASCII, so it
        # cannot match; have the caller regenerate instead of crashing.
        return False
    return found == expected


def run(target, validate=False):
    """generate the bundle at <target>, then record or check its MD5 digest

    The repository is built in a temporary directory and bundled under a
    process specific name next to <target>, before being moved in place.

    Without <validate>, the digest of the new bundle is written to
    '<target>.md5'. With <validate>, the digest is instead compared to the one
    already stored in that file; note that <target> has been replaced by the
    new bundle by then.

    Returns 0 on success and 1 when the validation fails, in which case the
    mismatch is reported on stderr. The temporary directory is removed and the
    working directory restored in all cases.
    """
    cwd = os.getcwd()
    # The working directory changes below: anchor a relative target to the
    # original one so that it keeps pointing to the same file. An absolute
    # target is left untouched by `os.path.join`.
    target = os.path.join(cwd, target)
    tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')

    tmp_name = "%s-%d" % (target, os.getpid())
    try:
        os.chdir(tmpdir)
        hg(
            'init',
            '--config',
            'format.maxchainlen=%d' % NB_CHANGESET,
        )
        write_repo(tmpdir)
        hg('bundle', '--all', tmp_name, '--config', 'devel.bundle.delta=p1')
        # move the complete bundle in place in a single step
        os.replace(tmp_name, target)
        digest = compute_md5(target)
        if not validate:
            write_md5(target, digest)
        else:
            expected = read_md5(target)
            if expected != digest:
                msg = "bundle generated does not match the expected content\n"
                msg += "    expected: %s\n" % expected
                msg += "    got:      %s" % digest
                print(msg, file=sys.stderr)
                return 1
    finally:
        # Windows does not let you remove the current working directory
        os.chdir(cwd)
        shutil.rmtree(tmpdir)
        # only still there when something failed before the move
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    return 0


if __name__ == '__main__':
    cli_args = sys.argv[1:]
    # the bundle goes in the `cache` directory, a sibling of the directory
    # holding this script
    orig = os.path.realpath(os.path.dirname(sys.argv[0]))
    target = os.path.join(orig, os.pardir, 'cache', BUNDLE_NAME)
    lazy = '--lazy' in cli_args
    validate = '--validate' in cli_args

    # with --lazy, a bundle that matches its checksum is left untouched
    if lazy and up_to_date_target(target):
        exit_code = 0
    else:
        exit_code = run(target, validate=validate)
    sys.exit(exit_code)