File: download_rc_binaries.py

package info (click to toggle)
apache-arrow 23.0.1-4
  • links: PTS
  • area: main
  • in suites:
  • size: 76,368 kB
  • sloc: cpp: 654,608; python: 70,522; ruby: 45,964; ansic: 18,742; sh: 7,367; makefile: 633; javascript: 125; xml: 41
file content (364 lines) | stat: -rwxr-xr-x 11,615 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Download release binaries."""

import argparse
import concurrent.futures as cf
import functools
import json
import os
import random
import re
import subprocess
import time
import urllib.request

# Fallback worker count used by Downloader.download_files(num_parallel=None).
DEFAULT_PARALLEL_DOWNLOADS = 8


class Downloader:
    """Base class for fetching release binaries over HTTP.

    Subclasses supply ``URL_ROOT``, the root URL of an HTML directory
    listing to traverse (see ``Artifactory`` / ``Maven``), or override
    the listing/downloading methods entirely (see ``GitHub``).
    """

    def get_file_list(self, prefix, filter=None):
        """Return all file paths found under ``prefix``, recursively.

        Parameters
        ----------
        prefix : str
            Directory under ``URL_ROOT`` to start from; a trailing "/"
            is appended when missing.
        filter : callable, default None
            Optional predicate applied to each path component; entries
            for which it returns a falsy value are skipped (including
            whole subdirectories).
        """
        def traverse(directory, files, directories):
            url = f"{self.URL_ROOT}/{directory}"
            response = urllib.request.urlopen(url).read().decode()
            # Scrape the HTML directory listing for its links.
            paths = re.findall('<a href="(.+?)"', response)
            for path in paths:
                # Some listings emit absolute URLs; strip the base back off.
                path = re.sub(f"^{re.escape(url)}", "", path)
                if path == "../":
                    continue
                resolved_path = f"{directory}{path}"
                if filter and not filter(path):
                    continue
                if path.endswith("/"):
                    directories.append(resolved_path)
                else:
                    files.append(resolved_path)

        files = []
        if prefix != "" and not prefix.endswith("/"):
            prefix += "/"
        directories = [prefix]
        # Iterative depth-first traversal of the listing tree.
        while directories:
            directory = directories.pop()
            traverse(directory, files, directories)
        return files

    def download_files(self, files, dest=None, num_parallel=None, re_match=None):
        """
        Download the given files in parallel.

        Parameters
        ----------
        files : list
            File listing as returned by ``get_file_list``
        dest : str, default None
            Destination directory; defaults to the current working
            directory
        num_parallel : int, default None
            Number of files to download in parallel. If set to None,
            uses DEFAULT_PARALLEL_DOWNLOADS
        re_match : str, default None
            Optional regular expression; only matching entries are
            downloaded
        """
        if dest is None:
            dest = os.getcwd()
        if num_parallel is None:
            num_parallel = DEFAULT_PARALLEL_DOWNLOADS

        if re_match is not None:
            files = self._filter_files(files, re_match)

        if num_parallel == 1:
            # Serial path: no process-pool overhead, simpler to debug.
            for path in files:
                self._download_file(dest, path)
        else:
            parallel_map_terminate_early(
                functools.partial(self._download_file,
                                  dest), files, num_parallel
            )

    def _filter_files(self, files, re_match):
        """Keep only the paths matching ``re_match`` (anchored at the start)."""
        regex = re.compile(re_match)
        return [x for x in files if regex.match(x)]

    def _download_file(self, dest, path):
        """Download one listing path into ``dest``, creating subdirectories."""
        base, filename = os.path.split(path)

        dest_dir = os.path.join(dest, base)
        os.makedirs(dest_dir, exist_ok=True)

        dest_path = os.path.join(dest_dir, filename)

        print(f"Downloading {path} to {dest_path}")

        url = f"{self.URL_ROOT}/{path}"
        self._download_url(url, dest_path)

    def _download_url(self, url, dest_path, *, extra_args=None):
        """Fetch ``url`` into ``dest_path`` with curl, retrying on
        OpenSSL connection errors.

        Raises
        ------
        Exception
            If the download still fails after all retries.
        """
        cmd = [
            "curl",
            "--fail",
            "--location",
            "--retry",
            "5",
            *(extra_args or []),
            "--output",
            dest_path,
            url,
        ]
        # Retry subprocess in case it fails with OpenSSL Connection errors
        # https://issues.apache.org/jira/browse/INFRA-25274
        for attempt in range(5):
            if attempt > 0:
                delay = attempt * 3
                print(f"Waiting {delay} seconds before retrying {url}")
                time.sleep(delay)
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            # Popen without text=True yields bytes; decode so the substring
            # check below works (``"OpenSSL" in <bytes>`` raises TypeError)
            # and so the error message is readable.
            stdout = stdout.decode(errors="replace")
            stderr = stderr.decode(errors="replace")
            if proc.returncode != 0:
                try:
                    # Don't leave possibly partial file around
                    os.remove(dest_path)
                except OSError:
                    pass
                if "OpenSSL" not in stderr:
                    # We assume curl has already retried on other errors.
                    break
            else:
                return
        raise Exception(
            f"Downloading {url} failed\n" f"stdout: {stdout}\nstderr: {stderr}"
        )

    def _curl_version(self):
        """Return the installed curl version as an (major, minor, patch) tuple."""
        cmd = ["curl", "--version"]
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
        match = re.search(r"curl (\d+)\.(\d+)\.(\d+) ", out.decode())
        return (int(match.group(1)), int(match.group(2)), int(match.group(3)))


class Artifactory(Downloader):
    """Downloads Arrow release-candidate packages from the ASF Artifactory."""
    URL_ROOT = "https://packages.apache.org/artifactory/arrow"


class Maven(Downloader):
    URL_ROOT = (
        "https://repository.apache.org"
        + "/content/repositories/staging/org/apache/arrow"
    )


class GitHub(Downloader):
    """Downloads assets attached to a GitHub release tag.

    Unlike the listing-based downloaders, file entries here are
    ``(asset_name, download_url)`` tuples.
    """

    def __init__(self, repository, tag):
        """
        Parameters
        ----------
        repository : str
            GitHub "owner/name" repository to pull release assets from.
        tag : str
            Release tag whose assets are listed and downloaded.

        Raises
        ------
        ValueError
            If ``repository`` or ``tag`` is None.
        """
        super().__init__()
        if repository is None:
            raise ValueError("--repository is required")
        if tag is None:
            raise ValueError("--tag is required")
        self._repository = repository
        self._tag = tag
        # use the same name as the gh CLI
        self._token = os.environ.get("GH_TOKEN")

    def get_file_list(self, prefix, filter=None):
        """Return ``(name, url)`` pairs for each asset of the release.

        ``prefix`` is accepted for interface compatibility with the base
        class but is not used.
        """
        url = (
            f"https://api.github.com/repos/{self._repository}/"
            f"releases/tags/{self._tag}"
        )
        print("Fetching release from", url)
        headers = {
            "Accept": "application/vnd.github+json",
        }
        if self._token:
            # Authenticated requests get a much higher API rate limit.
            headers["Authorization"] = f"Bearer {self._token}"
        request = urllib.request.Request(
            url,
            method="GET",
            headers=headers,
        )
        raw_response = urllib.request.urlopen(request).read().decode()
        response = json.loads(raw_response)

        files = []
        for asset in response["assets"]:
            if filter and not filter(asset["name"]):
                continue
            # Don't use the API URL since it has a fairly strict rate
            # limit unless logged in, and we have a lot of tiny
            # artifacts
            url = (
                f"https://github.com/{self._repository}/"
                f"releases/download/{self._tag}/{asset['name']}"
            )
            files.append((asset["name"], url))
        return files

    def _filter_files(self, files, re_match):
        """Filter ``(name, url)`` pairs on the asset name."""
        regex = re.compile(re_match)
        return [x for x in files if regex.match(x[0])]

    def _download_file(self, dest, asset):
        """Download one ``(name, url)`` asset into ``dest``, skipping
        files that are already present."""
        name, url = asset

        os.makedirs(dest, exist_ok=True)
        dest_path = os.path.join(dest, name)

        # Check before logging a download, so the log doesn't claim a
        # download that never happens.
        if os.path.isfile(dest_path):
            print("Already downloaded", dest_path)
            return

        print(f"Downloading {url} to {dest_path}")

        # Random jitter to avoid tripping GitHub's rate limit with many
        # parallel workers.
        delay = random.randint(0, 3)
        print(f"Waiting {delay} seconds to avoid rate limit")
        time.sleep(delay)

        extra_args = [
            "--header",
            "Accept: application/octet-stream",
        ]
        if self._curl_version() >= (7, 71, 0):
            # Also retry 403s
            extra_args.append("--retry-all-errors")
        self._download_url(url, dest_path, extra_args=extra_args)


def parallel_map_terminate_early(f, iterable, num_parallel):
    """Apply ``f`` to every item of ``iterable`` in ``num_parallel``
    worker processes, aborting on the first failure.

    Parameters
    ----------
    f : callable
        Applied to one item at a time; must be picklable since it is
        sent to worker processes.
    iterable : iterable
        Items to process.
    num_parallel : int
        Number of worker processes.

    Raises
    ------
    Exception
        Re-raises the first exception observed in any worker.
    """
    with cf.ProcessPoolExecutor(num_parallel) as pool:
        tasks = [pool.submit(f, item) for item in iterable]

        for task in cf.as_completed(tasks):
            exc = task.exception()
            if exc is not None:
                # Best-effort: cancel work that has not started yet
                # (futures already running cannot be cancelled). Use a
                # distinct name so we don't clobber the failed future.
                for pending in tasks:
                    pending.cancel()
                raise exc


# Linux package types fetched from a shared "<type>-rc" tree on Artifactory.
ARROW_REPOSITORY_PACKAGE_TYPES = [
    "almalinux",
    "amazon-linux",
    "centos",
    "debian",
    "ubuntu",
]
# Package types fetched as GitHub release assets (see download_rc_binaries).
ARROW_STANDALONE_PACKAGE_TYPES = ["nuget", "python"]
ARROW_PACKAGE_TYPES = ARROW_REPOSITORY_PACKAGE_TYPES + ARROW_STANDALONE_PACKAGE_TYPES


def download_rc_binaries(
    version,
    rc_number,
    re_match=None,
    dest=None,
    num_parallel=None,
    target_package_type=None,
    repository=None,
    tag=None,
):
    """Download the release-candidate binaries for ``version``/``rc_number``.

    Each package type is routed to the appropriate downloader: standalone
    types (and "github") come from GitHub release assets, repository types
    from a shared Artifactory tree filtered down to this version, and any
    other type from a versioned Artifactory prefix.
    """
    version_string = f'{version}-rc{rc_number}'
    version_pattern = re.compile(r"\d+\.\d+\.\d+")

    def keep_for_version(path):
        # Paths without a version number are shared metadata; keep them.
        # Otherwise keep only entries for exactly this version.
        found = version_pattern.search(path)
        return found is None or found[0] == version

    if target_package_type:
        package_types = [target_package_type]
    else:
        package_types = ARROW_PACKAGE_TYPES

    for package_type in package_types:
        is_standalone = package_type in ARROW_STANDALONE_PACKAGE_TYPES
        if package_type == "github" or is_standalone:
            downloader = GitHub(repository, tag)
            prefix = ""
            path_filter = None
        elif package_type in ARROW_REPOSITORY_PACKAGE_TYPES:
            downloader = Artifactory()
            prefix = f'{package_type}-rc'
            path_filter = keep_for_version
        else:
            downloader = Artifactory()
            prefix = f'{package_type}-rc/{version_string}'
            path_filter = None

        listed = downloader.get_file_list(prefix, filter=path_filter)
        downloader.download_files(
            listed, re_match=re_match, dest=dest, num_parallel=num_parallel
        )


if __name__ == "__main__":
    # Command-line entry point: parse arguments and hand off to
    # download_rc_binaries.
    arg_parser = argparse.ArgumentParser(
        description="Download release candidate binaries")
    arg_parser.add_argument(
        "version", type=str,
        help="The version number")
    arg_parser.add_argument(
        "rc_number", type=int,
        help="The release candidate number, e.g. 0, 1, etc")
    arg_parser.add_argument(
        "-e", "--regexp", type=str, default=None,
        help=("Regular expression to match on file names "
              "to only download certain files"))
    arg_parser.add_argument(
        "--dest", type=str, default=os.getcwd(),
        help="The output folder for the downloaded files")
    arg_parser.add_argument(
        "--num_parallel", type=int, default=DEFAULT_PARALLEL_DOWNLOADS,
        help="The number of concurrent downloads to do")
    arg_parser.add_argument(
        "--package_type", type=str, default=None,
        help="The package type to be downloaded")
    arg_parser.add_argument(
        "--repository", type=str,
        help="The repository to pull from (required if --package_type=github)")
    arg_parser.add_argument(
        "--tag", type=str,
        help="The release tag to download (required if --package_type=github)")
    parsed = arg_parser.parse_args()

    download_rc_binaries(
        parsed.version,
        parsed.rc_number,
        dest=parsed.dest,
        re_match=parsed.regexp,
        num_parallel=parsed.num_parallel,
        target_package_type=parsed.package_type,
        repository=parsed.repository,
        tag=parsed.tag,
    )