File: processpool-download

#!/usr/bin/env python
"""
Downloads using s3transfer.processpool.ProcessPoolDownloader

Usage
=====

NOTE: Make sure you run ``pip install -r requirements-dev.txt`` before running.

To download a file::

    ./processpool-download -f myfilename -b mybucket -k mykey

To download a prefix recursively to a directory::

    ./processpool-download -d mydirname -b mybucket -p myprefix/
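
Either mode also accepts the optional ``-n``/``--num-processes`` and
``-c``/``--mb-chunksize`` flags (defined below) to tune the number of
worker processes and the part size::

    ./processpool-download -f myfilename -b mybucket -k mykey -n 16 -c 16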

"""

import argparse
import os

import botocore.session

from s3transfer.processpool import ProcessPoolDownloader, ProcessTransferConfig

MB = 1024 * 1024


def download(bucket, key, filename, num_processes, mb_chunksize):
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    with ProcessPoolDownloader(config=config) as downloader:
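        # download_file() queues the transfer and returns a future
        # immediately; result() blocks until the download completes or
        # raises the exception that failed the transfer.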
        future = downloader.download_file(
            bucket=bucket, key=key, filename=filename
        )
        future.result()


def recursive_download(bucket, prefix, dirname, num_processes, mb_chunksize):
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    s3 = botocore.session.get_session().create_client('s3')
    with ProcessPoolDownloader(config=config) as downloader:
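        # No result() calls are needed on the returned futures; exiting
        # the context manager waits for all queued downloads to complete.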
        paginator = s3.get_paginator('list_objects')
        for response in paginator.paginate(Bucket=bucket, Prefix=prefix):
            contents = response.get('Contents', [])
            for content in contents:
                key = content['Key']
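                # Strip the prefix so the directory tree under dirname
                # mirrors the S3 key layout beneath the prefix.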
                filename = os.path.join(dirname, key[len(prefix) :])
                parent_dirname = os.path.dirname(filename)
                os.makedirs(parent_dirname, exist_ok=True)
                # The expected size is provided so that an additional
                # HeadObject call does not need to be made for each
                # downloaded object.
                downloader.download_file(
                    bucket,
                    key,
                    filename=filename,
                    expected_size=content['Size'],
                )


def main():
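    # Reuse the module docstring as the CLI usage text.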
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument(
        '-b', '--bucket', required=True, help='The S3 bucket to download from'
    )
    single_file_group = parser.add_argument_group('Single file downloads')
    single_file_group.add_argument(
        '-k', '--key', help='The key to download from'
    )
    single_file_group.add_argument(
        '-f', '--filename', help='The name of file to download to'
    )
    recursive_file_group = parser.add_argument_group(
        'Recursive file downloads'
    )
    recursive_file_group.add_argument(
        '-p', '--prefix', help='The prefix to download from'
    )
    recursive_file_group.add_argument(
        '-d', '--dirname', help='The directory to download to'
    )
    parser.add_argument(
        '-n',
        '--num-processes',
        type=int,
        default=10,
        help='The number of processes to use for the download. 10 by default.',
    )
    parser.add_argument(
        '-c',
        '--mb-chunksize',
        type=int,
        default=8,
        help='The part size in MB to use for the download. 8 MB by default.',
    )
    args = parser.parse_args()
    if args.filename and args.key:
        download(
            args.bucket,
            args.key,
            args.filename,
            args.num_processes,
            args.mb_chunksize,
        )
    elif args.prefix and args.dirname:
        recursive_download(
            args.bucket,
            args.prefix,
            args.dirname,
            args.num_processes,
            args.mb_chunksize,
        )
    else:
        raise ValueError(
            'Either --key and --filename must be provided or '
            '--prefix and --dirname must be provided.'
        )


if __name__ == '__main__':
    main()