File: sync_s3.py

# -*- coding: utf-8 -*-
"""
Sync Media to S3
================

Django command that scans all files in your settings.MEDIA_ROOT and
settings.STATIC_ROOT folders and uploads them to S3 with the same directory
structure.

This command can optionally do the following, but each is off by default:
* gzip compress any CSS and JavaScript files it finds and add the appropriate
  'Content-Encoding' header.
* set a far-future 'Expires' header for optimal caching.
* upload only media or only static files.
* use any other provider compatible with Amazon S3.
* set an ACL other than 'public-read'.

Note: This script requires the Python boto library and valid Amazon Web
Services API keys.

Required settings.py variables:
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''
AWS_BUCKET_NAME = ''

When you call this command with the `--renamegzip` option, it appends the
'.gz' extension to each gzipped file name. Safari, however, does not
recognize '.gz' files, so your site will break in that browser. To work
around this, set a different extension (such as '.jgz') in the
`SYNC_S3_RENAME_GZIP_EXT` setting.
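
Example settings.py configuration (the key and bucket values below are
placeholders, and the optional values are illustrative rather than
required):

    AWS_ACCESS_KEY_ID = 'your-access-key-id'
    AWS_SECRET_ACCESS_KEY = 'your-secret-access-key'
    AWS_BUCKET_NAME = 'my-bucket'

    # Optional settings read by this command
    SYNC_S3_PREFIX = ''                   # default prefix for --prefix
    AWS_S3_HOST = ''                      # default host for --s3host
    AWS_DEFAULT_ACL = 'public-read'       # default ACL for --acl
    AWS_CLOUDFRONT_DISTRIBUTION = ''      # distribution id; needed for --invalidate
    SYNC_S3_RENAME_GZIP_EXT = '.jgz'      # extension for --renamegzip (default: '.gz')
    FILTER_LIST = ['.DS_Store', '.svn']   # replaces the built-in exclusion filters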

Command options are:
  -p PREFIX, --prefix=PREFIX
                        The prefix to prepend to the path on S3.
  --gzip                Enables gzipping CSS and JavaScript files.
  --expires             Enables setting a far-future expires header.
  --force               Skip the file mtime check to force upload of all
                        files.
  --filter-list         Override default directory and file exclusion
                        filters. (enter as a comma-separated list)
  --renamegzip          Enables renaming of gzipped files by appending '.gz'
                        to the original file name. This way your original
                        assets will not be replaced by the gzipped ones.
                        You can change the extension by setting the
                        `SYNC_S3_RENAME_GZIP_EXT` var in your settings.py
                        file.
  --invalidate          Invalidates the objects in CloudFront after uploading
                        files to S3.
  --media-only          Only MEDIA_ROOT files will be uploaded to S3.
  --static-only         Only STATIC_ROOT files will be uploaded to S3.
  --s3host              Override the default S3 host.
  --acl                 Override default ACL settings ('public-read' if
                        settings.AWS_DEFAULT_ACL is not defined).
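
Example invocations (illustrative; combine options as needed):

  # Sync both MEDIA_ROOT and STATIC_ROOT with default settings
  python manage.py sync_s3

  # Upload only static files, gzipped and with far-future Expires headers
  python manage.py sync_s3 --static-only --gzip --expires

  # Upload media under a key prefix and invalidate CloudFront afterwards
  python manage.py sync_s3 --media-only --prefix=media --invalidate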

TODO:
 * Use fnmatch (or regex) to allow more complex FILTER_LIST rules.

"""
import datetime
import email.utils
import gzip
import mimetypes
import os
import time
from io import BytesIO

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

from django_extensions.management.utils import signalcommand

# Make sure boto is available
try:
    import boto
    import boto.exception
    HAS_BOTO = True
except ImportError:
    HAS_BOTO = False


class Command(BaseCommand):
    # Extra variables to avoid passing these around
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''
    AWS_BUCKET_NAME = ''
    AWS_CLOUDFRONT_DISTRIBUTION = ''
    SYNC_S3_RENAME_GZIP_EXT = ''

    DIRECTORIES = ''
    FILTER_LIST = ['.DS_Store', '.svn', '.hg', '.git', 'Thumbs.db']
    GZIP_CONTENT_TYPES = (
        'text/css',
        'application/javascript',
        'application/x-javascript',
        'text/javascript'
    )

    uploaded_files = []
    upload_count = 0
    skip_count = 0

    help = 'Syncs the complete MEDIA_ROOT and STATIC_ROOT structure and files to S3 into the given bucket name.'
    args = 'bucket_name'

    can_import_settings = True

    def add_arguments(self, parser):
        super(Command, self).add_arguments(parser)
        parser.add_argument('-p', '--prefix',
                    dest='prefix',
                    default=getattr(settings, 'SYNC_S3_PREFIX', ''),
                    help="The prefix to prepend to the path on S3.")
        parser.add_argument('-d', '--dir',
                    dest='dir',
                    help="Custom static root directory to use")
        parser.add_argument('--s3host',
                    dest='s3host',
                    default=getattr(settings, 'AWS_S3_HOST', ''),
                    help="The s3 host (enables connecting to other "
                    "providers/regions)")
        parser.add_argument('--acl',
                    dest='acl',
                    default=getattr(settings, 'AWS_DEFAULT_ACL',
                                    'public-read'),
                    help="Enables to override default acl (public-read).")
        parser.add_argument('--gzip',
                    action='store_true', dest='gzip', default=False,
                    help="Enables gzipping CSS and JavaScript files.")
        parser.add_argument('--renamegzip',
                    action='store_true', dest='renamegzip', default=False,
                    help="Enables renaming of gzipped assets to have '.gz' "
                    "appended to the filename.")
        parser.add_argument('--expires',
                    action='store_true', dest='expires', default=False,
                    help="Enables setting a far future expires header.")
        parser.add_argument('--force',
                    action='store_true', dest='force', default=False,
                    help="Skip the file mtime check to force upload of "
                    "all files.")
        parser.add_argument('--filter-list', dest='filter_list',
                    action='store', default='',
                    help="Override default directory and file exclusion "
                    "filters. (enter as a comma-separated list)")
        parser.add_argument('--invalidate', dest='invalidate', default=False,
                    action='store_true',
                    help='Invalidates the associated objects in CloudFront')
        parser.add_argument('--media-only', dest='media_only', default=False,
                    action='store_true',
                    help="Only MEDIA_ROOT files will be uploaded to S3")
        parser.add_argument('--static-only', dest='static_only', default=False,
                    action='store_true',
                    help="Only STATIC_ROOT files will be uploaded to S3")

    @signalcommand
    def handle(self, *args, **options):
        if not HAS_BOTO:
            raise ImportError("The boto Python library is not installed.")

        # Check for AWS keys in settings
        if not hasattr(settings, 'AWS_ACCESS_KEY_ID') or not hasattr(settings, 'AWS_SECRET_ACCESS_KEY'):
            raise CommandError('Missing AWS keys from settings file.  Please supply both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.')
        else:
            self.AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
            self.AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY

        if not hasattr(settings, 'AWS_BUCKET_NAME'):
            raise CommandError('Missing bucket name from settings file. Please add the AWS_BUCKET_NAME to your settings file.')
        else:
            if not settings.AWS_BUCKET_NAME:
                raise CommandError('AWS_BUCKET_NAME cannot be empty.')
        self.AWS_BUCKET_NAME = settings.AWS_BUCKET_NAME

        if not hasattr(settings, 'MEDIA_ROOT'):
            raise CommandError('MEDIA_ROOT must be set in your settings.')
        else:
            if not settings.MEDIA_ROOT:
                raise CommandError('MEDIA_ROOT must be set in your settings.')

        self.AWS_CLOUDFRONT_DISTRIBUTION = getattr(settings, 'AWS_CLOUDFRONT_DISTRIBUTION', '')

        self.SYNC_S3_RENAME_GZIP_EXT = \
            getattr(settings, 'SYNC_S3_RENAME_GZIP_EXT', '.gz')

        self.verbosity = int(options.get('verbosity'))
        self.prefix = options.get('prefix')
        self.do_gzip = options.get('gzip')
        self.rename_gzip = options.get('renamegzip')
        self.do_expires = options.get('expires')
        self.do_force = options.get('force')
        self.invalidate = options.get('invalidate')
        self.DIRECTORIES = options.get('dir')
        self.s3host = options.get('s3host')
        self.default_acl = options.get('acl')
        self.FILTER_LIST = getattr(settings, 'FILTER_LIST', self.FILTER_LIST)
        filter_list = options.get('filter_list')
        if filter_list:
            # command line option overrides default filter_list and
            # settings.filter_list
            self.FILTER_LIST = filter_list.split(',')

        self.media_only = options.get('media_only')
        self.static_only = options.get('static_only')
        # Get directories
        if self.media_only and self.static_only:
            raise CommandError("Can't use --media-only and --static-only together; omit both to sync MEDIA_ROOT and STATIC_ROOT.")
        elif self.media_only:
            self.DIRECTORIES = [settings.MEDIA_ROOT]
        elif self.static_only:
            self.DIRECTORIES = [settings.STATIC_ROOT]
        elif self.DIRECTORIES:
            self.DIRECTORIES = [self.DIRECTORIES]
        else:
            self.DIRECTORIES = [settings.MEDIA_ROOT, settings.STATIC_ROOT]

        # Now call the syncing method to walk the selected directories and
        # upload all files found.
        self.sync_s3()

        # Sending the invalidation request to CloudFront if the user
        # requested this action
        if self.invalidate:
            self.invalidate_objects_cf()

        print("")
        print("%d files uploaded." % self.upload_count)
        print("%d files skipped." % self.skip_count)

    def open_cf(self):
        """
        Returns an open connection to CloudFront
        """
        return boto.connect_cloudfront(
            self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)

    def invalidate_objects_cf(self):
        """
        Splits the invalidation request into groups of 1000 objects
        """
        if not self.AWS_CLOUDFRONT_DISTRIBUTION:
            raise CommandError(
                'An object invalidation was requested but the variable '
                'AWS_CLOUDFRONT_DISTRIBUTION is not present in your settings.')

        # We can't send more than 1000 objects in the same invalidation
        # request.
        chunk = 1000

        # Connecting to CloudFront
        conn = self.open_cf()

        # Splitting the object list
        objs = self.uploaded_files
        chunks = [objs[i:i + chunk] for i in range(0, len(objs), chunk)]

        # Invalidation requests
        for paths in chunks:
            conn.create_invalidation_request(
                self.AWS_CLOUDFRONT_DISTRIBUTION, paths)

    def sync_s3(self):
        """
        Walks the media/static directories and syncs files to S3
        """
        bucket, key = self.open_s3()
        for directory in self.DIRECTORIES:
            for root, dirs, files in os.walk(directory):
                self.upload_s3((bucket, key, self.AWS_BUCKET_NAME, directory), root, files)

    def compress_string(self, s):
        """Gzip the given bytestring and return the compressed bytes."""
        zbuf = BytesIO()
        zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
        zfile.write(s)
        zfile.close()
        return zbuf.getvalue()

    def get_s3connection_kwargs(self):
        """Returns connection kwargs as a dict"""
        kwargs = {}
        if self.s3host:
            kwargs['host'] = self.s3host
        return kwargs

    def open_s3(self):
        """
        Opens connection to S3 returning bucket and key
        """
        conn = boto.connect_s3(
            self.AWS_ACCESS_KEY_ID,
            self.AWS_SECRET_ACCESS_KEY,
            **self.get_s3connection_kwargs())
        try:
            bucket = conn.get_bucket(self.AWS_BUCKET_NAME)
        except boto.exception.S3ResponseError:
            bucket = conn.create_bucket(self.AWS_BUCKET_NAME)
        return bucket, boto.s3.key.Key(bucket)

    def upload_s3(self, arg, dirname, names):
        """
        Uploads the files of a single directory to S3; called by sync_s3()
        for each directory yielded by os.walk(). This is where most of the
        work happens.
        """
        bucket, key, bucket_name, root_dir = arg

        # Skip directories we don't want to sync
        if os.path.basename(dirname) in self.FILTER_LIST:
            # prevent walk from processing subfiles/subdirs below the ignored one
            del names[:]
            return

        # Later we assume the root directory ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep

        for file in names:
            headers = {}

            if file in self.FILTER_LIST:
                continue  # Skip files we don't want to sync

            filename = os.path.join(dirname, file)
            if os.path.isdir(filename):
                continue  # Don't try to upload directories

            file_key = filename[len(root_dir):]
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)

            # Check if file on S3 is older than local file, if so, upload
            if not self.do_force:
                s3_key = bucket.get_key(file_key)
                if s3_key:
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6])
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print("File %s hasn't been modified since last being uploaded" % file_key)
                        continue

            # File is newer, let's process and upload
            if self.verbosity > 0:
                print("Uploading %s..." % file_key)

            content_type = mimetypes.guess_type(filename)[0]
            if content_type:
                headers['Content-Type'] = content_type
            else:
                headers['Content-Type'] = 'application/octet-stream'

            file_obj = open(filename, 'rb')
            file_size = os.fstat(file_obj.fileno()).st_size
            filedata = file_obj.read()
            if self.do_gzip:
                # Gzipping only if file is large enough (>1K is recommended)
                # and only if file is a common text type (not a binary file)
                if file_size > 1024 and content_type in self.GZIP_CONTENT_TYPES:
                    filedata = self.compress_string(filedata)
                    if self.rename_gzip:
                        # If rename_gzip is True, rename the file by
                        # appending the configured extension (e.g. '.gz')
                        # to the original file name.
                        file_key = '%s%s' % (
                            file_key, self.SYNC_S3_RENAME_GZIP_EXT)
                    headers['Content-Encoding'] = 'gzip'
                    if self.verbosity > 1:
                        print("\tgzipped: %dk to %dk" % (file_size / 1024, len(filedata) / 1024))
            if self.do_expires:
                # HTTP/1.0
                headers['Expires'] = '%s GMT' % (email.utils.formatdate(time.mktime((datetime.datetime.now() + datetime.timedelta(days=365 * 2)).timetuple())))
                # HTTP/1.1
                headers['Cache-Control'] = 'max-age=%d' % (3600 * 24 * 365 * 2)
                if self.verbosity > 1:
                    print("\texpires: %s" % headers['Expires'])
                    print("\tcache-control: %s" % headers['Cache-Control'])

            try:
                key.name = file_key
                key.set_contents_from_string(filedata, headers, replace=True,
                                             policy=self.default_acl)
            except boto.exception.S3CreateError as e:
                print("Failed: %s" % e)
            except Exception as e:
                print(e)
                raise
            else:
                self.upload_count += 1
                self.uploaded_files.append(file_key)

            file_obj.close()