1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import bz2
import gzip
import stat
import tarfile
from .files import BaseFile, File
# 2016-01-01T00:00:00+0000
DEFAULT_MTIME = 1451606400
# Python 3.9 contains this change:
# https://github.com/python/cpython/commit/674935b8caf33e47c78f1b8e197b1b77a04992d2
# which changes the output of tar creation compared to earlier versions.
# This code is used to generate tar files that are meant to be deterministic
# across versions of python (specifically, it's used as part of computing the hash
# of docker images, which needs to be identical between CI (which uses python 3.8),
# and developer environments (using arbitrary versions of python, at this point,
# most probably more recent than 3.9)).
# What we do is subclass TarInfo so that if used on python >= 3.9, it reproduces the
# behavior from python < 3.9.
# Here's how it goes:
# - the behavior in python >= 3.9 is the same as python < 3.9 when the type encoded
# in the tarinfo is CHRTYPE or BLKTYPE.
# - the value of the type is only compared in the context of choosing which behavior
# to take
# - we replace the type with the same value (so that using the value has no changes)
# but that pretends to be the same as CHRTYPE so that the condition that enables the
# old behavior is taken.
class HackedType(bytes):
    """A ``bytes`` value that also compares equal to ``tarfile.CHRTYPE``.

    The underlying bytes are unchanged, so using the value (joining,
    packing, writing) behaves exactly like the wrapped type byte. Only
    equality is altered.

    Note: because ``__eq__`` is overridden without ``__hash__``, instances
    are unhashable.
    """

    def __eq__(self, other):
        """Return True if *other* is ``tarfile.CHRTYPE`` or equals these bytes.

        For operands that ``bytes`` cannot compare, the result of the
        regular ``bytes`` comparison (``NotImplemented``) is returned so
        Python can fall back to its default handling.
        """
        if other == tarfile.CHRTYPE:
            return True
        # Delegate to the plain bytes comparison. Writing ``self == other``
        # here would dispatch back to this very method and recurse until
        # RecursionError for any operand other than CHRTYPE.
        return super().__eq__(other)
class TarInfo(tarfile.TarInfo):
    """``tarfile.TarInfo`` variant that wraps the header type in ``HackedType``.

    Only header creation is overridden; everything else is inherited.
    """

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Build the header block after swapping in a ``HackedType`` type.

        ``info`` is updated in place: its ``"type"`` entry is replaced by a
        ``HackedType`` holding the same bytes, then the stock
        ``tarfile.TarInfo._create_header`` does the actual work and its
        result is returned unchanged.
        """
        original_type = info["type"]
        info["type"] = HackedType(original_type)
        stock_create_header = tarfile.TarInfo._create_header
        return stock_create_header(info, format, encoding, errors)
def create_tar_from_files(fp, files):
    """Write a tar archive of ``files`` to ``fp`` in a deterministic way.

    ``files`` is a dict mapping names of files in the archive to local
    filesystem paths or ``mozpack.files.BaseFile`` instances. Values that
    are not ``BaseFile`` instances are wrapped in ``File``.

    Members are added in sorted order with uid/gid 0, empty user/group
    names and a constant mtime. The archive is written to ``fp``, a file
    handle opened for writing.

    Only regular files can be written.

    Raises ``ValueError`` if a member has the setuid or setgid bit set.

    FUTURE accept a filename argument (or create APIs to write files)
    """
    forbidden_bits = stat.S_ISUID | stat.S_ISGID

    # GNU_FORMAT is requested explicitly because the default format used by
    # tarfile has been changed in Python 3.8.
    archive = tarfile.open(
        name="",
        mode="w",
        fileobj=fp,
        dereference=True,
        format=tarfile.GNU_FORMAT,
    )
    with archive:
        for member_name, source in sorted(files.items()):
            if isinstance(source, BaseFile):
                member_file = source
            else:
                member_file = File(source)

            entry = TarInfo(member_name)
            entry.mode = member_file.mode or 0o0644
            entry.type = tarfile.REGTYPE

            # NOTE(review): the type was set to REGTYPE just above, so this
            # guard looks like it can never trigger as written.
            if not entry.isreg():
                raise ValueError("not a regular file: %s" % member_file)

            # Refuse setuid/setgid bits. The restriction is arbitrary, but
            # since members are owned by root:root (uid/gid 0), such bits
            # would be a glaring security hole if the archive were
            # uncompressed as root.
            if entry.mode & forbidden_bits:
                raise ValueError(
                    "cannot add file with setuid or setgid set: %s" % member_file
                )

            # Deterministic ownership and timestamp.
            entry.uid = entry.gid = 0
            entry.uname = entry.gname = ""
            entry.mtime = DEFAULT_MTIME

            entry.size = member_file.size()
            # tarfile wants to pass a size argument to read(), so hand it
            # the file object returned by open().
            content = member_file.open()
            archive.addfile(entry, content)
def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
    """Write a deterministic gzip-compressed tar archive of ``files`` to ``fp``.

    Thin wrapper around ``create_tar_from_files`` that layers gzip
    compression on top.

    ``fp`` should be a file handle opened for writing in binary mode. By
    the time this function returns, all data has been written to it.
    ``filename`` is stored in the gzip header (empty when not given) and
    ``compresslevel`` is passed through to ``gzip.GzipFile``.
    """
    header_name = filename or ""
    # The gzip header carries a modification time; pin it to a known
    # constant so the output is deterministic.
    with gzip.GzipFile(
        filename=header_name,
        mode="wb",
        fileobj=fp,
        compresslevel=compresslevel,
        mtime=DEFAULT_MTIME,
    ) as gz_stream:
        create_tar_from_files(gz_stream, files)
def create_tar_zst_from_files(fp, files, filename=None, compresslevel=9, threads=1):
    """Write a deterministic zstandard-compressed tar archive of ``files`` to ``fp``.

    Thin wrapper around ``create_tar_from_files`` that layers zstandard
    compression on top.

    ``fp`` should be a file handle opened for writing in binary mode. By
    the time this function returns, all data has been written to it.
    ``compresslevel`` and ``threads`` are passed to
    ``zstandard.ZstdCompressor``; ``filename`` is accepted but not used.
    """
    # Imported here rather than at module level, so zstandard is only
    # needed when this function is actually called.
    import zstandard

    compression_context = zstandard.ZstdCompressor(
        level=compresslevel,
        threads=threads,
    )
    zst_stream = compression_context.stream_writer(writer=fp)
    with zst_stream as sink:
        create_tar_from_files(sink, files)
class _BZ2Proxy:
    """Write-only file object that bzip2-compresses data into another file.

    ``fp`` is the destination handle, ``compressor`` the underlying
    ``bz2.BZ2Compressor`` and ``pos`` the number of compressed bytes
    forwarded to ``fp`` so far.
    """

    def __init__(self, fp, compresslevel=9):
        self.pos = 0
        self.fp = fp
        self.compressor = bz2.BZ2Compressor(compresslevel)

    def _forward(self, compressed):
        """Account for ``compressed`` in ``pos`` and pass it on to ``fp``."""
        self.pos += len(compressed)
        self.fp.write(compressed)

    def tell(self):
        """Return the count of compressed bytes emitted so far."""
        return self.pos

    def write(self, data):
        """Compress ``data`` and forward whatever output is ready."""
        self._forward(self.compressor.compress(data))

    def close(self):
        """Flush the compressor and forward the remaining output.

        The wrapped ``fp`` itself is not closed.
        """
        self._forward(self.compressor.flush())
def create_tar_bz2_from_files(fp, files, compresslevel=9):
    """Write a deterministic bzip2-compressed tar archive of ``files`` to ``fp``.

    Thin wrapper around ``create_tar_from_files`` that layers bzip2
    compression on top, in the same spirit as
    ``create_tar_gz_from_files()``.

    The compressor is flushed into ``fp`` once the archive has been
    written; ``fp`` itself is left open.
    """
    bz2_sink = _BZ2Proxy(fp, compresslevel=compresslevel)
    create_tar_from_files(bz2_sink, files)
    # Push out whatever the compressor is still buffering.
    bz2_sink.close()
|