1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
|
#!/usr/bin/env python3
#
# This script is in the public domain
#
# Author: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
#
# This script accepts a tarball on standard input and filters it according to
# the same rules used by dpkg --path-exclude and --path-include, using command
# line options of the same name. The result is then printed on standard output.
#
# A tool like this should be written in C but libarchive has issues:
# https://github.com/libarchive/libarchive/issues/587
# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
# Should these issues get fixed, then a good template is tarfilter.c in the
# examples directory of libarchive.
#
# We are not using Perl either, because Archive::Tar slurps the whole tarball
# into memory.
#
# We could also use Go but meh...
# https://stackoverflow.com/a/59542307/784669
import tarfile
import sys
import argparse
import fnmatch
import re
class PathFilterAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "pathfilter", [])
regex = re.compile(fnmatch.translate(values))
items.append((self.dest, regex))
setattr(namespace, "pathfilter", items)
class PaxFilterAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "paxfilter", [])
regex = re.compile(fnmatch.translate(values))
items.append((self.dest, regex))
setattr(namespace, "paxfilter", items)
class TypeFilterAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "typefilter", [])
match values:
case "REGTYPE" | "0":
items.append(tarfile.REGTYPE)
case "LNKTYPE" | "1":
items.append(tarfile.LNKTYPE)
case "SYMTYPE" | "2":
items.append(tarfile.SYMTYPE)
case "CHRTYPE" | "3":
items.append(tarfile.CHRTYPE)
case "BLKTYPE" | "4":
items.append(tarfile.BLKTYPE)
case "DIRTYPE" | "5":
items.append(tarfile.DIRTYPE)
case "FIFOTYPE" | "6":
items.append(tarfile.FIFOTYPE)
case _:
raise ValueError("invalid type: %s" % values)
setattr(namespace, "typefilter", items)
class TransformAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "trans", [])
# This function mimics what src/transform.c from tar does
if not values.startswith("s"):
raise ValueError("regex must start with an 's'")
if len(values) <= 4:
# minimum regex: s/x//
raise ValueError("invalid regex (too short)")
d = values[1]
if values.startswith(f"s{d}{d}"):
raise ValueError("empty regex")
values = values.removeprefix(f"s{d}")
flags = 0
if values.endswith(f"{d}i"):
# trailing flags
flags = re.IGNORECASE
values = values.removesuffix(f"{d}i")
# This regex only finds non-empty tokens.
# Finding empty tokens would require a variable length look-behind
# or \K in order to find escaped delimiters which is not supported by
# the python re module.
tokens = re.findall(rf"(?:\\[\\{d}]|[^{d}])+", values)
match len(tokens):
case 0:
raise ValueError("invalid regex: not enough terms")
case 1:
repl = ""
case 2:
repl = tokens[1]
case _:
raise ValueError("invalid regex: too many terms: %s" % tokens)
items.append((re.compile(tokens[0], flags), repl))
setattr(namespace, "trans", items)
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""\
Filters a tarball on standard input by the same rules as the dpkg --path-exclude
and --path-include options and writes resulting tarball to standard output. See
dpkg(1) for information on how these two options work in detail. To reuse the
exact same semantics as used by dpkg, paths must be given as /path and not as
./path even though they might be stored as such in the tarball.
Secondly, filter out unwanted pax extended headers using --pax-exclude and
--pax-include. This is useful in cases where a tool only accepts certain xattr
prefixes. For example tar2sqfs only supports SCHILY.xattr.user.*,
SCHILY.xattr.trusted.* and SCHILY.xattr.security.* but not
SCHILY.xattr.system.posix_acl_default.*.
Both types of options use Unix shell-style wildcards:
* matches everything
? matches any single character
[seq] matches any character in seq
[!seq] matches any character not in seq
Thirdly, filter out files matching a specific tar archive member type using
--type-exclude. Valid type names are REGTYPE (regular file), LNKTYPE
(hardlink), SYMTYPE (symlink), CHRTYPE (character special), BLKTYPE (block
special), DIRTYPE (directory), FIFOTYPE (fifo) or their tar format flag value
(0-6, respectively).
Fourthly, transform the path of tar members using a sed expression just as with
GNU tar --transform.
Fifthly, strip leading directory components off of tar members. Just as with
GNU tar --strip-components, tar members that have less or equal components in
their path are not passed through.
Lastly, shift user id and group id of each entry by the value given by the
--idshift argument. The resulting uid or gid must not be negative.
""",
)
parser.add_argument(
"--path-exclude",
metavar="pattern",
action=PathFilterAction,
help="Exclude path matching the given shell pattern. "
"This option can be specified multiple times.",
)
parser.add_argument(
"--path-include",
metavar="pattern",
action=PathFilterAction,
help="Re-include a pattern after a previous exclusion. "
"This option can be specified multiple times.",
)
parser.add_argument(
"--pax-exclude",
metavar="pattern",
action=PaxFilterAction,
help="Exclude pax header matching the given globbing pattern. "
"This option can be specified multiple times.",
)
parser.add_argument(
"--pax-include",
metavar="pattern",
action=PaxFilterAction,
help="Re-include a pax header after a previous exclusion. "
"This option can be specified multiple times.",
)
parser.add_argument(
"--type-exclude",
metavar="type",
action=TypeFilterAction,
help="Exclude certain member types by their type. Choose types either "
"by their name (REGTYPE, LNKTYPE, SYMTYPE, CHRTYPE, BLKTYPE, DIRTYPE, "
"FIFOTYPE) or by their tar format flag values (0-6, respectively). "
"This option can be specified multiple times.",
)
parser.add_argument(
"--transform",
"--xform",
metavar="EXPRESSION",
action=TransformAction,
help="Use sed replace EXPRESSION to transform file names. "
"This option can be specified multiple times.",
)
parser.add_argument(
"--strip-components",
metavar="NUMBER",
type=int,
help="Strip NUMBER leading components from file names",
)
parser.add_argument(
"--idshift",
metavar="NUM",
type=int,
help="Integer value by which to shift the uid and gid of each entry",
)
args = parser.parse_args()
if (
not hasattr(args, "pathfilter")
and not hasattr(args, "paxfilter")
and not hasattr(args, "typefilter")
and not hasattr(args, "strip_components")
):
from shutil import copyfileobj
copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
exit()
# same logic as in dpkg/src/filters.c/filter_should_skip()
prefix_prog = re.compile(r"^([^*?[\\]*).*")
def path_filter_should_skip(member):
skip = False
if not hasattr(args, "pathfilter"):
return False
# normalize path and make it absolute by stripping off all leading
# dots and slashes and then prepending a slash
name = "/" + member.name.lstrip("./")
for t, r in args.pathfilter:
if r.match(name) is not None:
if t == "path_include":
skip = False
else:
skip = True
if skip and (member.isdir() or member.issym()):
for t, r in args.pathfilter:
if t != "path_include":
continue
prefix = prefix_prog.sub(r"\1", r.pattern)
prefix = prefix.rstrip("/")
if name.startswith(prefix):
return False
return skip
def pax_filter_should_skip(header):
if not hasattr(args, "paxfilter"):
return False
skip = False
for t, r in args.paxfilter:
if r.match(header) is None:
continue
if t == "pax_include":
skip = False
else:
skip = True
return skip
def type_filter_should_skip(member):
if not hasattr(args, "typefilter"):
return False
for t in args.typefilter:
if member.type == t:
return True
return False
# starting with Python 3.8, the default format became PAX_FORMAT but we
# are still explicit here in case of future changes.
with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
) as out_tar:
for member in in_tar:
if path_filter_should_skip(member):
continue
if type_filter_should_skip(member):
continue
if args.strip_components:
comps = member.name.split("/")
# just as with GNU tar, archive members with less or equal
# number of components are not passed through at all
if len(comps) <= args.strip_components:
continue
member.name = "/".join(comps[args.strip_components :])
member.pax_headers = {
k: v
for k, v in member.pax_headers.items()
if not pax_filter_should_skip(k)
}
if args.idshift:
if args.idshift < 0 and -args.idshift > member.uid:
print("uid cannot be negative", file=sys.stderr)
exit(1)
if args.idshift < 0 and -args.idshift > member.gid:
print("gid cannot be negative", file=sys.stderr)
exit(1)
member.uid += args.idshift
member.gid += args.idshift
if hasattr(args, "trans"):
for r, s in args.trans:
member.name = r.sub(s, member.name)
if member.isfile():
with in_tar.extractfile(member) as file:
out_tar.addfile(member, file)
else:
out_tar.addfile(member)
if __name__ == "__main__":
main()
|