/* * Copyright 2017 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.turbine.zip; import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.primitives.UnsignedInts; import java.io.ByteArrayInputStream; import java.io.Closeable; import java.io.IOError; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.channels.FileChannel.MapMode; import java.nio.charset.CharacterCodingException; import java.nio.charset.CharsetDecoder; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.Iterator; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import java.util.zip.ZipException; /** * A fast, minimal, and somewhat garbage zip implementation. This exists because graal doesn't yet * support {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile} * instead. * *

If you're reading this because you're fixing a bug, sorry. * *

(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the * entire file from the beginning to scan the local headers, whereas this implementation (and * ZipFile) only read the central directory. Iterating over the entries (but not reading the data) * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so * most time is spent just scanning entry names. And rt.jar is an interesting test case because * every compilation has to read it, and it dominates the size of the classpath for small * compilations. * *

Implementation notes: * *

Leading garbage may be supported, since the archive is read backwards using the central * directory. Archives modified with zip -A may not be supported. Trailing garbage is not * supported. *
UTF-8 is the only supported encoding. *
STORED and DEFLATE are the only supported compression methods. *
Zip files larger than Integer.MAX_VALUE bytes are not supported. *
The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end * header is present only if ENDTOT in EOCD header is 0xFFFF. *

*/ public final class Zip { static final int ZIP64_ENDSIG = 0x06064b50; static final int ZIP64_LOCSIG = 0x07064b50; static final int LOCHDR = 30; // LOC header size static final int CENHDR = 46; // CEN header size static final int ENDHDR = 22; // END header size static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size static final int ZIP64_ENDHDR = 56; // ZIP64 end header size static final int ENDTOT = 10; // total number of entries static final int ENDSIZ = 12; // central directory size in bytes static final int ENDCOM = 20; // zip file comment length static final int CENHOW = 10; // compression method static final int CENLEN = 24; // uncompressed size static final int CENSIZ = 20; // compressed size static final int CENNAM = 28; // filename length static final int CENEXT = 30; // extra field length static final int CENCOM = 32; // comment length static final int CENOFF = 42; // LOC header offset static final int LOCEXT = 28; // extra field length static final int ZIP64_ENDSIZ = 40; // central directory size in bytes static final int ZIP64_MAGICCOUNT = 0xFFFF; /** Iterates over a zip archive. */ static class ZipIterator implements Iterator { /** A reader for the backing storage. */ private final FileChannel chan; private final Path path; private int cdindex = 0; private final MappedByteBuffer cd; private final CharsetDecoder decoder = UTF_8.newDecoder(); ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) { this.path = path; this.chan = chan; this.cd = cd; } @Override public boolean hasNext() { return cdindex < cd.limit(); } /* Returns a {@link Entry} for the current CEN entry. */ @Override public Entry next() { // TODO(cushon): technically we're supposed to throw NSEE checkSignature(path, cd, cdindex, 1, 2, "CENSIG"); int nameLength = cd.getChar(cdindex + CENNAM); int extLength = cd.getChar(cdindex + CENEXT); int commentLength = cd.getChar(cdindex + CENCOM); Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex); cdindex += CENHDR + nameLength + extLength + commentLength; return entry; } public String string(ByteBuffer buf, int offset, int length) { buf = buf.duplicate(); buf.position(offset); buf.limit(offset + length); decoder.reset(); try { return decoder.decode(buf).toString(); } catch (CharacterCodingException e) { throw new IOError(e); } } } /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */ public static class ZipIterable implements Iterable, Closeable { private final Path path; private final FileChannel chan; private final MappedByteBuffer cd; public ZipIterable(Path path) throws IOException { this.path = path; this.chan = FileChannel.open(path, StandardOpenOption.READ); // Locate the EOCD long size = chan.size(); if (size < ENDHDR) { throw new ZipException("invalid zip archive"); } long eocdOffset = size - ENDHDR; MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR); eocd.order(ByteOrder.LITTLE_ENDIAN); int index = 0; int commentSize = 0; if (!isSignature(eocd, 0, 5, 6)) { // The archive may contain a zip file comment; keep looking for the EOCD. long start = Math.max(0, size - ENDHDR - 0xFFFF); eocd = chan.map(MapMode.READ_ONLY, start, (size - start)); eocd.order(ByteOrder.LITTLE_ENDIAN); index = (int) ((size - start) - ENDHDR); while (index > 0) { index--; eocd.position(index); if (isSignature(eocd, index, 5, 6)) { commentSize = (int) ((size - start) - ENDHDR) - index; eocdOffset = start + index; break; } } } checkSignature(path, eocd, index, 5, 6, "ENDSIG"); int totalEntries = eocd.getChar(index + ENDTOT); long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ)); int actualCommentSize = eocd.getChar(index + ENDCOM); if (commentSize != actualCommentSize) { throw new ZipException( String.format( "zip file comment length was %d, expected %d", commentSize, actualCommentSize)); } // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator. if (totalEntries == ZIP64_MAGICCOUNT) { // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors. long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR; // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations // read the full EOCD records and compare them. long zip64cdsize = zip64cdsize(chan, zip64eocdOffset); if (zip64cdsize != -1) { eocdOffset = zip64eocdOffset; cdsize = zip64cdsize; } else { // If we couldn't find a zip64 EOCD at a fixed offset, either it doesn't exist // or there was a zip64 extensible data sector, so try going through the // locator. This approach doesn't work if data was prepended to the archive // without updating the offset in the locator. MappedByteBuffer zip64loc = chan.map(MapMode.READ_ONLY, size - ENDHDR - ZIP64_LOCHDR, ZIP64_LOCHDR); zip64loc.order(ByteOrder.LITTLE_ENDIAN); if (zip64loc.getInt(0) == ZIP64_LOCSIG) { zip64eocdOffset = zip64loc.getLong(8); zip64cdsize = zip64cdsize(chan, zip64eocdOffset); if (zip64cdsize != -1) { eocdOffset = zip64eocdOffset; cdsize = zip64cdsize; } } } } this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize); cd.order(ByteOrder.LITTLE_ENDIAN); } static long zip64cdsize(FileChannel chan, long eocdOffset) throws IOException { MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ZIP64_ENDHDR); zip64eocd.order(ByteOrder.LITTLE_ENDIAN); if (zip64eocd.getInt(0) == ZIP64_ENDSIG) { return zip64eocd.getLong(ZIP64_ENDSIZ); } return -1; } @Override public Iterator iterator() { return new ZipIterator(path, chan, cd); } @Override public void close() throws IOException { chan.close(); } } /** An entry in a zip archive. */ public static class Entry { private final Path path; private final FileChannel chan; private final String name; private final ByteBuffer cd; private final int cdindex; public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) { this.path = path; this.chan = chan; this.name = name; this.cd = cd; this.cdindex = cdindex; } /** The entry name. */ public String name() { return name; } /** The entry data. */ public byte[] data() { // Read the offset and variable lengths from the central directory and then try to map in the // data section in one shot. long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF)); int nameLength = cd.getChar(cdindex + CENNAM); int extLength = cd.getChar(cdindex + CENEXT); int compression = cd.getChar(cdindex + CENHOW); switch (compression) { case 0x8: return getBytes( offset, nameLength, extLength, UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)), /*deflate=*/ true); case 0x0: return getBytes( offset, nameLength, extLength, UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)), /*deflate=*/ false); default: throw new AssertionError( String.format("unsupported compression mode: 0x%x", compression)); } } /** * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header * reports more extra field data than the central directory. */ static final int EXTRA_FIELD_SLACK = 128; private byte[] getBytes( long offset, int nameLength, int cenExtLength, long size, boolean deflate) { if (size > Integer.MAX_VALUE) { throw new IllegalArgumentException("unsupported zip entry size: " + size); } try { MappedByteBuffer fc = chan.map( MapMode.READ_ONLY, offset, Math.min( LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK, chan.size() - offset)); fc.order(ByteOrder.LITTLE_ENDIAN); checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG"); int locExtLength = fc.getChar(LOCEXT); if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) { // If the local header's extra fields don't match the central directory and we didn't // leave enough slac, re-map the data section with the correct extra field length. fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size); fc.order(ByteOrder.LITTLE_ENDIAN); } else { // Otherwise seek past the local header, name, and extra fields to the data. fc.position(LOCHDR + nameLength + locExtLength); fc.limit((int) (LOCHDR + nameLength + locExtLength + size)); } byte[] bytes = new byte[(int) size]; fc.get(bytes); if (deflate) { bytes = new InflaterInputStream( new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true)) .readAllBytes(); } return bytes; } catch (IOException e) { throw new IOError(e); } } } static void checkSignature( Path path, MappedByteBuffer buf, int index, int i, int j, String name) { if (!isSignature(buf, index, i, j)) { throw new AssertionError( String.format( "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)", path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index))); } } static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) { return (buf.get(index) == 'P') && (buf.get(index + 1) == 'K') && (buf.get(index + 2) == i) && (buf.get(index + 3) == j); } private Zip() {} }