1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
import zlib
try:
import lzma
except ImportError:
lzma = None
from .helpers import Buffer
# C declarations for the liblz4 functions used by the LZ4 class in this module.
# All three are declared ``nogil`` so they may be called inside ``with nogil:``.
cdef extern from "lz4.h":
    # compresses inputSize bytes from source into dest (at most maxOutputSize bytes);
    # the caller in this module treats a return value of 0 as failure.
    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    # decompresses inputSize bytes from source into dest (at most maxOutputSize bytes);
    # the caller in this module treats a negative return value as failure
    # (most likely: output buffer too small) and retries with a bigger buffer.
    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    # used in this module to size the output buffer before compressing inputSize bytes.
    int LZ4_compressBound(int inputSize) nogil
# module-level Buffer instance (see .helpers.Buffer); the LZ4 class asks it for
# an output area of the needed size via buffer.get(size).
buffer = Buffer(bytearray, size=0)
cdef class CompressorBase:
    """
    Common parent of all (de)compressor classes.

    Provides the format auto detection via a 2-byte ID header and the
    adding (compress) / stripping (decompress) of that header.
    """
    # reserved and not used;
    # child classes overwrite this with their own unique 2-bytes bytestring
    ID = b'\xFF\xFF'
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        """return True if data begins with the ID bytes of this class"""
        header = cls.ID
        return data.startswith(header)

    def __init__(self, **kwargs):
        """accept (and ignore) any keyword arguments"""
        pass

    def compress(self, data):
        """return data with the ID bytes prepended"""
        header = self.ID
        return header + data

    def decompress(self, data):
        """return data without its leading 2 ID bytes"""
        payload = data[2:]
        return payload
class CNONE(CompressorBase):
    """
    none - data is passed through uncompressed, only the ID header is handled
    """
    ID = b'\x00\x00'
    name = 'none'

    def compress(self, data):
        """return data with the ID bytes prepended, otherwise unchanged"""
        return super().compress(data)

    def decompress(self, data):
        """return the payload after the ID bytes, always as a bytes object"""
        payload = super().decompress(data)
        # slicing e.g. a memoryview does not give bytes, so convert if needed
        return payload if isinstance(payload, bytes) else bytes(payload)
class LZ4(CompressorBase):
    """
    raw LZ4 compression / decompression, done by liblz4.

    Notes:
        - the C calls run inside ``with nogil`` blocks, so other threads can run
        - the size-limited / "safe" lz4 functions are used, they get told the
          size of the output buffer
    """
    ID = b'\x01\x00'
    name = 'lz4'

    def __init__(self, **kwargs):
        """accept (and ignore) any keyword arguments"""
        pass

    def compress(self, idata):
        """
        lz4-compress idata and return the ID bytes followed by the compressed data.

        Raises Exception if liblz4 reports a compressed size of 0.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int in_len = len(idata)
        cdef int out_len
        cdef char *src = idata
        cdef char *dst
        # ask liblz4 how big the output buffer must be for this input size
        out_len = LZ4_compressBound(in_len)
        scratch = buffer.get(out_len)
        dst = <char *> scratch
        with nogil:
            out_len = LZ4_compress_limitedOutput(src, dst, in_len, out_len)
        if out_len == 0:
            raise Exception('lz4 compress failed')
        return super().compress(dst[:out_len])

    def decompress(self, idata):
        """
        strip the ID bytes from idata, lz4-decompress the rest and return it.

        The output buffer is enlarged and the call retried while liblz4 reports
        an error; Exception is raised once the buffer size exceeded 2 ** 30.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        idata = super().decompress(idata)
        cdef int in_len = len(idata)
        cdef int out_cap
        cdef int got
        cdef char *src = idata
        cdef char *dst
        # start with a bit more than 8MB (enough for the usual data sizes
        # yielded by the chunker) or with 3x the input size, if that is bigger,
        # so that resizing is not needed often.
        out_cap = max(int(1.1 * 2**23), in_len * 3)
        while True:
            scratch = buffer.get(out_cap)
            dst = <char *> scratch
            with nogil:
                got = LZ4_decompress_safe(src, dst, in_len, out_cap)
            if got < 0:
                if out_cap > 2 ** 30:
                    # this is insane, give up
                    raise Exception('lz4 decompress failed')
                # likely the buffer was too small, retry with a bigger one:
                out_cap = int(1.5 * out_cap)
                continue
            return dst[:got]
class LZMA(CompressorBase):
    """
    lzma compression / decompression (uses the lzma module, if it could be imported)
    """
    ID = b'\x02\x00'
    name = 'lzma'

    def __init__(self, level=6, **kwargs):
        """
        remember the compression level (lzma preset).

        Raises ValueError if the lzma module is not available.
        """
        super().__init__(**kwargs)
        self.level = level
        if lzma is None:
            raise ValueError('No lzma support found.')

    def compress(self, data):
        """return the ID bytes followed by the lzma-compressed data"""
        # integrity checking is already done elsewhere, so lzma does not need
        # to add its own check
        return super().compress(lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE))

    def decompress(self, data):
        """strip the ID bytes and return the lzma-decompressed rest"""
        return lzma.decompress(super().decompress(data))
class ZLIB(CompressorBase):
    """
    zlib compression / decompression (python stdlib)

    For compatibility, no ID bytes are added or stripped: the data is a plain
    zlib stream and detection looks at the zlib stream header instead.
    """
    ID = b'\x08\x00'  # not used here, see detect()
    # avoid all 0x.8.. IDs elsewhere!
    name = 'zlib'

    @classmethod
    def detect(cls, data):
        """
        return True if data starts with something that looks like a zlib header.

        Matches the misc. patterns 0x.8.. used by zlib: the low nibble of the
        first byte must be 8 (deflate) and the first two bytes, read as a
        big-endian 16 bit number, must be a multiple of 31.
        Data shorter than 2 bytes can not contain such a header and is
        reported as "not detected" (instead of failing when unpacking).
        """
        if len(data) < 2:
            return False
        cmf, flg = data[:2]
        is_deflate = (cmf & 0x0f) == 8
        check_ok = (cmf * 256 + flg) % 31 == 0
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
        """remember the zlib compression level"""
        super().__init__(**kwargs)
        self.level = level

    def compress(self, data):
        """return the zlib-compressed data (without ID bytes)"""
        # note: for compatibility no super call, do not add ID bytes
        return zlib.compress(data, self.level)

    def decompress(self, data):
        """return the zlib-decompressed data (there are no ID bytes to strip)"""
        # note: for compatibility no super call, do not strip ID bytes
        return zlib.decompress(data)
# maps the name of each compressor to its class
COMPRESSOR_TABLE = {cls.name: cls for cls in (CNONE, LZ4, ZLIB, LZMA)}
# classes in the order they shall be tried by auto detection: check fast stuff first
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA]
def get_compressor(name, **kwargs):
    """
    look up the compressor class registered under name in COMPRESSOR_TABLE and
    return an instance of it, created with the given keyword arguments.

    Raises KeyError if name is not in COMPRESSOR_TABLE.
    """
    return COMPRESSOR_TABLE[name](**kwargs)
class Compressor:
    """
    compresses using a compressor with given name and parameters
    decompresses everything we can handle (autodetect)
    """
    def __init__(self, name='none', **kwargs):
        """
        create the compressor registered under name, giving it the keyword arguments.

        The default is 'none' (pass-through), which is a name that is present
        in COMPRESSOR_TABLE. Raises KeyError for an unknown name.
        """
        self.params = kwargs
        self.compressor = get_compressor(name, **self.params)

    def compress(self, data):
        """compress data with the configured compressor"""
        return self.compressor.compress(data)

    def decompress(self, data):
        """
        decompress data with the first class of COMPRESSOR_LIST that detects
        its format from the first 2 bytes.

        Raises ValueError if no class detects the format.
        """
        hdr = bytes(data[:2])  # detect() does not work with memoryview
        for cls in COMPRESSOR_LIST:
            if cls.detect(hdr):
                return cls(**self.params).decompress(data)
        # format the header into the message (hdr is bytes, so %r is readable
        # even if data is a memoryview)
        raise ValueError('No decompressor for this data found: %r.' % hdr)
|