1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
|
"""
Semi-random access to LZO (lzop) compressed data.
"""
import struct
from io import BytesIO
try:
import lzo
except Exception:
pass
from bx_extras import lrucache
class SeekableLzopFile:
    """
    Filelike object supporting read-only semi-random access to LZO compressed
    files for which a block offset table has been generated.

    The table is a whitespace-separated text file. A line starting with ``s``
    gives the uncompressed block size; each line starting with ``o`` gives the
    offset of a compressed block in the data file, its compressed size and its
    uncompressed size. Decompression relies on the ``lzo`` module being
    importable.
    """

    def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
        """
        Open `filename` for reading, using the block table in `table_filename`.

        If `block_cache_size` is greater than zero, that many decompressed
        blocks are kept in an LRU cache. Extra keyword arguments are accepted
        and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        self.file = open(self.filename, "rb")
        # `dirty` means `file_pos` was changed by seek() and the in-memory
        # block has not been repositioned yet.
        self.dirty = True
        self.closed = False
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache(block_cache_size)
        else:
            self.cache = None

    def init_table(self):
        """
        Parse the offset table into `block_size`, `block_info` and `nblocks`.
        """
        self.block_size = None
        # One (offset, compressed_size, size) tuple per block; `offset` is the
        # position of the block in the compressed file (in bytes)
        self.block_info = []
        # Use a context manager so the table file handle is not leaked
        with open(self.table_filename) as table:
            for line in table:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of failing on fields[0]
                    continue
                if fields[0] == "s":
                    self.block_size = int(fields[1])
                if fields[0] == "o":
                    offset = int(fields[1])
                    compressed_size = int(fields[2])
                    size = int(fields[3])
                    self.block_info.append((offset, compressed_size, size))
        self.nblocks = len(self.block_info)

    def close(self):
        """
        Close the underlying compressed file.
        """
        self.file.close()
        self.closed = True

    def load_block(self, index):
        """
        Return the decompressed bytes of block `index`, using the cache if
        one is configured.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[index]
        # Get the block of compressed data
        self.file.seek(offset)
        data = self.file.read(csize)
        # Need to prepend a header for python-lzo module (silly)
        data = b"".join((b"\xf0", struct.pack("!I", size), data))
        value = lzo.decompress(data)
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty(self):
        """
        Position `current_block` at `file_pos`, loading a different block if
        the position falls outside the one currently held.
        """
        chunk, offset = self.get_block_and_offset(self.file_pos)
        if self.current_block_index != chunk:
            self.current_block = BytesIO(self.load_block(chunk))
            self.current_block_index = chunk
        self.current_block.seek(offset)
        self.dirty = False

    def get_block_and_offset(self, index):
        """
        Map an uncompressed position to (block index, offset within block).
        """
        block, within = divmod(index, self.block_size)
        return int(block), int(within)

    def seek(self, offset, whence=0):
        """
        Move the file pointer to a particular offset.

        `whence` 0 is absolute and 1 is relative to the current position;
        seeking from the end (2) is not supported and raises `Exception`, as
        does any other value.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception("seek from end not supported")
        else:
            raise Exception("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the decompressed block
        self.dirty = True
        # The position changed, so a previously reached end of file no longer
        # applies; otherwise readline() would keep returning b"" after a rewind
        self.at_eof = False

    def tell(self):
        """
        Return the current position in the uncompressed data.
        """
        return self.file_pos

    def read(self, sizehint=-1):
        """
        Read up to `sizehint` bytes, or everything up to the end of the data
        when `sizehint` is negative or None.
        """
        if sizehint is None or sizehint < 0:
            chunks = []
            while True:
                val = self._read(1024 * 1024)
                if not val:
                    break
                chunks.append(val)
            return b"".join(chunks)
        return self._read(sizehint)

    def _read(self, size):
        """
        Read up to `size` bytes, crossing block boundaries as needed.
        """
        if self.dirty:
            self.fix_dirty()
        parts = []
        while size:
            part = self.current_block.read(size)
            size -= len(part)
            if part:
                parts.append(part)
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # Current block exhausted, continue with the next one
                self.current_block_index += 1
                self.current_block = BytesIO(self.load_block(self.current_block_index))
        val = b"".join(parts)
        self.file_pos += len(val)
        return val

    def readline(self):
        """
        Read one line (including its trailing newline, if any), joining
        pieces of a line that spans several blocks. Returns b"" at the end of
        the data.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return b""
        rval = []
        while True:
            line = self.current_block.readline()
            self.file_pos += len(line)
            rval.append(line)
            # Indexing bytes yields an int on Python 3, so compare with
            # endswith() rather than `line[-1] == b"\n"`
            if line.endswith(b"\n"):
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # Line continues in the next block
                self.current_block_index += 1
                self.current_block = BytesIO(self.load_block(self.current_block_index))
        return b"".join(rval)

    def __next__(self):
        """
        Return the next line, raising StopIteration at the end of the data.
        """
        line = self.readline()
        if line == b"":
            raise StopIteration
        return line

    def __iter__(self):
        return self

    def flush(self):
        """
        No-op; the file is read-only.
        """
        pass

    def readable(self):
        return True

    def seekable(self):
        return True

    def writable(self):
        return False
# --- Factor out ---
# Nine-byte signature; bytes 1-3 spell ASCII "LZO".
# NOTE(review): presumably the lzop file magic -- confirm against the lzop format.
MAGIC = b"\x89LZO\x00\r\n\x1a\n"

# Single-bit flag masks, written as shifts so the bit position is explicit.
# NOTE(review): names appear to mirror the lzop header flags -- confirm
# their exact meaning against the lzop source before relying on them.
F_ADLER32_D = 1 << 0
F_ADLER32_C = 1 << 1
F_H_EXTRA_FIELD = 1 << 6
F_H_GMTDIFF = 1 << 7
F_CRC32_D = 1 << 8
F_CRC32_C = 1 << 9
F_MULTIPART = 1 << 10
F_H_FILTER = 1 << 11
F_H_CRC32 = 1 << 12

# Sanity check on the struct formats used in this module: "!H" must be
# 2 bytes and "!I" must be 4 bytes.
assert struct.calcsize("!H") == 2
assert struct.calcsize("!I") == 4
class UnpackWrapper:
    """
    Thin wrapper around a binary file object that adds struct-based decoding.
    """

    def __init__(self, file):
        # Any object with a `read(n)` method returning bytes
        self.file = file

    def read(self, amt):
        """
        Read `amt` bytes straight from the wrapped file.
        """
        return self.file.read(amt)

    def get(self, fmt):
        """
        Read exactly as many bytes as the struct format `fmt` needs and
        return the first unpacked value.
        """
        nbytes = struct.calcsize(fmt)
        raw = self.file.read(nbytes)
        return struct.unpack(fmt, raw)[0]
|