
|
"""
Semi-random access to bz2 compressed data.
"""
import bisect
from ._seekbzip2 import SeekBzip2
class SeekableBzip2File:
    """
    Filelike object supporting read-only semi-random access to bz2 compressed
    files for which an offset table (bz2t) has been generated by `bzip-table`.

    The object tracks a virtual position (`pos`) in the uncompressed data.
    `seek` only updates that position and sets `dirty`; the underlying
    `SeekBzip2` reader is repositioned lazily by `fix_dirty` on the next
    read. Data is returned as bytes.
    """

    def __init__(self, filename, table_filename, **kwargs):
        """
        Load the offset table and open the compressed file.

        `filename` is handed to `SeekBzip2`, `table_filename` is the offset
        table read by `init_table`. Extra keyword arguments are accepted and
        ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        self.init_bz2()
        self.pos = 0
        # Start dirty so the first read seeks the bz2 reader to block 0
        self.dirty = True
        self.closed = False

    def __enter__(self):
        """Return `self` so the object can be used in a `with` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file when leaving a `with` block."""
        self.close()

    def init_bz2(self):
        """Create the underlying `SeekBzip2` reader for `self.filename`."""
        self.seek_bz2 = SeekBzip2(self.filename)

    def init_table(self):
        """
        Parse the offset table into two parallel lists and compute `size`.

        Every non-blank line must start with two whitespace separated
        integers: the position of a block in the bz2 file and the length of
        that block when uncompressed. Blank lines are skipped.

        Raises `AssertionError` if a block has a non-positive length.
        """
        # Position in plaintext file
        self.table_positions = []
        # Position of corresponding block in bz2 file (bits)
        self.table_bz2positions = []
        pos = 0
        # `with` guarantees the table file is closed, even on a parse error
        with open(self.table_filename) as table:
            for line in table:
                fields = line.split()
                if not fields:
                    continue
                # Position of the compressed block in the bz2 file
                bz2_pos = int(fields[0])
                # Length of the block when uncompressed
                length = int(fields[1])
                # Raised explicitly (rather than via `assert`) so the check
                # is not stripped under `python -O`; the exception type is
                # kept for backward compatibility.
                if length <= 0:
                    raise AssertionError(
                        "Non-positive block length in offset table: %r" % (length,)
                    )
                self.table_positions.append(pos)
                self.table_bz2positions.append(bz2_pos)
                pos += length
        self.size = pos

    def close(self):
        """Close the underlying reader and mark this object as closed."""
        self.seek_bz2.close()
        self.closed = True

    def fix_dirty(self):
        """
        Bring the bz2 reader in sync with the virtual position `self.pos`.

        Seeks to the start of the block containing `self.pos`, then reads and
        discards the bytes preceding it inside that block.

        Raises `AssertionError` if fewer bytes than expected could be skipped.
        """
        # Our virtual position in the uncompressed data is out of sync
        # FIXME: If we're moving to a later position that is still in
        # the same block, we could just read and throw out bytes in the
        # compressed stream, less wasteful than backtracking
        chunk, offset = self.get_chunk_and_offset(self.pos)
        # Get the seek position for that chunk and seek to it
        self.seek_bz2.seek(self.table_bz2positions[chunk])
        # Consume bytes to move to the correct position. The read must not
        # live inside an `assert`: under `python -O` it would be removed and
        # the stream would be left at the start of the block.
        skipped = self.seek_bz2.read(offset)
        skipped_len = 0 if skipped is None else len(skipped)
        if skipped_len != offset:
            raise AssertionError(
                "Expected to skip %d bytes, skipped %d" % (offset, skipped_len)
            )
        # Update state
        self.dirty = False

    def _advance(self, val):
        """
        Update `self.pos` for data just returned by the bz2 reader.

        `None` from the reader means EOF: the position moves to `self.size`
        and `b""` is returned. Otherwise the position advances by `len(val)`
        and `val` is returned unchanged.
        """
        if val is None:
            # EOF
            self.pos = self.size
            return b""
        self.pos += len(val)
        return val

    def read(self, sizehint=-1):
        """
        Read up to `sizehint` bytes, or everything up to EOF if it is negative.

        Returns bytes; `b""` at EOF.
        """
        if sizehint >= 0:
            return self._read(sizehint)
        # Read to EOF in 1 MiB pieces
        chunks = []
        while True:
            val = self._read(1024 * 1024)
            if not val:
                break
            chunks.append(val)
        return b"".join(chunks)

    def _read(self, size):
        """Read at most `size` bytes from the current position."""
        if self.dirty:
            self.fix_dirty()
        return self._advance(self.seek_bz2.read(size))

    def readline(self, size=-1):
        """
        Read one line from the current position; `size` is passed to the
        underlying reader's `readline`. Returns `b""` at EOF.
        """
        if self.dirty:
            self.fix_dirty()
        return self._advance(self.seek_bz2.readline(size))

    def tell(self):
        """Return the current position in the uncompressed data."""
        return self.pos

    def get_chunk_and_offset(self, position):
        """
        Return `(chunk, offset)`: the index of the table entry whose block
        contains `position`, and the distance of `position` from the start
        of that block.
        """
        # Find the chunk that position is in using a binary search
        chunk = bisect.bisect(self.table_positions, position) - 1
        offset = position - self.table_positions[chunk]
        return chunk, offset

    def seek(self, offset, whence=0):
        """
        Move the virtual position in the uncompressed data.

        `whence` is 0 (absolute), 1 (relative to the current position) or
        2 (relative to the end). The bz2 reader itself is only repositioned
        on the next read.

        Raises `ValueError` for an unknown `whence` and `AssertionError` when
        the target lies outside `[0, size)`. Seeking to the current position
        is always a no-op, even at EOF.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.pos + offset
        elif whence == 2:
            # NOTE(review): the offset is *subtracted* from the size, unlike
            # `io.IOBase.seek`, which adds a (usually negative) offset.
            # Preserved because existing callers may rely on it.
            target_pos = self.size - offset
        else:
            raise ValueError("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.pos:
            return
        # Verify it is valid. Raised explicitly so the check survives
        # `python -O`; the exception type is kept for backward compatibility.
        if not 0 <= target_pos < self.size:
            raise AssertionError("Attempt to seek outside file")
        # Move the position
        self.pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the bzip2 file
        self.dirty = True

    # ---- File like methods ------------------------------------------------

    def __next__(self):
        """Return the next line, raising `StopIteration` at EOF."""
        ln = self.readline()
        if ln == b"":
            raise StopIteration()
        return ln

    def __iter__(self):
        """Iterate over the remaining lines."""
        return self

    def flush(self):
        """No-op; the file is read-only."""
        pass

    def readable(self):
        """Always `True`."""
        return True

    def readlines(self, sizehint=-1):
        """Return all remaining lines as a list; `sizehint` is ignored."""
        return list(self)

    def seekable(self):
        """Always `True`."""
        return True

    def xreadlines(self):
        """Return an iterator over the remaining lines."""
        return iter(self)

    def writable(self):
        """Always `False`."""
        return False
|