1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
"""
Pyrex/C extension supporting `bx.misc.seekbzip2` (wrapping the low level
functions in `micro-bunzip.c`).
"""
# CPython bytes helpers used to build result buffers that the C decompressor
# writes into directly.
cdef extern from "Python.h":
    # Allocate a bytes object of the given size; the code below passes NULL
    # as the source pointer and fills the buffer afterwards.
    object PyBytes_FromStringAndSize(char *, Py_ssize_t)
    # Pointer to the internal character buffer of a bytes object.
    char *PyBytes_AsString(object)
# Low-level decompressor interface declared in `micro-bunzip.h`.
cdef extern from "micro-bunzip.h":
    # Decompressor state, as declared for use from this module.
    # NOTE(review): field meanings are inferred from their names and from how
    # this module uses them -- confirm against micro-bunzip.c.
    ctypedef struct bunzip_data:
        int in_fd
        # Input buffer bookkeeping; all three are zeroed when seeking.
        int inbufBitCount
        int inbufPos
        int inbufCount
        # Output bookkeeping; reset by this module around block boundaries.
        int writeCount
        unsigned int writeCRC
        int writeCurrent
        int writeCopies
        # Heap buffer released separately from the struct itself on close.
        unsigned int *dbuf

    # Set up a decompressor for `in_fd`, storing the new state through `bdp`.
    int start_bunzip(bunzip_data **bdp, int in_fd, char *inbuf, int len)
    # Consume `bits_wanted` bits from the input.
    unsigned int get_bits(bunzip_data *bd, char bits_wanted)
    # Advance to the next compressed block; this module treats a return of
    # -1 as "end of stream".
    int get_next_block(bunzip_data *bd)
    # Decompress up to `len` bytes into `outbuf`; this module treats a
    # negative return as an error and 0 as "current block exhausted".
    int read_bunzip(bunzip_data *bd, char *outbuf, int len)
    # Like read_bunzip but also stops after `stopchar`; the byte count is
    # reported through `gotcount_out` and the return value is a status code.
    int read_bunzip_to_char(bunzip_data *bd, char *outbuf, int len, int *gotcount_out, char stopchar)
# POSIX file positioning, used to jump to the byte containing a block start.
cdef extern from "unistd.h":
    # Not really the platform's definition of off_t; it is declared here as
    # a 64-bit unsigned integer.
    # NOTE(review): presumably the C compiler still uses the real typedef
    # from the header and this only guides Cython's conversions -- confirm.
    ctypedef unsigned long long off_t
    off_t lseek(int fildes, off_t offset, int whence)
# C allocator release function, used to free the decompressor state.
cdef extern from "stdlib.h":
    void free(void *ptr)
import os
import sys
cdef class SeekBzip2:
    """
    Seekable reader for a bzip2 file, backed by the `micro-bunzip.c`
    decompressor.

    The file is opened as a raw descriptor; `seek` positions the
    decompressor at the bit offset of a compressed block and `read` /
    `readline` return decompressed bytes from there. Call `close` to
    release the decompressor state and the descriptor.
    """
    cdef bunzip_data * bd
    cdef int file_fd
    cdef int at_eof

    def __cinit__( self ):
        # Start in a well-defined "closed" state so that `close` is safe even
        # if `__init__` fails or never runs. -1 is used for the descriptor
        # because 0 is a valid descriptor.
        self.bd = NULL
        self.file_fd = -1
        self.at_eof = 0

    def __init__( self, filename ):
        """
        Open `filename` and initialize the decompressor on it.

        Raises `OSError` if the file cannot be opened and `Exception` if
        `start_bunzip` reports a failure.
        """
        cdef int status
        self.at_eof = 0
        self.file_fd = os.open( filename, os.O_RDONLY )
        # Initialize bunzip_data from the file
        status = start_bunzip( &( self.bd ), self.file_fd, NULL, 0 )
        if status != 0:
            # Anything other than RETVAL_OK (0) means the decompressor is not
            # usable. Whatever start_bunzip may have allocated is not freed
            # here because its state after a failure is not known from this
            # module; dropping the pointer keeps later calls from touching it.
            self.bd = NULL
            os.close( self.file_fd )
            self.file_fd = -1
            raise Exception( "start_bunzip error %d" % status )

    def close( self ):
        """
        Free the decompressor state and close the underlying descriptor.

        Safe to call more than once; later calls do nothing.
        """
        cdef int fd
        if self.bd != NULL:
            free( self.bd.dbuf )
            free( self.bd )
            # Clear the pointer so a second close cannot double-free
            self.bd = NULL
        if self.file_fd >= 0:
            fd = self.file_fd
            # Forget the descriptor first so it is never closed twice, even
            # if os.close raises
            self.file_fd = -1
            os.close( fd )

    def seek( self, unsigned long long position ):
        """
        Seek the bunzip_data to a specific chunk (`position` is a bit offset
        and must correspond to the start of a compressed data block).

        Raises `ValueError` if the reader is closed and `Exception` if the
        underlying `lseek` fails.
        """
        cdef off_t n_byte
        cdef int n_bit
        if self.bd == NULL:
            raise ValueError( "I/O operation on closed file" )
        # Break position into bit and byte offsets
        n_byte = position // 8
        n_bit = position % 8
        # Seek the underlying file descriptor (whence 0 == SEEK_SET)
        if lseek( self.file_fd, n_byte, 0 ) != n_byte:
            raise Exception( "lseek of underlying file failed" )
        # Init the buffer at the right bit position: discard buffered input,
        # then consume the leading bits of the first byte
        self.bd.inbufBitCount = 0
        self.bd.inbufPos = 0
        self.bd.inbufCount = 0
        get_bits( self.bd, n_bit )
        # This ensures that the next read call will return 0, causing the
        # buffer to be re-initialized
        self.bd.writeCount = -1
        # Reset EOF tracking
        self.at_eof = 0

    def readline( self, int amount ):
        """
        Read decompressed bytes up to and including the next newline.

        `amount` limits the number of bytes returned; a negative value means
        no limit. Returns a bytes object, or `None` if the end of the stream
        was already reached by an earlier call.

        Raises `ValueError` if the reader is closed and `Exception` if the
        decompressor reports an unexpected status.
        """
        cdef object rval
        cdef char * p_rval
        cdef int gotcount
        cdef int status
        cdef int spaceleft
        cdef int desired
        gotcount = 0
        if self.bd == NULL:
            raise ValueError( "I/O operation on closed file" )
        # If already at EOF return None
        if self.at_eof:
            return None
        chunks = []
        # We have great difficulty resizing buffers, so we'll just create
        # one 8k string at a time. `p_rval` is the write position inside
        # `rval` and `spaceleft` the room after it, so the filled prefix of
        # `rval` is always `8192 - spaceleft` bytes long.
        rval = PyBytes_FromStringAndSize( NULL, 8192 )
        p_rval = PyBytes_AsString( rval )
        spaceleft = 8192
        while amount != 0:
            if amount > 0 and amount < spaceleft:
                desired = amount
            else:
                desired = spaceleft
            # ord( "\n" ) = 10
            status = read_bunzip_to_char( self.bd, p_rval, desired, &gotcount, 10 )
            if status == -9:
                # Reached the stop character (RETVAL_STOPCHAR == -9), so
                # we can stop
                chunks.append( rval[:8192-spaceleft+gotcount] )
                break
            elif status == -10:
                # Filled the requested space (RETVAL_BUFFER_FULL == -10).
                # Keep only the bytes actually written: when `amount` is
                # smaller than the buffer the tail is uninitialized.
                chunks.append( rval[:8192-spaceleft+gotcount] )
                amount = amount - gotcount
                if amount == 0:
                    # Got the desired amount
                    break
                # Otherwise create a new buffer and keep going
                rval = PyBytes_FromStringAndSize( NULL, 8192 )
                p_rval = PyBytes_AsString( rval )
                spaceleft = 8192
            elif status == -8:
                # No more data in the decomp buffer (RETVAL_END_OF_BLOCK == -8)
                if gotcount and p_rval[ gotcount - 1 ] == 10:
                    chunks.append( rval[:8192-spaceleft+gotcount] )
                    break
                # Update buffer info
                p_rval = p_rval + gotcount
                spaceleft = spaceleft - gotcount
                amount = amount - gotcount
                # Get the next block
                status = get_next_block( self.bd )
                if status == -1:
                    # Block is end of stream block (RETVAL_LAST_BLOCK == -1).
                    # The buffer may hold bytes from several blocks, so
                    # return the whole filled prefix, not just the last read.
                    self.at_eof = 1
                    chunks.append( rval[:8192-spaceleft] )
                    break
                self.bd.writeCRC = 0xffffffff
                self.bd.writeCopies = 0
                if amount == 0:
                    # The limit was reached exactly at a block boundary; the
                    # loop is about to end, so hand over what was buffered.
                    chunks.append( rval[:8192-spaceleft] )
                    break
            else:
                # Some other status
                raise Exception( "read_bunzip error %d" % status )
        # Return whatever we read (the chunks are bytes objects)
        return b"".join( chunks )

    def read( self, int amount ):
        """
        Read up to `amount` decompressed bytes.

        Returns a bytes object, which is shorter than `amount` when the end
        of the stream is reached, or `None` if the end of the stream was
        already reached by an earlier call.

        Raises `ValueError` if the reader is closed and `Exception` if the
        decompressor reports an error.
        """
        cdef object rval
        cdef char * p_rval
        cdef int gotcount
        cdef int totalcount
        cdef int status
        totalcount = 0
        if self.bd == NULL:
            raise ValueError( "I/O operation on closed file" )
        # If already at EOF return None
        if self.at_eof:
            return None
        # Create a new python bytes string large enough to hold the result
        rval = PyBytes_FromStringAndSize( NULL, amount )
        p_rval = PyBytes_AsString( rval )
        # Read into it
        while amount > 0:
            gotcount = read_bunzip( self.bd, p_rval, amount )
            if gotcount < 0:
                raise Exception( "read_bunzip error %d" % gotcount )
            elif gotcount == 0:
                # Current block is exhausted, move on to the next one
                status = get_next_block( self.bd )
                if status == -1:
                    # End of stream block (RETVAL_LAST_BLOCK == -1)
                    self.at_eof = 1
                    break
                self.bd.writeCRC = 0xffffffff
                self.bd.writeCopies = 0
            else:
                totalcount = totalcount + gotcount
                amount = amount - gotcount
                p_rval = p_rval + gotcount
        # Return whatever we read
        return rval[:totalcount]
|