File: libctabix.pyx

package info (click to toggle)
python-pysam 0.10.0%2Bds-2
links: PTS, VCS
area: main
in suites: stretch
size: 14,196 kB
ctags: 10,087
sloc: ansic: 79,627; python: 8,569; sh: 282; makefile: 215; perl: 41
file content (1188 lines) | stat: -rw-r--r-- 38,981 bytes
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
# Cython wrapper for access to tabix indexed files in bgzf format
###############################################################################
# The principal classes and functions defined in this module are:
#
# class TabixFile  class wrapping tabix indexed files in bgzf format
#
# class asTuple  Parser class for tuples
# class asGT     Parser class for GTF formatted rows
# class asBed    Parser class for Bed formatted rows
# class asVCF    Parser class for VCF formatted rows
#
# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
#
# Additionally this module defines several additional classes that are part
# of the internal API. These are:
#
# class Parser                base class for parsers of tab-separated rows
# class tabix_file_iterator
# class TabixIterator         iterator class over rows in bgzf file
# class EmptyIterator
#
# For backwards compatibility, the following classes are also defined:
#
# class Tabixfile   equivalent to TabixFile
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.string cimport strerror
from libc.errno cimport errno
from posix.unistd cimport dup

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

from cpython.version cimport PY_MAJOR_VERSION

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size

cdef class Parser:

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        self.encoding = encoding

    def get_encoding(self):
        return self.encoding

    cdef parse(self, char * buffer, int length):
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:
   
    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser): 
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:
    
    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).
    
    Parameters
    ----------
    
    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi`

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.
        
    parser : :class:`pysam.Parser`
    
        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string

        The encoding passed to the parser

    Raises
    ------
    
    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  *args,
                  **kwargs ):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self._open(filename, mode, index, *args, **kwargs)
        self.encoding = encoding

    def _open( self, 
               filename,
               mode='r',
               index=None,
              ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)
        
        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        cfilename = self.filename_index
        with nogil:
            self.index = tbx_index_load(cfilename)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.
        
        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r", 
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self, 
              reference=None,
              start=None, 
              end=None, 
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched. 
        
        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.
        
        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.

        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0
                    
                if start < 0:
                    raise ValueError("start out of range (%i)" % end)
                elif start > end:
                    raise ValueError(
                        'start (%i) >= end (%i)' % (start, end))
                elif start == end:
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    raise ValueError("start out of range (%i)" % end)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                      HTS_IDX_START,
                                      0,
                                      0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)
            
        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None: 
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.
       
        .. note::
            The header is returned as an iterator presenting lines
            without the newline character.
        
        .. note::
            The header is only available for local files. For remote
            files an Attribute Error is raised.

        '''
        
        def __get__(self):
            if self.is_remote:
                raise AttributeError(
                    "the header is not available for remote files")
            return GZIteratorHead(self.filename)

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef char ** sequences
            cdef int nsequences
            
            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))
            
            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result
            
    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__( self ):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)


cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding
    
    def __iter__(self):
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self 

    cdef int __cnext__(self):
        '''iterate to next element.
        
        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            if retval < 0:
                break

            if self.buffer.s[0] != '#':
                break

        return retval

    def __next__(self): 
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return charptr_to_str(self.buffer.s, self.encoding)

    def next(self):
        return self.__next__()

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def next(self):
        raise StopIteration()

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, 
                 Parser parser):
        
        TabixIterator.__init__(self)
        self.parser = parser

    def __next__(self): 
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
            
            if retval < 0: 
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == '#':
            return self.buffer.s
        else:
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __init__(self, parser):
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in, 
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.
    
    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    if fd_src == 0:
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1
    
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            raise OSError("writing failed")
        
    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise OSError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise OSError("error %i when closing file %s" % (r, filename_in))


def tabix_index( filename, 
                 force = False,
                 seq_col = None, 
                 start_col = None, 
                 end_col = None,
                 preset = None,
                 meta_char = "#",
                 zerobased = False,
                 int min_shift = -1,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless
    *force* is set.

    The index will be built from coordinates
    in columns *seq_col*, *start_col* and *end_col*.

    The contents of *filename* have to be sorted by 
    contig and position - the method does not check
    if the file is sorted.

    Column indices are 0-based. Coordinates in the file
    are assumed to be 1-based.

    If *preset* is provided, the column coordinates
    are taken from a preset. Valid values for preset
    are "gff", "bed", "sam", "vcf", psltbl", "pileup".
    
    Lines beginning with *meta_char* and the first
    *line_skip* lines will be skipped.
    
    If *filename* does not end in ".gz", it will be automatically
    compressed. The original file will be removed and only the 
    compressed file will be retained. 

    If *filename* ends in *gz*, the file is assumed to be already
    compressed with bgzf.

    *min-shift* sets the minimal interval size to 1<<INT; 0 for the
    old tabix index. The default of -1 is changed inside htslib to 
    the old tabix default of 0.

    returns the filename of the compressed data

    '''
    
    if not os.path.exists(filename):
        raise IOError("No such file '%s'" % filename)

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    if not filename.endswith(".gz"): 
        tabix_compress(filename, filename + ".gz", force=force)
        os.unlink( filename )
        filename += ".gz"

    if not force and os.path.exists(filename + ".tbi"):
        raise IOError(
            "Filename '%s.tbi' already exists, use *force* to overwrite")

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #     comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (0, 1, 4, 5, ord('#'), 0),
        'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
        'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
        'sam' : (1, 3, 4, 0, ord('@'), 0),
        'vcf' : (2, 1, 2, 0, ord('#'), 0),
        'pileup': (3, 1, 2, 0, ord('#'), 0),
        }

    if preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col == None:
            end_col = -1
        preset = 0

        # note that tabix internally works with 0-based coordinates
        # and open/closed intervals.  When using a preset, conversion
        # is automatically taken care of.  Otherwise, the coordinates
        # are assumed to be 1-based closed intervals and -1 is
        # subtracted from the start coordinate. To avoid doing this,
        # set the TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | 0x10000

        conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
                
    cdef tbx_conf_t conf
    conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data


    fn = encode_filename(filename)
    cdef char *cfn = fn
    with nogil:
        tbx_index_build(cfn, min_shift, &conf)
    
    return filename

# #########################################################
# cdef class tabix_file_iterator_old:
#     '''iterate over ``infile``.

#     This iterator is not safe. If the :meth:`__next__()` method is called 
#     after ``infile`` is closed, the result is undefined (see ``fclose()``).

#     The iterator might either raise a StopIteration or segfault.
#     '''


#     def __cinit__(self, 
#                   infile, 
#                   Parser parser,
#                   int buffer_size = 65536 ):

#         cdef int fd = PyObject_AsFileDescriptor( infile )
#         if fd == -1: raise ValueError( "I/O operation on closed file." )
#         self.infile = fdopen( fd, 'r')

#         if self.infile == NULL: raise ValueError( "I/O operation on closed file." )

#         self.buffer = <char*>malloc( buffer_size )        
#         self.size = buffer_size
#         self.parser = parser

#     def __iter__(self):
#         return self

#     cdef __cnext__(self):

#         cdef char * b
#         cdef size_t nbytes
#         b = self.buffer

#         while not feof( self.infile ):
#             nbytes = getline( &b, &self.size, self.infile)

#             # stop at first error or eof
#             if (nbytes == -1): break
#             # skip comments
#             if (b[0] == '#'): continue

#             # skip empty lines
#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

#             # make sure that entry is complete
#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
#                 result = b
#                 raise ValueError( "incomplete line at %s" % result )

#             # make sure that this goes fully through C
#             # otherwise buffer is copied to/from a
#             # Python object causing segfaults as
#             # the wrong memory is freed
#             return self.parser.parse( b, nbytes )

#         raise StopIteration

#     def __dealloc__(self):
#         free(self.buffer)

#     def __next__(self):
#         return self.__cnext__()

#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################
# cdef buildGzipError(void *gzfp):
#     cdef int errnum = 0
#     cdef char *s = gzerror(gzfp, &errnum)
#     return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.
    '''

    def __cinit__(self, 
                  infile, 
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format; 
        # in this case gzread will directly read from the file without decompression. 
        # When reading, this will be detected automatically by looking 
        # for the magic two-byte gzip header. 
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL: 
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh) 
        
        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
            
            if retval < 0: 
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s
            
            # skip comments
            if (b[0] == '#'):
                continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)
        
    def __next__(self):
        return self.__cnext__()

    def next(self):
        return self.__cnext__()
    

class tabix_generic_iterator:
    '''iterate over ``infile``.
    
    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):
        
        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break
            
            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            assert b[nbytes] == '\0'

            # skip comments
            if b[0] == '#':
                continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
                continue
            
            # make sure that entry is complete
            if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
                raise ValueError("incomplete line at %s" % line)
            
            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)            

        raise StopIteration

    # python version - required for python 2.7
    def next(self):
        return self.__next__()

def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.
    
    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).

    """
    if PY_MAJOR_VERSION >= 3:
        return tabix_generic_iterator(infile, parser)
    else:
        return tabix_file_iterator(infile, parser)
        
    # file objects can use C stdio
    # used to be: isinstance( infile, file):
    # if PY_MAJOR_VERSION >= 3:
    #     if isinstance( infile, io.IOBase ):
    #         return tabix_copy_iterator( infile, parser )
    #     else:
    #         return tabix_generic_iterator( infile, parser )
    # else:
#        if isinstance( infile, file ):
#            return tabix_copy_iterator( infile, parser )
#        else:
#            return tabix_generic_iterator( infile, parser )
    
cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass


__all__ = [
    "tabix_index", 
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator", 
    "tabix_generic_iterator", 
    "tabix_file_iterator", 
]