File: libcutils.pyx

package info (click to toggle)
python-pysam 0.15.2%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 17,604 kB
  • sloc: ansic: 125,787; python: 7,782; sh: 284; makefile: 222; perl: 41
file content (418 lines) | stat: -rw-r--r-- 12,744 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
import types
import sys
import string
import re
import tempfile
import os
import io
from contextlib import contextmanager

from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
from cpython cimport PyBytes_Check, PyUnicode_Check
from cpython cimport array as c_array
from libc.stdlib cimport calloc, free
from libc.string cimport strncpy
from libc.stdint cimport INT32_MAX, int32_t
from libc.stdio cimport fprintf, stderr, fflush
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY

from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
    samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, samtools_set_optind

from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
    bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind

#####################################################################
# hard-coded constants
cdef int MAX_POS = (1 << 31) - 1

#################################################################
# Utility functions for quality string conversions
cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
    """convert a qualitystring to an array of quality values."""
    if input_str is None:
        return None
    qs = force_bytes(input_str)
    cdef char i
    return c_array.array('B', [i - offset for i in qs])


cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
    """convert an array of quality values to a string."""
    if qualities is None:
        return None
    cdef int x

    cdef c_array.array result
    result = c_array.clone(qualities, len(qualities), zero=False)

    for x from 0 <= x < len(qualities):
        result[x] = qualities[x] + offset
    if IS_PYTHON3:
        return force_str(result.tobytes())
    else:
        return result.tostring()


cpdef qualities_to_qualitystring(qualities, int offset=33):
    """convert a list or array of quality scores to the string
    representation used in the SAM format.

    Parameters
    ----------
    offset : int
        offset to be added to the quality scores to arrive at
        the characters of the quality string (default=33).

    Returns
    -------
    string
         a quality string

    """
    cdef char x
    if qualities is None:
        return None
    elif isinstance(qualities, c_array.array):
        return array_to_qualitystring(qualities, offset=offset)
    else:
        # tuples and lists
        return force_str("".join([chr(x + offset) for x in qualities]))


########################################################################
########################################################################
########################################################################
## Python 3 compatibility functions
########################################################################

cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3

cdef from_string_and_size(const char* s, size_t length):
    if IS_PYTHON3:
        return s[:length].decode("ascii")
    else:
        return s[:length]


# filename encoding (adapted from lxml.etree.pyx)
cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'


cdef bytes encode_filename(object filename):
    """Make sure a filename is 8-bit encoded (or None)."""
    if filename is None:
        return None
    elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2:
        # Added to support path-like objects
        return os.fsencode(filename)
    elif PyBytes_Check(filename):
        return filename
    elif PyUnicode_Check(filename):
        return filename.encode(FILENAME_ENCODING)
    else:
        raise TypeError("Argument must be string or unicode.")


cdef bytes force_bytes(object s, encoding="ascii"):
    """convert string or unicode object to bytes, assuming
    ascii encoding.
    """
    if s is None:
        return None
    elif PyBytes_Check(s):
        return s
    elif PyUnicode_Check(s):
        return s.encode(encoding)
    else:
        raise TypeError("Argument must be string, bytes or unicode.")


cdef charptr_to_str(const char* s, encoding="ascii"):
    if s == NULL:
        return None
    if PY_MAJOR_VERSION < 3:
        return s
    else:
        return s.decode(encoding)


cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
    if s == NULL:
        return None
    if PY_MAJOR_VERSION < 3:
        return s[:n]
    else:
        return s[:n].decode(encoding)


cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
    if s == NULL:
        return None
    else:
        return s


cdef force_str(object s, encoding="ascii"):
    """Return s converted to str type of current Python
    (bytes in Py2, unicode in Py3)"""
    if s is None:
        return None
    if PY_MAJOR_VERSION < 3:
        return s
    elif PyBytes_Check(s):
        return s.decode(encoding)
    else:
        # assume unicode
        return s


cpdef parse_region(contig=None,
                   start=None,
                   stop=None,
                   region=None,
                   reference=None,
                   end=None):
    """parse alternative ways to specify a genomic region. A region can
    either be specified by :term:`reference`, `start` and
    `end`. `start` and `end` denote 0-based, half-open intervals.
    
    :term:`reference` and `end` are also accepted for backward
    compatibility as synonyms for :term:`contig` and `stop`,
    respectively.

    Alternatively, a samtools :term:`region` string can be supplied.

    If any of the coordinates are missing they will be replaced by the
    minimum (`start`) or maximum (`end`) coordinate.

    Note that region strings are 1-based, while `start` and `end`
    denote an interval in python coordinates.

    Returns
    -------

    tuple :  a tuple of `reference`, `start` and `end`.

    Raises
    ------

    ValueError
       for invalid or out of bounds regions.

    """
    cdef int32_t rstart
    cdef int32_t rstop

    
    if reference is not None:
        if contig is not None:
           raise ValueError('contig and reference should not both be specified')
        contig = reference

    if contig is not None and region is not None:
        raise ValueError('contig/reference and region should not both be specified')
        
    if end is not None:
        if stop is not None:
            raise ValueError('stop and end should not both be specified')
        stop = end

    if contig is None and region is None:
        raise ValueError("neither contig nor region are given")

    rstart = 0
    rstop = MAX_POS
    if start is not None:
        try:
            rstart = start
        except OverflowError:
            raise ValueError('start out of range (%i)' % start)

    if stop is not None:
        try:
            rstop = stop
        except OverflowError:
            raise ValueError('stop out of range (%i)' % stop)

    if region:
        if ":" in region:
            contig, coord = region.split(":")
            parts = coord.split("-")
            rstart = int(parts[0]) - 1
            if len(parts) >= 1:
                rstop = int(parts[1])
        else:
            contig = region

    if rstart > rstop:
        raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
    if not 0 <= rstart < MAX_POS:
        raise ValueError('start out of range (%i)' % rstart)
    if not 0 <= rstop <= MAX_POS:
        raise ValueError('stop out of range (%i)' % rstop)

    return contig, rstart, rstop


def _pysam_dispatch(collection,
                    method,
                    args=None,
                    catch_stdout=True,
                    is_usage=False,
                    save_stdout=None):
    '''call ``method`` in samtools/bcftools providing arguments in args.
    
    By default, stdout is redirected to a temporary file using the patched
    C sources except for a few commands that have an explicit output option
    (typically: -o). In these commands (such as samtools view), this explicit
    option is used. If *is_usage* is True, then these explicit output options
    will not be used.

    Catching of stdout can be turned off by setting *catch_stdout* to
    False.
    '''

    if method == "index":
        if args and not os.path.exists(args[0]):
            raise IOError("No such file or directory: '%s'" % args[0])
            
    if args is None:
        args = []
    else:
        args = list(args)

    # redirect stderr to file
    stderr_h, stderr_f = tempfile.mkstemp()
    samtools_set_stderr(stderr_h)
    bcftools_set_stderr(stderr_h)
        
    # redirect stdout to file
    if save_stdout:
        stdout_f = save_stdout
        stdout_h = c_open(force_bytes(stdout_f),
                          O_WRONLY)
        if stdout_h == -1:
            raise IOError("error while opening {} for writing".format(stdout_f))

        samtools_set_stdout_fn(force_bytes(stdout_f))
        samtools_set_stdout(stdout_h)
        bcftools_set_stdout_fn(force_bytes(stdout_f))
        bcftools_set_stdout(stdout_h)
            
    elif catch_stdout:
        stdout_h, stdout_f = tempfile.mkstemp()
        MAP_STDOUT_OPTIONS = {
        "samtools": {
            "view": "-o {}",
            "mpileup": "-o {}",
            "depad": "-o {}",
            "calmd": "",  # uses pysam_stdout_fn
        },
            "bcftools": {}
        }
        
        stdout_option = None
        if collection == "bcftools":
            # in bcftools, most methods accept -o, the exceptions
            # are below:
            if method not in ("index", "roh", "stats"):
                stdout_option = "-o {}"
        elif method in MAP_STDOUT_OPTIONS[collection]:
            # special case - samtools view -c outputs on stdout
            if not(method == "view" and "-c" in args):
                stdout_option = MAP_STDOUT_OPTIONS[collection][method]

        if stdout_option is not None and not is_usage:
            os.close(stdout_h)
            samtools_set_stdout_fn(force_bytes(stdout_f))
            bcftools_set_stdout_fn(force_bytes(stdout_f))
            args.extend(stdout_option.format(stdout_f).split(" "))
        else:
            samtools_set_stdout(stdout_h)
            bcftools_set_stdout(stdout_h)
    else:
        samtools_set_stdout_fn("-")
        bcftools_set_stdout_fn("-")

    # setup the function call to samtools/bcftools main
    cdef char ** cargs
    cdef int i, n, retval, l
    n = len(args)
    method = force_bytes(method)
    collection = force_bytes(collection)
    args = [force_bytes(a) for a in args]

    # allocate two more for first (dummy) argument (contains command)
    cdef int extra_args = 0
    if method == b"index":
        extra_args = 1
    # add extra arguments for commands accepting optional arguments
    # such as 'samtools index x.bam [out.index]'
    cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
    cargs[0] = collection
    cargs[1] = method

    # create copies of strings - getopt for long options permutes
    # arguments
    for i from 0 <= i < n:
        l = len(args[i])
        cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
        strncpy(cargs[i + 2], args[i], l)
    
    # reset getopt. On OsX there getopt reset is different
    # between getopt and getopt_long
    if method in [b'index', b'cat', b'quickcheck',
                  b'faidx', b'kprobaln']:
        samtools_set_optind(1)
        bcftools_set_optind(1)
    else:
        samtools_set_optind(0)
        bcftools_set_optind(0)

    # call samtools/bcftools
    if collection == b"samtools":
        retval = samtools_main(n + 2, cargs)
    elif collection == b"bcftools":
        retval = bcftools_main(n + 2, cargs)

    for i from 0 <= i < n:
        free(cargs[i + 2])
    free(cargs)

    # get error messages
    def _collect(fn):
        out = []
        try:
            with open(fn, "r") as inf:
                out = inf.read()
        except UnicodeDecodeError:
            with open(fn, "rb") as inf:
                # read binary output
                out = inf.read()
        finally:
            os.remove(fn)
        return out

    samtools_unset_stderr()
    bcftools_unset_stderr()

    if save_stdout or catch_stdout:
        samtools_unset_stdout()
        bcftools_unset_stdout()

    out_stderr = _collect(stderr_f)
    if save_stdout:
        out_stdout = None
    elif catch_stdout:
        out_stdout = _collect(stdout_f)
    else:
        out_stdout = None
        
    return retval, out_stderr, out_stdout


__all__ = ["qualitystring_to_array",
           "array_to_qualitystring",
           "qualities_to_qualitystring"]