File: _twobit.pyx

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (137 lines) | stat: -rw-r--r-- 4,403 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from cpython.version cimport PY_MAJOR_VERSION


cdef extern from "Python.h":
    char * PyBytes_AsString( object )
    object PyBytes_FromStringAndSize( char *, Py_ssize_t )

cdef extern from "ctype.h":
    int tolower( int )
    
cdef extern from "string.h":
    void * memset( void *, int, size_t )

import struct
import sys
from bisect import bisect


cdef char* valToNt
valToNt = "TCAG"

def read( file, seq, int fragStart, int fragEnd, bint do_mask ):
    """
    Stolen directly from Jim Kent's twoBit.c
    """
    cdef int packedStart, packedEnd, packByteCount
    cdef int pOff, pStart, pEnd
    cdef int midStart, remainder, partCount
    cdef int i, j, s, e
    cdef char * packed
    cdef char * dna
    cdef char * dna_orig
    cdef char partial
    packedStart = (fragStart>>2);
    packedEnd = ((fragEnd+3)>>2);
    packByteCount = packedEnd - packedStart;
    # Empty string in which to write unpacked DNA

    dna_py = PyBytes_FromStringAndSize(NULL, fragEnd - fragStart)
    dna = PyBytes_AsString( dna_py )

    seek_bytes = seq.sequence_offset+packedStart

    # Read it
    file.seek( seek_bytes )

    packed_py = file.read( packByteCount )
    packed = PyBytes_AsString( packed_py )

    # Handle case where everything is in one packed byte 
    if packByteCount == 1:
        pOff = (packedStart<<2)
        pStart = fragStart - pOff
        pEnd = fragEnd - pOff
        partial = packed[0]
        assert pEnd <= 4
        assert pStart >= 0
        for i from pStart <= i < pEnd:
            dna[0] = valToNt[(partial >> (6-i-i)) & 3]
            dna = dna + 1
    else:
        # Handle partial first packed byte.
        midStart = fragStart;
        remainder = ( fragStart&3 )
        if remainder > 0:
            partial = packed[0]; packed = packed + 1
            partCount = 4 - remainder;
            for i from partCount - 1 >= i >= 0:
                dna[i] = valToNt[ partial & 3 ]
                partial = partial >> 2
            midStart = midStart + partCount
            dna = dna + partCount
        # Handle middle bytes.
        remainder = fragEnd&3
        midEnd = fragEnd - remainder
        i = midStart
        while i < midEnd:
            partial = packed[0]
            packed = packed + 1;
            dna[3] = valToNt[partial&3];
            partial = partial >> 2
            dna[2] = valToNt[partial&3];
            partial = partial >> 2
            dna[1] = valToNt[partial&3];
            partial = partial >> 2
            dna[0] = valToNt[partial&3];
            dna = dna + 4;            
            # Increment
            i = i + 4
            ## sys.stderr.write( "!!!< " + dna_py + " >!!!\n" ); sys.stderr.flush()
        # End
        if remainder > 0:
            partial = packed[0];
            partial = partial >> (8-remainder-remainder)
            for i from remainder - 1 >= i >= 0:
                dna[i] = valToNt[partial&3]
                partial = partial >> 2
    # Restore DNA pointer
    dna = PyBytes_AsString( dna_py )
    # N's
    n_block_count = len( seq.n_block_starts )
    if n_block_count > 0:
        start_ix = bisect( seq.n_block_starts, fragStart ) - 1
        if start_ix < 0: start_ix = 0            
        for i from start_ix <= i < n_block_count:
            s = seq.n_block_starts[i];
            e = s + seq.n_block_sizes[i];
            if (s >= fragEnd):
                break
            if (s < fragStart):
               s = fragStart
            if (e > fragEnd):
               e = fragEnd
            if (s < e):
                memset( dna + s - fragStart, c'N', e - s)
    # Mask
    if do_mask:
        m_block_count = len( seq.masked_block_starts )
        if m_block_count > 0:
            start_ix = bisect( seq.masked_block_starts, fragStart ) - 1
            if start_ix < 0: start_ix = 0    
            for i from start_ix <= i < m_block_count:
                s = seq.masked_block_starts[i];
                e = s + seq.masked_block_sizes[i];
                if (s >= fragEnd):
                    break
                if (s < fragStart):
                   s = fragStart
                if (e > fragEnd):
                   e = fragEnd
                if (s < e):
                    for j from s <= j < e:
                        dna[j-fragStart] = tolower( dna[j-fragStart] )
    if PY_MAJOR_VERSION >= 3:
        return dna_py.decode()
    else:
        return dna_py