File: bins.py

package info (click to toggle)
python-gffutils 0.9-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 9,812 kB
  • sloc: python: 4,295; makefile: 5; sh: 5
file content (219 lines) | stat: -rw-r--r-- 7,520 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Implementation of the UCSC genome binning strategy -- heavily commented and
# with tests to help understand what's going on.
#
# Ryan Dale 2013
#
# With help from implementations in kent src and Brent Pedersen's cruzdb,
# specifically:
#
# http://genome-source.cse.ucsc.edu/gitweb/?p=kent.git\
#                                    ;a=blob;f=src/lib/binRange.c
# and
#
# https://github.com/brentp/cruzdb/blob/master/cruzdb/__init__.py
#
# For reference, Fig 7 in http://genome.cshlp.org/content/12/6/996.abstract
# looks like this:
# ----------------------------------------------------------------------------
# |                                  1                                       |
# ----------------------------------------------------------------------------
# |       2      |         3         |         4         |         5         |
# ----------------------------------------------------------------------------
# |6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
# ----------------------------------------------------------------------------
#
#      AAAAAAAAAAAAAAAAAAAAAAA          BBBBBBBBBBBBBBB             CC
#
# The smallest bin feature "A" fits within is 1, but it also overlaps 2-3, 7-9.
# The smallest bin feature "B" fits within is 4, but it also overlaps 1, 14-17.
# The smallest bin feature "C" fits within is 20, but it also overlaps 1, 5.


# kent src: "How much to shift to get to next larger bin."
# So each level splits by 2**3 = 8 fold (octree?)
NEXT_SHIFT = 3

# kent src: "How much to shift to get to finest bin."
# 2 ** FIRST_SHIFT is the size of the smallest bin.
FIRST_SHIFT = 17

# These offsets are the bin numbers at the beginning of each level.  In Fig 7,
# these would be 6, 2, and 1.
#
# Bins with higher numbers (e.g. 585 to 4681) are the smaller-sized bins.  Run
# print_bin_sizes() for more info on this.
OFFSETS = [
    4096 + 512 + 64 + 8 + 1,  # bins 4681-585
    512 + 64 + 8 + 1,         # bins 585-73
    64 + 8 + 1,               # bins 73-9
    8 + 1,                    # bins 9-1
    1                         # bin  0
]

# for BED (0-based, half-open) or GFF (1-based, closed intervals)
COORD_OFFSETS = {'bed': 0, 'gff': 1}


def bins(start, stop, fmt='gff', one=True):
    """
    Uses the definition of a "genomic bin" described in Fig 7 of
    http://genome.cshlp.org/content/12/6/996.abstract.

    Parameters
    ----------
    one : boolean
        If `one=True` (default), then only return the smallest bin that
        completely contains these coordinates (useful for assigning a single
        bin).

        If `one=False`, then return the set of *all* bins that overlap these
        coordinates (useful for looking for features that could intersect)

    fmt : 'gff' | 'bed'
        This specifies 1-based start coords (gff) or 0-based start coords (bed)
    """
    # Jump to highest resolution bin that will fit these coords (depending on
    # whether we have a BED or GFF-style coordinate).
    #
    # Some GFF files include negative coords, which will throw off this
    # calculation.  If negative coords, then set the bin to the largest
    # possible.
    if start < 0:
        if one:
            return 1
        else:
            return set([1])

    if stop < 0:
        if one:
            return 1
        else:
            return set([1])

    start = (start - COORD_OFFSETS[fmt]) >> FIRST_SHIFT
    stop = (stop) >> FIRST_SHIFT

    # We always at least fit within the chrom, which is bin 1.
    bins = set([1])

    for offset in OFFSETS:
        # Since we're going from smallest to largest bins, the first one where
        # the feature's start and stop positions are both within the same bin
        # is the smallest one these coords fit within.
        if one:
            if start == stop:
                # Note that at this point, because of the bit-shifting, `start`
                # is the number of bins (at this current level).  So we need to
                # add it to `offset` to get the actual bin ID.
                return offset + start

        # See the Fig 7 reproduction above to see why range().
        bins.update(list(range(offset + start, offset + stop + 1)))

        # Move to the next level (8x larger bin size; i.e., 2**NEXT_SHIFT
        # larger bin size)
        start >>= NEXT_SHIFT
        stop >>= NEXT_SHIFT
    return bins


def print_bin_sizes():
    """
    Useful for debugging: how large is each bin, and what are the bin IDs?
    """
    for i, offset in enumerate(OFFSETS):
        binstart = offset
        try:
            binstop = OFFSETS[i + 1]
        except IndexError:
            binstop = binstart

        bin_size = 2 ** (FIRST_SHIFT + (i * NEXT_SHIFT))
        actual_size = bin_size

        # nice formatting
        bin_size, suffix = bin_size / 1024, 'Kb'
        if bin_size >= 1024:
            bin_size, suffix = bin_size / 1024, 'Mb'
        if bin_size >= 1024:
            bin_size, suffix = bin_size / 1024, 'Gb'
        size = '(%s %s)' % (bin_size, suffix)
        actual_size = '%s bp' % (actual_size)

        print('level: {i:1};  bins {binstart:<4} to {binstop:<4}; '
              'size: {actual_size:<12} {size:<6}'.format(**locals()))


def test():
    # These should obviously fit inside the first bin
    assert bins(0, 1, fmt='bed') == 4681
    assert bins(1, 1, fmt='gff') == 4681

    # All bins that overlap:
    assert bins(0, 1, fmt='bed', one=False) == set([1, 73, 9, 585, 4681])

    # Or, more generally,
    assert bins(0, 1, fmt='bed', one=False) == set(OFFSETS)

    # Since the smallest bin size is 2**17, this rather large feature should
    # fit completely inside the first bin, too
    assert bins(2 ** 17 - 1, 2 ** 17 - 1, fmt='bed') == 4681

    # or, more generally,
    assert bins(
        2 ** FIRST_SHIFT - 1,
        2 ** FIRST_SHIFT - 1,
        fmt='bed') == OFFSETS[0]

    # At exactly 2**17 in BED coords it moves over to the next bin
    assert bins(2 ** 17, 2 ** 17, fmt='bed') == 4682

    # or
    assert bins(
        2 ** FIRST_SHIFT,
        2 ** FIRST_SHIFT,
        fmt='bed') == OFFSETS[0] + 1

    # The other bins it overlaps should be similar to before, with the
    # exception of 4682
    assert bins(2 ** 17, 2 ** 17, fmt='bed', one=False) \
        == set([1, 9, 73, 585, 4682])

    assert bins(2 ** 17, 2 ** 17, fmt='bed', one=False)\
        .difference(bins(2 ** 17 - 1, 2 ** 17 - 1, fmt='bed', one=False)) \
        == set([4682])

    # Spanning the first and second smallest bins should cause it to bump up
    # a level
    assert bins(2 ** 17 - 1, 2 ** 17, fmt='bed') == 585

    # or
    assert bins(
        2 ** FIRST_SHIFT - 1,
        2 ** FIRST_SHIFT,
        fmt='bed') == OFFSETS[1]

    # Make it as big as it can get in this bin:
    assert bins(
        2 ** FIRST_SHIFT - 1,
        2 ** (FIRST_SHIFT + NEXT_SHIFT) - 1,
        fmt='bed') == OFFSETS[1]

    # Then make it big enough to jump another level
    assert bins(
        2 ** FIRST_SHIFT - 1,
        2 ** (FIRST_SHIFT + NEXT_SHIFT),
        fmt='bed') == OFFSETS[2]

    # When start or stop or both are negative, bin should be the largest
    # possible.
    assert bins(-1, 1000, one=True) == 1
    assert bins(-1, 1000, one=False) == set([1])
    assert bins(-1, -1000, one=True) == 1
    assert bins(-1, -1000, one=False) == set([1])
    assert bins(1, -1000, one=True) == 1
    assert bins(1, -1000, one=False) == set([1])

if __name__ == "__main__":
    print_bin_sizes()
    test()