File: bitset_builders.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (169 lines) | stat: -rw-r--r-- 5,772 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
Support for creating dictionaries of `Bitset`s / `BinnedBitset`s from text
files containing sets of "covered" intervals in sequences (e.g. `BED`_ files).

.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
"""

import re
from warnings import warn

from bx.bitset import (
    BinnedBitSet,
    MAX,
)


def binned_bitsets_from_file(
    f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={}
):
    """
    Read interval lines into a dictionary mapping chromosome name to bitset.

    The default column indices follow the BED column order.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each data line.
    - 'strand_col' is accepted for signature compatibility but is not used
      by this function.
    - 'upstream_pad' / 'downstream_pad' widen every interval: the start is
      moved down (clamped at 0) and the end is moved up (clamped at the size
      of the interval's chromosome).
    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size. 'lens' is only
      read, never modified.

    Lines starting with '#' and empty or whitespace-only lines are skipped.
    A warning is issued when an interval's start is after its end.

    Returns a dict of chromosome name -> `BinnedBitSet`.
    """
    last_chrom = None
    last_bitset = None
    bitsets = {}
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Covers both whitespace-only lines and empty strings (which show up
        # when the caller passes e.g. ``text.split("\n")``).
        if not fields:
            continue
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            # Look the size up on every chromosome switch, not only when the
            # bitset is first created, so that the downstream clamp below
            # uses the current chromosome's size even when the input comes
            # back to a chromosome it has already seen.
            size = lens[chrom] if chrom in lens else MAX
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet(size)
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = int(fields[start_col]), int(fields[end_col])
        if upstream_pad:
            start = max(0, start - upstream_pad)
        if downstream_pad:
            end = min(size, end + downstream_pad)
        if start > end:
            warn("Interval start after end!")
        # set_range takes (start, count), hence the length rather than 'end'.
        last_bitset.set_range(start, end - start)
    return bitsets


def binned_bitsets_from_bed_file(
    f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={}
):
    """
    Read BED-style lines into a dictionary mapping chromosome name to bitset.

    The default column indices follow the BED column order.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each data line.
    - 'strand_col' is accepted for signature compatibility but is not used
      by this function.
    - 'upstream_pad' / 'downstream_pad' widen every interval: the start is
      moved down (clamped at 0) and the end is moved up (clamped at the size
      of the interval's chromosome).
    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size. 'lens' is only
      read, never modified.

    Lines starting with '#', empty or whitespace-only lines, and lines
    starting with 'browser' are skipped. Lines starting with 'track' are
    not treated as intervals, but an ``offset=N`` attribute on such a line
    is added to the start and end of every interval that follows, until
    another track line sets a new offset. Padding is applied after the
    offset. A warning is issued when an interval's start is after its end.

    Returns a dict of chromosome name -> `BinnedBitSet`.
    """
    last_chrom = None
    last_bitset = None
    bitsets = {}
    offset = 0
    for line in f:
        # 'not line.strip()' covers whitespace-only lines as well as empty
        # strings (e.g. from ``text.split("\n")``).
        if line.startswith("#") or not line.strip():
            continue
        # Ignore browser lines completely
        if line.startswith("browser"):
            continue
        # Need to check track lines due to the offset
        if line.startswith("track"):
            m = re.search(r"offset=(\d+)", line)
            if m:
                offset = int(m.group(1))
            continue
        fields = line.split()
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            # Look the size up on every chromosome switch, not only when the
            # bitset is first created, so that the downstream clamp below
            # uses the current chromosome's size even when the input comes
            # back to a chromosome it has already seen.
            size = lens[chrom] if chrom in lens else MAX
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet(size)
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start = int(fields[start_col]) + offset
        end = int(fields[end_col]) + offset
        if upstream_pad:
            start = max(0, start - upstream_pad)
        if downstream_pad:
            end = min(size, end + downstream_pad)
        if start > end:
            warn("Interval start after end!")
        # set_range takes (start, count), hence the length rather than 'end'.
        last_bitset.set_range(start, end - start)
    return bitsets


def binned_bitsets_proximity(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream=0, downstream=0):
    """
    Read a file into a dictionary of bitsets, extending intervals by strand.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each data line.
    - 'strand_col' is optional; a line is treated as '-' only when that
      column exists and is exactly '-', otherwise it is treated as '+'.
    - 'upstream' / 'downstream' extend each interval relative to its strand:
      on '+' upstream lowers the start and downstream raises the end; on '-'
      upstream raises the end and downstream lowers the start. The start is
      clamped at 0 and the end at MAX.

    Lines starting with '#' and empty or whitespace-only lines are skipped.
    Intervals whose length is not positive after extension are silently
    ignored. Every bitset is created with size MAX.

    Returns a dict of chromosome name -> `BinnedBitSet`.
    """
    last_chrom = None
    last_bitset = None
    bitsets = {}
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Blank lines yield no fields; skip them instead of failing on the
        # column lookups below.
        if not fields:
            continue
        if len(fields) > strand_col and fields[strand_col] == "-":
            strand = "-"
        else:
            strand = "+"
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet(MAX)
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = int(fields[start_col]), int(fields[end_col])
        if strand == "+":
            if upstream:
                start = max(0, start - upstream)
            if downstream:
                end = min(MAX, end + downstream)
        else:
            # On the minus strand "upstream" lies at higher coordinates.
            if upstream:
                end = min(MAX, end + upstream)
            if downstream:
                start = max(0, start - downstream)
        if end - start > 0:
            # set_range takes (start, count), hence the length.
            last_bitset.set_range(start, end - start)
    return bitsets


def binned_bitsets_from_list(list=[]):
    """
    Build a dictionary of bitsets from a sequence of interval records.

    Each record is indexed positionally: item 0 is the chromosome name,
    items 1 and 2 are the start and end (converted with ``int``). Every
    bitset is created with size MAX.

    Returns a dict of chromosome name -> `BinnedBitSet`.
    """
    by_chrom = {}
    current_name = None
    current_bits = None
    for record in list:
        name = record[0]
        if name != current_name:
            # Reuse the bitset for this chromosome if one exists, otherwise
            # create and register a new one.
            try:
                current_bits = by_chrom[name]
            except KeyError:
                current_bits = by_chrom[name] = BinnedBitSet(MAX)
            current_name = name
        begin = int(record[1])
        stop = int(record[2])
        # set_range takes (start, count), hence the length.
        current_bits.set_range(begin, stop - begin)
    return by_chrom


def binned_bitsets_by_chrom(f, chrom, chrom_col=0, start_col=1, end_col=2):
    """
    Read the intervals of a single chromosome from a file into one bitset.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom' is the chromosome name to keep; lines whose 'chrom_col' field
      differs are ignored.
    - 'chrom_col' must exist in each data line; 'start_col' and 'end_col'
      must exist in each line that matches 'chrom'.

    Lines starting with '#' and empty or whitespace-only lines are skipped.

    Returns a `BinnedBitSet` of size MAX; it is returned even when no line
    matched.
    """
    bitset = BinnedBitSet(MAX)
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Blank lines yield no fields; skip them instead of failing on the
        # chromosome column lookup.
        if not fields:
            continue
        if fields[chrom_col] == chrom:
            start, end = int(fields[start_col]), int(fields[end_col])
            # set_range takes (start, count), hence the length.
            bitset.set_range(start, end - start)
    return bitset