File: subtract.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (79 lines) | stat: -rw-r--r-- 3,346 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python3
"""
Subtract one set of genomic intervals from another (base-by-base or whole
intervals). The returned GenomicIntervals will be in the order
of the first set of intervals passed in, with the corresponding
meta-data.
"""

from warnings import warn

from bx.intervals.io import (
    BitsetSafeReaderWrapper,
    GenomicInterval,
)
from bx.intervals.operations import bits_clear_in_range
from bx.tabular.io import (
    Comment,
    Header,
)


def subtract(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True):
    # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
    # Read all but first into bitsets and union to one (if confused, read DeMorgan's...)
    primary = readers[0]
    union = readers[1:]
    # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
    # the bitsets are being created by skipping the problem lines
    union[0] = BitsetSafeReaderWrapper(union[0], lens=lens)
    bitsets = union[0].binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
    union = union[1:]
    for andset in union:
        bitset2 = andset.binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
        for chrom in bitset2:
            if chrom not in bitsets:
                bitsets[chrom] = bitset2[chrom]
            else:
                bitsets[chrom].ior(bitset2[chrom])

    # Read remaining intervals and subtract
    for interval in primary:
        if isinstance(interval, Header):
            yield interval
        if isinstance(interval, Comment) and comments:
            yield interval
        elif isinstance(interval, GenomicInterval):
            chrom = interval.chrom
            if chrom not in bitsets:
                yield interval
            else:
                start = int(interval.start)
                end = int(interval.end)
                if start > end:
                    warn("Interval start after end!")
                out_intervals = []
                # Find the intervals that meet the criteria (for the three sensible
                # permutations of reverse and pieces)
                try:
                    if bitsets[chrom].count_range(start, end - start) >= mincols:
                        if pieces:
                            out_intervals = bits_clear_in_range(bitsets[chrom], start, end)
                    else:
                        out_intervals = [(start, end)]
                    # Write the intervals
                    for start, end in out_intervals:
                        new_interval = interval.copy()
                        new_interval.start = start
                        new_interval.end = end
                        yield new_interval
                except IndexError as e:
                    try:
                        # This will work only if primary is a NiceReaderWrapper
                        primary.skipped += 1
                        # no reason to stuff an entire bad file into memmory
                        if primary.skipped < 10:
                            primary.skipped_lines.append((primary.linenum, primary.current_line, str(e)))
                    except Exception:
                        pass
                    continue