File: maf_extract_ranges.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (77 lines) | stat: -rwxr-xr-x 2,211 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/python3

"""
Reads a list of intervals (start, stop) and a maf. Produces a new maf
containing the blocks from the original that overlapped the intervals.

NOTE: See maf_extract_ranges_indexed.py which works better / faster for many
      use cases.

NOTE: chromosome/src information in the MAF is ignored by this variant.

NOTE: if a single alignment in a block become empty during slicing, the block
      is ignored.

usage: %prog interval_file refindex [options] < maf_file
   -m, --mincols=10: Minimum length (columns) required for alignment to be output
"""

import sys

import bx.align.maf
from bx import intervals
from bx.cookbook import doc_optparse


def __main__():
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:
        doc_optparse.exit()

    # Load Intervals

    intersecter = intervals.Intersecter()
    for line in open(range_filename):
        fields = line.split()
        intersecter.add_interval(intervals.Interval(int(fields[0]), int(fields[1])))

    # Start MAF on stdout

    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF

    for maf in bx.align.maf.Reader(sys.stdin, parse_e_rows=True):
        ref = maf.components[refindex]
        # Find overlap with reference component
        intersections = sorted(intersecter.find(ref.get_forward_strand_start(), ref.get_forward_strand_end()))
        # Keep output maf ordered
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref.get_forward_strand_start())
            end = min(interval.end, ref.get_forward_strand_end())
            sliced = maf.slice_by_component(refindex, start, end)
            good = True
            for c in sliced.components:
                if c.size < 1 and not c.empty:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output MAF

    out.close()


if __name__ == "__main__":
    __main__()