File: ucsc_gene_table_to_intervals.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (101 lines) | stat: -rwxr-xr-x 3,598 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python3

"""
Read a table dump in the UCSC gene table format and print a tab separated
list of intervals corresponding to requested features of each gene.

usage: ucsc_gene_table_to_intervals.py [options] < gene_table.txt

options:
  -h, --help            show this help message and exit
  -rREGION, --region=REGION
                        Limit to region: one of coding, utr3, utr5, transcribed [default]
  -e, --exons           Only print intervals overlapping an exon
"""

import optparse
import string
import sys


def main():
    # Parse command line
    parser = optparse.OptionParser(usage="%prog [options] < gene_table.txt")
    parser.add_option(
        "-r",
        "--region",
        dest="region",
        default="transcribed",
        help="Limit to region: one of coding, utr3, utr5, transcribed [default]",
    )
    parser.add_option(
        "-e", "--exons", action="store_true", dest="exons", help="Only print intervals overlapping an exon"
    )
    parser.add_option("-s", "--strand", action="store_true", dest="strand", help="Print strand after interval")
    parser.add_option(
        "-b",
        "--nobin",
        action="store_false",
        dest="discard_first_column",
        default=True,
        help="file doesn't contain a 'bin' column (use this for pre-hg18 files)",
    )
    options, args = parser.parse_args()
    assert options.region in ("coding", "utr3", "utr5", "transcribed"), "Invalid region argument"

    # Read table from stdin and handle each gene
    for line in sys.stdin:
        # Parse fields from gene tabls
        fields = line.split("\t")
        if options.discard_first_column:
            fields.pop(0)
        chrom = fields[1]
        strand = fields[2]
        tx_start = int(fields[3])
        tx_end = int(fields[4])
        cds_start = int(fields[5])
        cds_end = int(fields[6])

        # Determine the subset of the transcribed region we are interested in
        if options.region == "utr3":
            if strand == "-":
                region_start, region_end = tx_start, cds_start
            else:
                region_start, region_end = cds_end, tx_end
        elif options.region == "utr5":
            if strand == "-":
                region_start, region_end = cds_end, tx_end
            else:
                region_start, region_end = tx_start, cds_start
        elif options.region == "coding":
            region_start, region_end = cds_start, cds_end
        else:
            region_start, region_end = tx_start, tx_end

        # If only interested in exons, print the portion of each exon overlapping
        # the region of interest, otherwise print the span of the region
        if options.exons:
            exon_starts = [int(_) for _ in fields[8].rstrip(",\n").split(",")]
            exon_ends = [int(_) for _ in fields[9].rstrip(",\n").split(",")]
            for start, end in zip(exon_starts, exon_ends):
                start = max(start, region_start)
                end = min(end, region_end)
                if start < end:
                    if strand:
                        print_tab_sep(chrom, start, end, strand)
                    else:
                        print_tab_sep(chrom, start, end)
        else:
            if strand:
                print_tab_sep(chrom, region_start, region_end, strand)
            else:
                print_tab_sep(chrom, region_start, region_end)


def print_tab_sep(*args):
    """Print items in `l` to stdout separated by tabs"""
    print(string.join((str(f) for f in args), "\t"))


if __name__ == "__main__":
    main()