File: intron_exon_reads.py

package info (click to toggle)
python-pybedtools 0.10.0-4
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 16,620 kB
  • sloc: python: 10,030; cpp: 899; makefile: 142; sh: 57
file content (109 lines) | stat: -rw-r--r-- 3,605 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python

"""
Example from pybedtools documentation: find reads in introns and exons using
multiple CPUs.

Prints a tab-separated file containing class (exon, intron, both) and number of
reads in each class.
"""
import pybedtools
import argparse
import os
import sys
import multiprocessing

if __name__ == "__main__":

    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), usage=__doc__)
    ap.add_argument(
        "--gff", required=True, help="GFF or GTF file containing annotations"
    )
    ap.add_argument(
        "--bam", required=True, help="BAM file containing reads to be counted"
    )
    ap.add_argument(
        "--stranded",
        action="store_true",
        help="Use strand-specific merging and overlap. " "Default is to ignore strand",
    )
    ap.add_argument(
        "--processes",
        default=1,
        type=int,
        help="Number of processes to use in parallel.",
    )
    ap.add_argument(
        "-v", "--verbose", action="store_true", help="Verbose (goes to stderr)"
    )
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded

    if args.processes > 3:
        print(
            "Only need 3 processes (one each for exon, intron, both), so "
            "resetting processes from {0} to 3".format(args.processes)
        )
        args.processes = 3

    def featuretype_filter(feature, featuretype):
        """
        Only passes features with the specified *featuretype*
        """
        if feature[2] == featuretype:
            return True
        return False

    def subset_featuretypes(featuretype):
        """
        Returns the filename containing only `featuretype` features.
        """
        return g.filter(featuretype_filter, featuretype).saveas().fn

    def count_reads_in_features(features):
        """
        Callback function to count reads in features
        """
        return (
            pybedtools.BedTool(bam).intersect(
                features, s=stranded, bed=True, stream=True
            )
        ).count()

    # Some GFF files have invalid entries -- like chromosomes with negative coords
    # or features of length = 0.  This line removes them (the `remove_invalid`
    # method) and saves the result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Set up pool of workers
    with multiprocessing.Pool(processes=args.processes) as pool:

        # Get separate files for introns and exons in parallel
        featuretypes = ["intron", "exon"]
        introns, exons = pool.map(subset_featuretypes, featuretypes)

        # Since `subset_featuretypes` returns filenames, we convert to BedTool objects
        # to do intersections below.
        introns = pybedtools.BedTool(introns)
        exons = pybedtools.BedTool(exons)

        # Identify unique and shared regions using bedtools commands subtract, merge,
        # and intersect.
        exon_only = exons.subtract(introns).merge()
        intron_only = introns.subtract(exons).merge()
        intron_and_exon = exons.intersect(introns).merge()

        # Do intersections with BAM file in parallel. Note that we're passing filenames
        # to multiprocessing.Pool rather than BedTool objects.
        features = (exon_only.fn, intron_only.fn, intron_and_exon.fn)

        # Run count_reads_in_features in parallel over features
        results = pool.map(count_reads_in_features, features)

        labels = ("exon_only", "intron_only", "intron_and_exon")

    for label, reads in zip(labels, results):
        print("{0}\t{1}".format(label, reads))