1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
|
#!/usr/bin/env python
"""
Example from pybedtools documentation: find reads in introns and exons using
multiple CPUs.
Prints a tab-separated file containing class (exon, intron, both) and number of
reads in each class.
"""
import pybedtools
import argparse
import os
import sys
import multiprocessing
if __name__ == "__main__":
ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), usage=__doc__)
ap.add_argument(
"--gff", required=True, help="GFF or GTF file containing annotations"
)
ap.add_argument(
"--bam", required=True, help="BAM file containing reads to be counted"
)
ap.add_argument(
"--stranded",
action="store_true",
help="Use strand-specific merging and overlap. " "Default is to ignore strand",
)
ap.add_argument(
"--processes",
default=1,
type=int,
help="Number of processes to use in parallel.",
)
ap.add_argument(
"-v", "--verbose", action="store_true", help="Verbose (goes to stderr)"
)
args = ap.parse_args()
gff = args.gff
bam = args.bam
stranded = args.stranded
if args.processes > 3:
print(
"Only need 3 processes (one each for exon, intron, both), so "
"resetting processes from {0} to 3".format(args.processes)
)
args.processes = 3
def featuretype_filter(feature, featuretype):
"""
Only passes features with the specified *featuretype*
"""
if feature[2] == featuretype:
return True
return False
def subset_featuretypes(featuretype):
"""
Returns the filename containing only `featuretype` features.
"""
return g.filter(featuretype_filter, featuretype).saveas().fn
def count_reads_in_features(features):
"""
Callback function to count reads in features
"""
return (
pybedtools.BedTool(bam).intersect(
features, s=stranded, bed=True, stream=True
)
).count()
# Some GFF files have invalid entries -- like chromosomes with negative coords
# or features of length = 0. This line removes them (the `remove_invalid`
# method) and saves the result in a tempfile
g = pybedtools.BedTool(gff).remove_invalid().saveas()
# Set up pool of workers
with multiprocessing.Pool(processes=args.processes) as pool:
# Get separate files for introns and exons in parallel
featuretypes = ["intron", "exon"]
introns, exons = pool.map(subset_featuretypes, featuretypes)
# Since `subset_featuretypes` returns filenames, we convert to BedTool objects
# to do intersections below.
introns = pybedtools.BedTool(introns)
exons = pybedtools.BedTool(exons)
# Identify unique and shared regions using bedtools commands subtract, merge,
# and intersect.
exon_only = exons.subtract(introns).merge()
intron_only = introns.subtract(exons).merge()
intron_and_exon = exons.intersect(introns).merge()
# Do intersections with BAM file in parallel. Note that we're passing filenames
# to multiprocessing.Pool rather than BedTool objects.
features = (exon_only.fn, intron_only.fn, intron_and_exon.fn)
# Run count_reads_in_features in parallel over features
results = pool.map(count_reads_in_features, features)
labels = ("exon_only", "intron_only", "intron_and_exon")
for label, reads in zip(labels, results):
print("{0}\t{1}".format(label, reads))
|