1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
|
"""
Module for integration with pybedtools
"""
import os
import pybedtools
from pybedtools import featurefuncs
from gffutils import helpers
def to_bedtool(iterator):
"""
Convert any iterator into a pybedtools.BedTool object.
Note that the supplied iterator is not consumed by this function. To save
to a temp file or to a known location, use the `.saveas()` method of the
returned BedTool object.
"""
def gen():
for i in iterator:
yield helpers.asinterval(i)
return pybedtools.BedTool(gen())
def tsses(
db,
merge_overlapping=False,
attrs=None,
attrs_sep=":",
merge_kwargs=None,
as_bed6=False,
bedtools_227_or_later=True,
):
"""
Create 1-bp transcription start sites for all transcripts in the database
and return as a sorted pybedtools.BedTool object pointing to a temporary
file.
To save the file to a known location, use the `.moveto()` method on the
resulting `pybedtools.BedTool` object.
To extend regions upstream/downstream, see the `.slop()` method on the
resulting `pybedtools.BedTool object`.
Requires pybedtools.
Parameters
----------
db : gffutils.FeatureDB
The database to use
as_bed6 : bool
If True, output file is in BED6 format; otherwise it remains in the
GFF/GTF format and dialect of the file used to create the database.
Note that the merge options below necessarily force `as_bed6=True`.
merge_overlapping : bool
If True, output will be in BED format. Overlapping TSSes will be merged
into a single feature, and their names will be collapsed using
`merge_sep` and placed in the new name field.
merge_kwargs : dict
If `merge_overlapping=True`, these keyword arguments are passed to
pybedtools.BedTool.merge(), which are in turn sent to `bedtools merge`.
The merge operates on a BED6 file which will have had the name field
constructed as specified by other arguments here. See the available
options for your installed version of BEDTools; the defaults used here
are `merge_kwargs=dict(o='distinct', c=4, s=True)`.
Any provided `merge_kwargs` are used to *update* the default. It is
recommended to not override `c=4` and `s=True`, otherwise the
post-merge fixing may not work correctly. Good candidates for tweaking
are `d` (merge distance), `o` (operation), `delim` (delimiter to use
for collapse operations).
attrs : str or list
Only has an effect when `as_bed6=True` or `merge_overlapping=True`.
Determines what goes in the name field of an output BED file. By
default, "gene_id" for GTF databases and "ID" for GFF. If a list of
attributes is supplied, e.g. ["gene_id", "transcript_id"], then these
will be joined by `attr_join_sep` and then placed in the name field.
attrs_sep: str
If `as_bed6=True` or `merge_overlapping=True`, then use this character
to separate attributes in the name field of the output BED. If also
using `merge_overlapping=True`, you'll probably want this to be
different than `merge_sep` in order to parse things out later.
bedtools_227_or_later : bool
In version 2.27, BEDTools changed the output for merge. By default,
this function expects BEDTools version 2.27 or later, but set this to
False to assume the older behavior.
For testing purposes, the environment variable
GFFUTILS_USES_BEDTOOLS_227_OR_LATER is set to either "true" or "false"
and is used to override this argument.
Examples
--------
>>> import gffutils
>>> db = gffutils.create_db(
... gffutils.example_filename('FBgn0031208.gtf'),
... ":memory:",
... keep_order=True,
... verbose=False)
Default settings -- no merging, and report a separate TSS on each line even
if they overlap (as in the first two):
>>> print(tsses(db)) # doctest: +NORMALIZE_WHITESPACE
chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300689";
chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300690";
chr2L gffutils_derived transcript_TSS 11000 11000 . - . gene_id "Fk_gene_1"; transcript_id "transcript_Fk_gene_1";
chr2L gffutils_derived transcript_TSS 12500 12500 . - . gene_id "Fk_gene_2"; transcript_id "transcript_Fk_gene_2";
<BLANKLINE>
Default merging, showing the first two TSSes merged and reported as
a single unique TSS for the gene. Note the conversion to BED:
>>> x = tsses(db, merge_overlapping=True)
>>> print(x) # doctest: +NORMALIZE_WHITESPACE
chr2L 7528 7529 FBgn0031208 . +
chr2L 10999 11000 Fk_gene_1 . -
chr2L 12499 12500 Fk_gene_2 . -
<BLANKLINE>
Report both gene ID and transcript ID in the name. In some cases this can
be easier to parse than the original GTF or GFF file. With no merging
specified, we must add `as_bed6=True` to see the names in BED format.
>>> x = tsses(db, attrs=['gene_id', 'transcript_id'], as_bed6=True)
>>> print(x) # doctest: +NORMALIZE_WHITESPACE
chr2L 7528 7529 FBgn0031208:FBtr0300689 . +
chr2L 7528 7529 FBgn0031208:FBtr0300690 . +
chr2L 10999 11000 Fk_gene_1:transcript_Fk_gene_1 . -
chr2L 12499 12500 Fk_gene_2:transcript_Fk_gene_2 . -
<BLANKLINE>
Use a 3kb merge distance so the last 2 features are merged together:
>>> x = tsses(db, merge_overlapping=True, merge_kwargs=dict(d=3000))
>>> print(x) # doctest: +NORMALIZE_WHITESPACE
chr2L 7528 7529 FBgn0031208 . +
chr2L 10999 12500 Fk_gene_1,Fk_gene_2 . -
<BLANKLINE>
The set of unique TSSes for each gene, +1kb upstream and 500bp downstream:
>>> x = tsses(db, merge_overlapping=True)
>>> x = x.slop(l=1000, r=500, s=True, genome='dm3')
>>> print(x) # doctest: +NORMALIZE_WHITESPACE
chr2L 6528 8029 FBgn0031208 . +
chr2L 10499 12000 Fk_gene_1 . -
chr2L 11999 13500 Fk_gene_2 . -
<BLANKLINE>
"""
_override = os.environ.get("GFFUTILS_USES_BEDTOOLS_227_OR_LATER", None)
if _override is not None:
if _override == "true":
bedtools_227_or_later = True
elif _override == "false":
bedtools_227_or_later = False
else:
raise ValueError(
"Unknown value for GFFUTILS_USES_BEDTOOLS_227_OR_LATER "
"environment variable: {0}".format(_override)
)
if bedtools_227_or_later:
_merge_kwargs = dict(o="distinct", s=True, c="4,5,6")
else:
_merge_kwargs = dict(o="distinct", s=True, c="4")
if merge_kwargs is not None:
_merge_kwargs.update(merge_kwargs)
def gen():
"""
Generator of pybedtools.Intervals representing TSSes.
"""
for gene in db.features_of_type("gene"):
for transcript in db.children(gene, level=1):
if transcript.strand == "-":
transcript.start = transcript.stop
else:
transcript.stop = transcript.start
transcript.featuretype = transcript.featuretype + "_TSS"
yield helpers.asinterval(transcript)
# GFF/GTF format
x = pybedtools.BedTool(gen()).sort()
# Figure out default attrs to use, depending on the original format.
if attrs is None:
if db.dialect["fmt"] == "gtf":
attrs = "gene_id"
else:
attrs = "ID"
if merge_overlapping or as_bed6:
if isinstance(attrs, str):
attrs = [attrs]
def to_bed(f):
"""
Given a pybedtools.Interval, return a new Interval with the name
set according to the kwargs provided above.
"""
name = attrs_sep.join([f.attrs[i] for i in attrs])
return pybedtools.Interval(
f.chrom, f.start, f.stop, name, str(f.score), f.strand
)
x = x.each(to_bed).saveas()
if merge_overlapping:
if bedtools_227_or_later:
x = x.merge(**_merge_kwargs)
else:
def fix_merge(f):
f = featurefuncs.extend_fields(f, 6)
return pybedtools.Interval(f.chrom, f.start, f.stop, f[4], ".", f[3])
x = x.merge(**_merge_kwargs).saveas().each(fix_merge).saveas()
return x
|