1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
|
"""
Interface between pybedtools and the R package VennDiagram.
Rather than depend on the user to have rpy2 installed, this simply writes an
R script that can be edited and tweaked by the user before being run in R.
"""
from __future__ import print_function
import os
import string
import pybedtools
import six
from pybedtools import helpers
import subprocess
from collections import OrderedDict
# really just fill in x and filename...leave the rest up to the user.
#
# Note that the closing parentheses is missing -- that's so the user can add
# kwargs from the calling function
template = string.Template("""
library(VennDiagram)
venn.diagram(
x=$x,
filename=$filename,
category.names = $names
""")
def _list_to_R_syntax(x):
"""
Convert items in `x` to a string, and replace tabs with pipes in Interval
string representations. Put everything into an R vector and return as one
big string.
"""
items = []
for i in x:
if isinstance(i, pybedtools.Interval):
i = str(i).replace('\t', '|')
items.append('"%s"' % i)
return 'c(%s)' % ','.join(items)
def _dict_to_R_named_list(d):
"""
Calls _list_to_R_syntax for each item. Returns one big string.
"""
items = []
for key, val in list(d.items()):
items.append('"%s" = %s' % (key, _list_to_R_syntax(val)))
return 'list(%s)' % ', '.join(items)
def truncator(feature):
"""
Convert a feature of any format into a BED3 format.
"""
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop)])
def cleaned_intersect(items):
"""
Perform interval intersections such that the end products have identical \
features for overlapping intervals.
The VennDiagram package does *set* intersection, not *interval*
intersection. So the goal here is to represent intersecting intervals as
intersecting sets of strings.
Doing a simple BEDTools intersectBed call doesn't do the trick (even with
the -u argument). As a concrete example, what would the string be for an
intersection of the feature "chr1:1-100" in file `x` and "chr1:50-200" in
file `y`?
The method used here is to substitute the intervals in `y` that overlap `x`
with the corresponding elements in `x`. This means that in the resulting
sets, the overlapping features are identical. To follow up with the
example, both `x` and `y` would have an item "chr1:50-200" in their sets,
simply indicating *that* one interval overlapped.
Venn diagrams are not well suited for nested overlaps or multi-overlaps.
To illustrate, try drawing the 2-way Venn diagram of the following two
files. Specifically, what number goes in the middle -- the number of
features in `x` that intersect `y` (1) or the number of features in `y`
that intersect `x` (2)?::
x:
chr1 1 100
chr1 500 6000
y:
chr1 50 100
chr1 80 200
chr9 777 888
In this case, this function will return the following sets::
x:
chr1:1-100
chr1:500-6000
y:
chr1:1-100
chr9:777-888
This means that while `x` does not change in length, `y` can. For example,
if there are 2 features in `x` that overlap one feature in `y`, then `y`
will gain those two features in place of its single original feature.
This strategy is extended for multiple intersections -- see the source for
details.
"""
if len(items) == 2:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
# Combine the unique-to-y intervals with the shared-with-x intervals.
# Since x is first in x+y, resulting features are from x.
new_y = (y - x).cat(x + y)
return x, new_y
if len(items) == 3:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
z = items[2].each(truncator).saveas()
# Same as above. Don't care about z yet; this means that y will not
# change because of z.
new_y = (y - x).cat(x + y)
# Combine:
# unique-to-z
# shared-with-any-x
# shared-with-unique-to-y
new_z = (z - y - x).cat(x + z).cat((y - x) + z)
return x, new_y, new_z
if len(items) == 4:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
z = items[2].each(truncator).saveas()
q = items[3].each(truncator).saveas()
# Same as 2-way
new_y = (y - x).cat(x + y)
# Same as 3-way
new_z = (z - y - x).cat(x + z).cat((y - x) + z)
# Combine:
# unique-to-q
# shared-with-any-x
# shared-with-unique-to-y
# shared-with-unique-to-z
new_q = (q - z - y - x)\
.cat(x + q)\
.cat((y - x) + q)\
.cat((z - y - x) + q)
return x, new_y, new_z, new_q
def venn_maker(beds, names=None, figure_filename=None, script_filename=None,
additional_args=None, run=False):
"""
Given a list of interval files, write an R script to create a Venn \
diagram of overlaps (and optionally run it).
The R script calls the venn.diagram function of the R package VennDiagram
for extremely flexible Venn and Euler diagram creation. Uses
`cleaned_intersect()` to create string representations of shared intervals.
`beds` is a list of up to 4 filenames or BedTools.
`names` is a list of names to use for the Venn diagram, in the same order
as `beds`. Default is "abcd"[:len(beds)].
`figure_filename` is the TIFF file to save the figure as.
`script_filename` is the optional filename to write the R script to
`additional_args` is list that will be inserted into the R script,
verbatim. For example, to use scaled Euler diagrams with different colors,
use::
additional_args = ['euler.d=TRUE',
'scaled=TRUE',
'cat.col=c("red","blue")']
If `run` is True, then assume R is installed, is on the path, and has
VennDiagram installed . . . and run the script. The resulting filename
will be saved as `figure_filename`.
"""
if figure_filename is None:
figure_filename = 'NULL'
else:
figure_filename = '"%s"' % figure_filename
if names is None:
names = "abcd"[:len(beds)]
_beds = []
for bed in beds:
if not isinstance(bed, pybedtools.BedTool):
bed = pybedtools.BedTool(bed)
_beds.append(bed)
cleaned = cleaned_intersect(_beds)
results = OrderedDict(list(zip(names, cleaned)))
s = template.substitute(
x=_dict_to_R_named_list(results),
filename=figure_filename,
names=_list_to_R_syntax(names))
if additional_args:
s += ',' + ', '.join(additional_args)
s += ")"
if not script_filename:
fn = pybedtools.BedTool._tmp()
else:
fn = script_filename
fout = open(fn, 'w')
fout.write(s)
fout.close()
out = fn + '.Rout'
if run:
if not pybedtools.settings._R_installed:
helpers._check_for_R()
cmds = [os.path.join(pybedtools.settings._R_path, 'R'), 'CMD', 'BATCH',
fn, out]
p = subprocess.Popen(
cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if stdout or stderr:
print("stdout:", stdout)
print("stderr:", stderr)
if not script_filename:
return s
return None
|