1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
|
import os
import sys
import sqlite3
import pybedtools
import time
import collections
def now():
return time.time()
def get_name(fname):
return os.path.splitext(os.path.basename(fname))[0]
class IntersectionMatrix(object):
"""
Class to handle many pairwise comparisons of interval files
"""
def __init__(self, beds, genome, iterations, dbfn=None, force=False):
"""
Class to handle and keep track of many pairwise comparisons of interval
files.
A lightweight database approach is used to minimize computational time.
The database stores filenames and calculation timestamps;
re-calculating a matrix using the same interval files will only
re-calculate values for those files whose modification times are newer
than the timestamp in the database.
`beds` is a list of bed files.
`genome` is the string assembly name, e.g., "hg19" or "dm3".
`dbfn` is the filename of the database you'd like to use to track
what's been completed.
Example usage:
First, get a list of bed files to use:
#>>> beds = [
#... pybedtools.example_filename(i) for i in [
#... 'Cp190_Kc_Bushey_2009.bed',
#... 'CTCF_Kc_Bushey_2009.bed',
#... 'SuHw_Kc_Bushey_2009.bed',
#... 'BEAF_Kc_Bushey_2009.bed'
#... ]]
Set some parameters. "dm3" is the genome to use; info will be stored
in "ex.db". `force=True` means to overwrite what's in the database
#>>> # In practice, you'll want many more iterations...
#>>> im = IntersectionMatrix(beds, 'dm3',
#... dbfn='ex.db', iterations=3, force=True)
#>>> # Use 4 CPUs for randomization
#>>> matrix = im.create_matrix(verbose=True, processes=4)
"""
self.beds = beds
self.genome = genome
self.dbfn = dbfn
self.iterations = iterations
if self.dbfn:
self._init_db(force)
self.conn = sqlite3.connect(dbfn)
self.conn.row_factory = sqlite3.Row
self.c = self.conn.cursor()
def _init_db(self, force=False):
"""
Prepare the database if it doesn't already exist
"""
if self.dbfn is None:
return
if os.path.exists(self.dbfn) and not force:
return
conn = sqlite3.connect(self.dbfn)
c = conn.cursor()
if force:
c.execute("DROP TABLE IF EXISTS intersections;")
c.executescript(
"""
CREATE TABLE intersections (
filea TEXT,
fileb TEXT,
timestamp FLOAT,
actual FLOAT,
median FLOAT,
iterations INT,
self INT,
other INT,
fractionabove FLOAT,
fractionbelow FLOAT,
percentile FLOAT,
PRIMARY KEY (filea, fileb, iterations));
"""
)
conn.commit()
def get_row(self, fa, fb, iterations):
"""
Return the sqlite3.Row from the database corresponding to files `fa`
and `fb`; returns None if not found.
"""
if self.dbfn is None:
return
results = list(
self.c.execute(
"""
SELECT * FROM intersections
WHERE
filea=:fa AND fileb=:fb AND iterations=:iterations
""",
locals(),
)
)
if len(results) == 0:
return
assert len(results) == 1
return results[0]
def done(self, fa, fb, iterations):
"""
Retrieves row from db and only returns True if there's something in
there and the timestamp is newer than the input files.
"""
row = self.get_row(fa, fb, iterations)
if row:
tfa = os.path.getmtime(fa)
tfb = os.path.getmtime(fb)
if (row["timestamp"] > tfa) and (row["timestamp"] > tfb):
return True
return False
def run_and_insert(self, fa, fb, **kwargs):
a = pybedtools.BedTool(fa).set_chromsizes(self.genome)
kwargs["iterations"] = self.iterations
results = a.randomstats(fb, **kwargs)
self.add_row(results)
def add_row(self, results):
"""
Inserts data into db. `results` is a dictionary as returned by
BedTool.randomstats with keys like::
'iterations'
'actual'
'file_a'
'file_b'
self.fn
other.fn
'self'
'other'
'frac randomized above actual'
'frac randomized below actual'
'median randomized'
'normalized'
'percentile'
'lower_%sth' % lower_thresh
'upper_%sth' % upper_thresh
"""
# translate results keys into db-friendly versions
translations = [
("file_a", "filea"),
("file_b", "fileb"),
("median randomized", "median"),
("frac randomized above actual", "fractionabove"),
("frac randomized below actual", "fractionbelow"),
]
for orig, new in translations:
results[new] = results[orig]
results["timestamp"] = now()
sql = """
INSERT OR REPLACE INTO intersections (
filea,
fileb,
timestamp,
actual,
median,
iterations,
self,
other,
fractionabove,
fractionbelow,
percentile)
VALUES (
:filea,
:fileb,
:timestamp,
:actual,
:median,
:iterations,
:self,
:other,
:fractionabove,
:fractionbelow,
:percentile)
"""
self.c.execute(sql, results)
self.conn.commit()
def create_matrix(self, verbose=False, **kwargs):
"""
Matrix (implemented as a dictionary), where the final values are
sqlite3.ROW objects from the database::
{
filea: {
filea: ROW,
fileb: ROW,
...},
fileb: {
filea: ROW,
fileb: ROW,
...},
}
}
"""
nfiles = len(self.beds)
total = nfiles ** 2
i = 0
matrix = collections.defaultdict(dict)
for fa in self.beds:
for fb in self.beds:
i += 1
if verbose:
sys.stderr.write("%(i)s of %(total)s: %(fa)s + %(fb)s\n" % locals())
sys.stderr.flush()
if not self.done(fa, fb, self.iterations):
self.run_and_insert(fa, fb, **kwargs)
matrix[get_name(fa)][get_name(fb)] = self.get_row(
fa, fb, self.iterations
)
return matrix
def print_matrix(self, matrix, key):
"""
Prints a pairwise matrix of values. `matrix` is a dict-of-dicts from
create_matrix(), and `key` is a field name from the database -- one of:
['filea', 'fileb', 'timestamp', 'actual', 'median', 'iterations',
'self', 'other', 'fractionabove', 'fractionbelow', 'percentile']
"""
|