1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
|
import pyBigWig
import numpy as np
import os
import sys
import shutil
import warnings
# deepTools packages
import deeptools.mapReduce as mapReduce
import deeptools.utilities
# debug = 0
old_settings = np.seterr(all='ignore')
def countReadsInRegions_wrapper(args):
# Using arguments unpacking!
return countFragmentsInRegions_worker(*args)
def countFragmentsInRegions_worker(chrom, start, end,
bigWigFiles,
stepSize, binLength,
save_data,
bedRegions=None
):
""" returns the average score in each bigwig file at each 'stepSize'
position within the interval start, end for a 'binLength' window.
Because the idea is to get counts for window positions at
different positions for sampling the bins are equally spaced
and *not adjacent*.
If a list of bedRegions is given, then the number of reads
that overlaps with each region is counted.
Test dataset with two samples covering 200 bp.
>>> test = Tester()
Fragment coverage.
>>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
array([[1., 1., 2., 2.],
[1., 1., 1., 3.]])
>>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
array([[1.5],
[1.5]])
BED regions:
>>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]]
>>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
... bedRegions=bedRegions)[0])
array([[1. , 1.5, 2. ],
[1. , 1. , 2. ]])
"""
assert start < end, "start {} bigger that end {}".format(start, end)
# array to keep the scores for the regions
sub_score_per_bin = []
rows = 0
bigwig_handles = []
for foo in bigWigFiles:
bigwig_handles.append(pyBigWig.open(foo))
regions_to_consider = []
if bedRegions:
for reg in bedRegions:
regs = []
for exon in reg[1]:
regs.append((exon[0], exon[1]))
regions_to_consider.append(regs)
else:
for i in range(start, end, stepSize):
if (i + binLength) > end:
regions_to_consider.append([(i, end)]) # last bin (may be smaller)
else:
regions_to_consider.append([(i, i + binLength)])
if save_data:
_file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
_file_name = _file.name
else:
_file_name = ''
warnings.simplefilter("default")
i = 0
for reg in regions_to_consider:
avgReadsArray = []
i += 1
for idx, bwh in enumerate(bigwig_handles):
if chrom not in bwh.chroms():
unmod_name = chrom
if chrom.startswith('chr'):
# remove the chr part from chromosome name
chrom = chrom[3:]
else:
# prefix with 'chr' the chromosome name
chrom = 'chr' + chrom
if chrom not in bwh.chroms():
exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))
weights = []
scores = []
for exon in reg:
weights.append(exon[1] - exon[0])
score = bwh.stats(chrom, exon[0], exon[1])
if score is None or score == [None] or np.isnan(score[0]):
score = [np.nan]
scores.extend(score)
avgReadsArray.append(np.average(scores, weights=weights)) # mean of fragment coverage for region
sub_score_per_bin.extend(avgReadsArray)
rows += 1
if save_data:
starts = []
ends = []
for exon in reg:
starts.append(str(exon[0]))
ends.append(str(exon[1]))
starts = ",".join(starts)
ends = ",".join(ends)
_file.write("\t".join(map(str, [chrom, starts, ends])) + "\t")
_file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")
if save_data:
_file.close()
warnings.resetwarnings()
# the output is a matrix having as many rows as the variable 'row'
# and as many columns as bigwig files. The rows correspond to
# each of the regions processed by the worker.
# np.array([[score1_1, score1_2],
# [score2_1, score2_2]]
return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
def getChromSizes(bigwigFilesList):
"""
Get chromosome sizes from bigWig file with pyBigWig
Test dataset with two samples covering 200 bp.
>>> test = Tester()
Chromosome name(s) and size(s).
>>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))
"""
def print_chr_names_and_size(chr_set):
sys.stderr.write("chromosome\tlength\n")
for name, size in chr_set:
sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))
bigwigFilesList = bigwigFilesList[:]
common_chr = set()
for fname in bigwigFilesList:
fh = pyBigWig.open(fname)
common_chr = common_chr.union(set(fh.chroms().items()))
fh.close()
non_common_chr = set()
for bw in bigwigFilesList:
_names_and_size = set(pyBigWig.open(bw).chroms().items())
if len(common_chr & _names_and_size) == 0:
# try to add remove 'chr' from the chromosme name
_corr_names_size = set()
for chrom_name, size in _names_and_size:
if chrom_name.startswith('chr'):
_corr_names_size.add((chrom_name[3:], size))
else:
_corr_names_size.add(('chr' + chrom_name, size))
if len(common_chr & _corr_names_size) == 0:
message = "No common chromosomes found. Are the bigwig files " \
"from the same species and same assemblies?\n"
sys.stderr.write(message)
print_chr_names_and_size(common_chr)
sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
"lengths from file\n{}\n".format(bw))
print_chr_names_and_size(_names_and_size)
exit(1)
else:
_names_and_size = _corr_names_size
non_common_chr |= common_chr ^ _names_and_size
common_chr = common_chr & _names_and_size
if len(non_common_chr) > 0:
sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
print_chr_names_and_size(non_common_chr)
# get the list of common chromosome names and sizes
return sorted(common_chr), non_common_chr
def getScorePerBin(bigWigFiles, binLength,
numberOfProcessors=1,
verbose=False, region=None,
bedFile=None,
blackListFileName=None,
stepSize=None,
chrsToSkip=[],
out_file_for_raw_data=None,
allArgs=None):
"""
This function returns a matrix containing scores (median) for the coverage
of fragments within a region. Each row corresponds to a sampled region.
Likewise, each column corresponds to a bigwig file.
Test dataset with two samples covering 200 bp.
>>> test = Tester()
>>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
array([[1., 1., 2., 2.],
[1., 1., 1., 3.]])
"""
# Try to determine an optimal fraction of the genome (chunkSize)
# that is sent to workers for analysis. If too short, too much time
# is spent loading the files
# if too long, some processors end up free.
# the following is a heuristic
# get list of common chromosome names and sizes
chrom_sizes, non_common = getChromSizes(bigWigFiles)
# skip chromosome in the list. This is usually for the
# X chromosome which may have either one copy in a male sample
# or a mixture of male/female and is unreliable.
# Also the skip may contain heterochromatic regions and
# mitochondrial DNA
if chrsToSkip and len(chrsToSkip):
chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]
chrnames, chrlengths = list(zip(*chrom_sizes))
if stepSize is None:
stepSize = binLength # for adjacent bins
# set chunksize based on number of processors used
chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6))
# make chunkSize multiple of binLength
chunkSize -= chunkSize % binLength
if verbose:
print("step size is {}".format(stepSize))
if region:
# in case a region is used, append the tilesize
region += ":{}".format(binLength)
# mapReduce( (staticArgs), func, chromSize, etc. )
if out_file_for_raw_data:
save_file = True
else:
save_file = False
# Handle GTF options
transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)
imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
countReadsInRegions_wrapper,
chrom_sizes,
genomeChunkLength=chunkSize,
bedFile=bedFile,
blackListFileName=blackListFileName,
region=region,
numberOfProcessors=numberOfProcessors,
transcriptID=transcriptID,
exonID=exonID,
keepExons=keepExons,
transcript_id_designator=transcript_id_designator)
if out_file_for_raw_data:
if len(non_common):
sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
"the chromosomes that were not common between the bigwig files\n")
# concatenate intermediary bedgraph files
ofile = open(out_file_for_raw_data, "w")
for _values, tempFileName in imap_res:
if tempFileName:
# concatenate all intermediate tempfiles into one
f = open(tempFileName, 'r')
shutil.copyfileobj(f, ofile)
f.close()
os.remove(tempFileName)
ofile.close()
# the matrix scores are in the first element of each of the entries in imap_res
score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
return score_per_bin
class Tester(object):
def __init__(self):
"""
The the two bigWig files are as follows:
$ cat /tmp/testA.bg
3R 0 100 1
3R 100 200 2
$ cat /tmp/testB.bg
3R 0 150 1
3R 150 200 3
They cover 200 bp:
0 50 100 150 200
|------------------------------------------------------------|
A 111111111111111111111111111111122222222222222222222222222222
B 111111111111111111111111111111111111111111111333333333333333
"""
self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
self.bwFile1 = self.root + "testA.bw"
self.bwFile2 = self.root + "testB.bw"
self.bwFile_PE = self.root + "test_paired2.bw"
self.chrom = '3R'
# global debug
# debug = 0
|