1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
|
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import shutil
import tempfile
import numpy as np
import sys
# NGS packages
import pyBigWig
# own module
from deeptools import mapReduce
from deeptools.utilities import getCommonChrNames, toBytes
from deeptools.writeBedGraph import *
from deeptools import bamHandler
old_settings = np.seterr(all='ignore')
def getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize,
missingDataAsZero=False):
try:
coverage = np.asarray(bigwigHandle.values(chrom, start, end))
except TypeError:
# this error happens when chromosome
# is not into the bigwig file
return []
if coverage is None:
return []
if missingDataAsZero is True:
coverage[np.isnan(coverage)] = 0
# average the values per bin
cov = np.array(
[np.mean(coverage[x:x + tileSize])
for x in range(0, len(coverage), tileSize)])
return cov
def writeBedGraph_wrapper(args):
return writeBedGraph_worker(*args)
def writeBedGraph_worker(
chrom, start, end, tileSize, defaultFragmentLength,
bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
skipZeroOverZero=False, missingDataAsZero=False, fixed_step=False):
r"""
Writes a bedgraph having as base a number of bam files.
The given func is called to compute the desired bedgraph value
using the funcArgs
tileSize
"""
if start > end:
raise NameError("start position ({0}) bigger than "
"end position ({1})".format(start, end))
coverage = []
for indexFile, fileFormat in bamOrBwFileList:
if fileFormat == 'bam':
bamHandle = bamHandler.openBam(indexFile)
coverage.append(getCoverageFromBam(
bamHandle, chrom, start, end, tileSize,
defaultFragmentLength, extendPairedEnds,
True))
bamHandle.close()
elif fileFormat == 'bigwig':
bigwigHandle = pyBigWig.open(indexFile)
coverage.append(
getCoverageFromBigwig(
bigwigHandle, chrom, start, end,
tileSize, missingDataAsZero))
bigwigHandle.close()
_file = tempfile.NamedTemporaryFile(delete=False)
previousValue = None
lengthCoverage = len(coverage[0])
for tileIndex in range(lengthCoverage):
tileCoverage = []
for index in range(len(bamOrBwFileList)):
if smoothLength > 0:
vectorStart, vectorEnd = getSmoothRange(
tileIndex, tileSize, smoothLength, lengthCoverage)
tileCoverage.append(
np.mean(coverage[index][vectorStart:vectorEnd]))
else:
try:
tileCoverage.append(coverage[index][tileIndex])
except IndexError:
sys.exit("Chromosome {} probably not in one of the bigwig "
"files. Remove this chromosome from the bigwig file "
"to continue".format(chrom))
if skipZeroOverZero and np.sum(tileCoverage) == 0:
previousValue = None
continue
value = func(tileCoverage, funcArgs)
if fixed_step:
writeStart = start + tileIndex * tileSize
writeEnd = min(writeStart + tileSize, end)
try:
_file.write(toBytes("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
writeEnd, value)))
except TypeError:
_file.write(toBytes("{}\t{}\t{}\t{}\n".format(chrom, writeStart,
writeEnd, value)))
else:
if previousValue is None:
writeStart = start + tileIndex * tileSize
writeEnd = min(writeStart + tileSize, end)
previousValue = value
elif previousValue == value:
writeEnd = min(writeEnd + tileSize, end)
elif previousValue != value:
if not np.isnan(previousValue):
_file.write(
toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart,
writeEnd, previousValue)))
previousValue = value
writeStart = writeEnd
writeEnd = min(writeStart + tileSize, end)
if not fixed_step:
# write remaining value if not a nan
if previousValue and writeStart != end and \
not np.isnan(previousValue):
_file.write(toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart,
end, previousValue)))
tempFileName = _file.name
_file.close()
return chrom, start, end, tempFileName
def writeBedGraph(
bamOrBwFileList, outputFileName, fragmentLength,
func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1,
format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False):
r"""
Given a list of bamfiles, a function and a function arguments,
this method writes a bedgraph file (or bigwig) file
for a partition of the genome into tiles of given size
and a value for each tile that corresponds to the given function
and that is related to the coverage underlying the tile.
"""
bamHandles = []
mappedList = []
for indexedFile, fileFormat in bamOrBwFileList:
if fileFormat == 'bam':
bam, mapped, unmapped, stats = bamHandler.openBam(indexedFile, returnStats=True, nThreads=numberOfProcessors)
bamHandles.append(bam)
mappedList.append(mapped)
if len(bamHandles):
genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList)
# check if both bam files correspond to the same species
# by comparing the chromosome names:
chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
else:
genomeChunkLength = int(10e6)
cCommon = []
chromNamesAndSize = {}
for fileName, fileFormat in bamOrBwFileList:
if fileFormat == 'bigwig':
fh = pyBigWig.open(fileName)
else:
continue
for chromName, size in list(fh.chroms().items()):
if chromName in chromNamesAndSize:
cCommon.append(chromName)
if chromNamesAndSize[chromName] != size:
print("\nWARNING\n"
"Chromosome {} length reported in the "
"input files differ.\n{} for {}\n"
"{} for {}.\n\nThe smallest "
"length will be used".format(
chromName, chromNamesAndSize[chromName],
bamOrBwFileList[0][0], size, fileName))
chromNamesAndSize[chromName] = min(
chromNamesAndSize[chromName], size)
else:
chromNamesAndSize[chromName] = size
fh.close()
# get the list of common chromosome names and sizes
chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
if k in cCommon]
if region:
# in case a region is used, append the tilesize
region += ":{}".format(tileSize)
res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
func, funcArgs, extendPairedEnds, smoothLength,
skipZeroOverZero, missingDataAsZero, fixed_step),
writeBedGraph_wrapper,
chromNamesAndSize,
genomeChunkLength=genomeChunkLength,
region=region,
blackListFileName=blackListFileName,
numberOfProcessors=numberOfProcessors,
verbose=verbose)
# Determine the sorted order of the temp files
chrom_order = dict()
for i, _ in enumerate(chromNamesAndSize):
chrom_order[_[0]] = i
res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
res.sort()
if format == 'bedgraph':
of = open(outputFileName, 'wb')
for r in res:
if r is not None:
_ = open(r[3], 'rb')
shutil.copyfileobj(_, of)
_.close()
os.remove(r[3])
of.close()
else:
bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)
|