File: writeBedGraph_bam_and_bw.py

package info (click to toggle)
python-deeptools 3.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 34,624 kB
  • sloc: python: 14,765; xml: 4,090; sh: 38; makefile: 11
file content (234 lines) | stat: -rw-r--r-- 8,951 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import os
import shutil
import tempfile
import numpy as np
import sys

# NGS packages
import pyBigWig

# own module
from deeptools import mapReduce
from deeptools.utilities import getCommonChrNames, toBytes
from deeptools.writeBedGraph import *
from deeptools import bamHandler

old_settings = np.seterr(all='ignore')


def getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize,
                          missingDataAsZero=False):
    try:
        coverage = np.asarray(bigwigHandle.values(chrom, start, end))
    except TypeError:
        # this error happens when chromosome
        # is not into the bigwig file
        return []
    if coverage is None:
        return []
    if missingDataAsZero is True:
        coverage[np.isnan(coverage)] = 0
    # average the values per bin
    cov = np.array(
        [np.mean(coverage[x:x + tileSize])
         for x in range(0, len(coverage), tileSize)])
    return cov


def writeBedGraph_wrapper(args):
    return writeBedGraph_worker(*args)


def writeBedGraph_worker(
        chrom, start, end, tileSize, defaultFragmentLength,
        bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
        skipZeroOverZero=False, missingDataAsZero=False, fixed_step=False):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs

    tileSize
    """
    if start > end:
        raise NameError("start position ({0}) bigger than "
                        "end position ({1})".format(start, end))

    coverage = []

    for indexFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bamHandle = bamHandler.openBam(indexFile)
            coverage.append(getCoverageFromBam(
                bamHandle, chrom, start, end, tileSize,
                defaultFragmentLength, extendPairedEnds,
                True))
            bamHandle.close()
        elif fileFormat == 'bigwig':
            bigwigHandle = pyBigWig.open(indexFile)
            coverage.append(
                getCoverageFromBigwig(
                    bigwigHandle, chrom, start, end,
                    tileSize, missingDataAsZero))
            bigwigHandle.close()

    _file = tempfile.NamedTemporaryFile(delete=False)

    previousValue = None
    lengthCoverage = len(coverage[0])
    for tileIndex in range(lengthCoverage):

        tileCoverage = []
        for index in range(len(bamOrBwFileList)):
            if smoothLength > 0:
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage.append(
                    np.mean(coverage[index][vectorStart:vectorEnd]))
            else:
                try:
                    tileCoverage.append(coverage[index][tileIndex])
                except IndexError:
                    sys.exit("Chromosome {} probably not in one of the bigwig "
                             "files. Remove this chromosome from the bigwig file "
                             "to continue".format(chrom))

        if skipZeroOverZero and np.sum(tileCoverage) == 0:
            previousValue = None
            continue

        value = func(tileCoverage, funcArgs)

        if fixed_step:
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            try:
                _file.write(toBytes("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                            writeEnd, value)))
            except TypeError:
                _file.write(toBytes("{}\t{}\t{}\t{}\n".format(chrom, writeStart,
                                                              writeEnd, value)))
        else:
            if previousValue is None:
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                writeEnd = min(writeEnd + tileSize, end)

            elif previousValue != value:
                if not np.isnan(previousValue):
                    _file.write(
                        toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart,
                                                                writeEnd, previousValue)))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

    if not fixed_step:
        # write remaining value if not a nan
        if previousValue and writeStart != end and \
                not np.isnan(previousValue):
            _file.write(toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart,
                                                                end, previousValue)))

    tempFileName = _file.name
    _file.close()
    return chrom, start, end, tempFileName


def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, unmapped, stats = bamHandler.openBam(indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if len(bamHandles):
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        cCommon = []
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat == 'bigwig':
                fh = pyBigWig.open(fileName)
            else:
                continue

            for chromName, size in list(fh.chroms().items()):
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "input files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bamOrBwFileList[0][0], size, fileName))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               skipZeroOverZero, missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # Determine the sorted order of the temp files
    chrom_order = dict()
    for i, _ in enumerate(chromNamesAndSize):
        chrom_order[_[0]] = i
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        of = open(outputFileName, 'wb')
        for r in res:
            if r is not None:
                _ = open(r[3], 'rb')
                shutil.copyfileobj(_, of)
                _.close()
                os.remove(r[3])
        of.close()
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)