File: writeBedGraph.py

package info (click to toggle)
python-deeptools 3.5.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 34,456 kB
  • sloc: python: 14,503; xml: 4,212; sh: 33; makefile: 5
file content (354 lines) | stat: -rw-r--r-- 13,223 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
import os
import sys
import shutil
import numpy as np
import pyBigWig

# own modules
from deeptools import mapReduce
from deeptools.utilities import getCommonChrNames
import deeptools.countReadsPerBin as cr
from deeptools import bamHandler
from deeptools import utilities

debug = 0
old_settings = np.seterr(all='ignore')


def writeBedGraph_wrapper(args):
    """
    Passes the arguments to writeBedGraph_worker.
    This is a step required given
    the constrains from the multiprocessing module.
    The args var, contains as first element the 'self' value
    from the WriteBedGraph object

    """
    return WriteBedGraph.writeBedGraph_worker(*args)


class WriteBedGraph(cr.CountReadsPerBin):

    r"""Reads bam files coverages and writes a bedgraph or bigwig file

    Extends the CountReadsPerBin object such that the coverage
    of bam files is writen to multiple bedgraph files at once.

    The bedgraph files are later merge into one and converted
    into a bigwig file if necessary.

    The constructor arguments are the same as for CountReadsPerBin. However,
    when calling the `run` method, the following parameters have
    to be passed

    Examples
    --------

    Given the following distribution of reads that cover 200 on
    a chromosome named '3R'::

          0                              100                           200
          |------------------------------------------------------------|
        A                                ===============
                                                        ===============


        B                 ===============               ===============
                                         ===============
                                                        ===============

    >>> import tempfile
    >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"

    >>> outFile = tempfile.NamedTemporaryFile()
    >>> bam_file = test_path +  "testA.bam"

    For the example a simple scaling function is going to be used. This function
    takes the coverage found at each region and multiplies it to the scaling factor.
    In this case the scaling factor is 1.5

    >>> function_to_call = scaleCoverage
    >>> funcArgs = {'scaleFactor': 1.5}

    Restrict process to a region between positions 0 and 200 of chromosome 3R

    >>> region = '3R:0:200'

    Set up such that coverage is computed for consecutive bins of length 25 bp
    >>> bin_length = 25
    >>> step_size = 25

    >>> num_sample_sites = 0 #overruled by step_size
    >>> c = WriteBedGraph([bam_file], binLength=bin_length, region=region, stepSize=step_size)
    >>> c.run(function_to_call, funcArgs, outFile.name)
    >>> f = open(outFile.name, 'r')
    >>> f.readlines()
    ['3R\t0\t100\t0\n', '3R\t100\t200\t1.5\n']
    >>> f.close()
    >>> outFile.close()


    """

    def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
        r"""
        Given a list of bamfiles, a function and a function arguments,
        this method writes a bedgraph file (or bigwig) file
        for a partition of the genome into tiles of given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        getStats = len(self.mappedList) < len(self.bamFilesList)
        bam_handles = []
        for x in self.bamFilesList:
            if getStats:
                bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=self.numberOfProcessors)
                self.mappedList.append(mapped)
                self.statsList.append(stats)
            else:
                bam = bamHandler.openBam(x)
            bam_handles.append(bam)

        genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            if x in ["mappedList", "statsList"]:
                continue
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # Determine the sorted order of the temp files
        chrom_order = dict()
        for i, _ in enumerate(chrom_names_and_size):
            chrom_order[_[0]] = i
        res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
        res.sort()

        if format == 'bedgraph':
            out_file = open(out_file_name, 'wb')
            for r in res:
                if r[3]:
                    _foo = open(r[3], 'rb')
                    shutil.copyfileobj(_foo, out_file)
                    _foo.close()
                    os.remove(r[3])
            out_file.close()
        else:
            bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name)

    def writeBedGraph_worker(self, chrom, start, end,
                             func_to_call, func_args,
                             bed_regions_list=None):
        r"""Writes a bedgraph based on the read coverage found on bamFiles

        The given func is called to compute the desired bedgraph value
        using the funcArgs

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`.
        smoothLength : int
            Distance in bp for smoothing the coverage per tile.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        A list of [chromosome, start, end, temporary file], where the temporary file contains the bedgraph results for the region queried.

        Examples
        --------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> f = open(tempFile[3], 'r')
        >>> f.readlines()
        ['3R\t0\t100\t0\n', '3R\t100\t200\t1\n']
        >>> f.close()
        >>> os.remove(tempFile[3])


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        coverage, _ = self.count_reads_in_region(chrom, start, end)

        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        previous_value = None
        line_string = "{}\t{}\t{}\t{:g}\n"
        for tileIndex in range(coverage.shape[0]):

            if self.smoothLength is not None and self.smoothLength > 0:
                vector_start, vector_end = self.getSmoothRange(tileIndex,
                                                               self.binLength,
                                                               self.smoothLength,
                                                               coverage.shape[0])
                tileCoverage = np.mean(coverage[vector_start:vector_end, :], axis=0)
            else:
                tileCoverage = coverage[tileIndex, :]
            if self.skipZeroOverZero and np.sum(tileCoverage) == 0:
                continue

            value = func_to_call(tileCoverage, func_args)
            """
            # uncomment these lines if fixed step bedgraph is required
            if not np.isnan(value):
                writeStart = start + tileIndex * self.binLength
                writeEnd  =  min(writeStart + self.binLength, end)
                _file.write(line_string.format(chrom, writeStart,
                                               writeEnd, value))
            continue
            """

            if previous_value is None:
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value

            elif previous_value == value:
                writeEnd = min(writeEnd + self.binLength, end)

            elif previous_value != value:
                if not np.isnan(previous_value):
                    _file.write(
                        line_string.format(chrom, writeStart, writeEnd, previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan
        if previous_value is not None and writeStart != end and not np.isnan(previous_value):
            _file.write(line_string.format(chrom, writeStart,
                                           end, previous_value))

        tempfilename = _file.name
        _file.close()
        return chrom, start, end, tempfilename


def bedGraphToBigWig(chromSizes, bedGraphFiles, bigWigPath):
    """
    Takes a sorted list of bedgraph files and write them to a single bigWig file using pyBigWig.
    The order of bedGraphFiles must match that of chromSizes!
    """
    bw = pyBigWig.open(bigWigPath, "w")
    assert bw is not None
    bw.addHeader(chromSizes, maxZooms=10)
    lastChrom = None
    starts = []
    ends = []
    vals = []
    for bg in bedGraphFiles:
        if bg is not None:
            f = open(bg)
            for line in f:
                interval = line.split()
                # Buffer up to a million entries
                if interval[0] != lastChrom or len(starts) == 1000000:
                    if lastChrom is not None:
                        bw.addEntries([lastChrom] * len(starts), starts, ends=ends, values=vals)
                    lastChrom = interval[0]
                    starts = [int(interval[1])]
                    ends = [int(interval[2])]
                    vals = [float(interval[3])]
                else:
                    starts.append(int(interval[1]))
                    ends.append(int(interval[2]))
                    vals.append(float(interval[3]))
            f.close()
            os.remove(bg)
    if len(starts) > 0:
        bw.addEntries([lastChrom] * len(starts), starts, ends=ends, values=vals)
    bw.close()


def getGenomeChunkLength(bamHandles, tile_size, mappedList):
    """
    Tries to estimate the length of the genome sent to the workers
    based on the density of reads per bam file and the number
    of bam files.

    The chunk length should be a multiple of the tileSize

    """

    genomeLength = sum(bamHandles[0].lengths)

    max_reads_per_bp = max([float(x) / genomeLength for x in mappedList])

    # 2e6 is an empirical estimate
    genomeChunkLength = int(min(5e6, int(2e6 / (max_reads_per_bp * len(bamHandles)))))

    genomeChunkLength -= genomeChunkLength % tile_size
    return genomeChunkLength


def scaleCoverage(tile_coverage, args):
    """
    tileCoverage should be an list with only one element
    """
    return args['scaleFactor'] * tile_coverage[0]


def ratio(tile_coverage, args):
    """
    tileCoverage should be an list of two elements
    """
    return float(tile_coverage[0]) / tile_coverage[1]