File: getScorePerBigWigBin.py

package info (click to toggle)
python-deeptools 3.5.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 34,456 kB
  • sloc: python: 14,503; xml: 4,212; sh: 33; makefile: 5
file content (322 lines) | stat: -rw-r--r-- 11,967 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
import pyBigWig
import numpy as np
import os
import sys
import shutil
import warnings

# deepTools packages
import deeptools.mapReduce as mapReduce
import deeptools.utilities
# debug = 0

old_settings = np.seterr(all='ignore')


def countReadsInRegions_wrapper(args):
    """Tuple-unpacking shim for multiprocessing.

    Pool map functions pass a single argument to the callable, so this
    forwards the packed argument tuple to the multi-argument worker.
    """
    return countFragmentsInRegions_worker(*args)


def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None
                                   ):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        chromosome name; if absent from a bigwig, a 'chr' prefix is
        added/removed before giving up.
    start, end : int
        interval to sample; start must be smaller than end.
    bigWigFiles : list of str
        paths of the bigwig files to score.
    stepSize : int
        distance between consecutive bin start positions.
    binLength : int
        width of each sampled bin.
    save_data : bool
        when True the per-region scores are also written to a temporary
        BED-like file whose name is returned.
    bedRegions : list, optional
        [[chrom, [(start, end), ...]], ...]; when given, these regions
        are scored instead of regularly spaced bins.

    Returns
    -------
    (numpy.ndarray, str)
        matrix of shape (num regions, num bigwig files), plus the name
        of the temporary file ('' unless save_data is True).

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[1.5],
           [1.5]])

    BED regions:
    >>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[1. , 1.5, 2. ],
           [1. , 1. , 2. ]])
    """
    assert start < end, "start {} bigger that end {}".format(start, end)

    # scores for all regions, flattened row-major (region, bigwig file)
    sub_score_per_bin = []
    rows = 0

    # open one handle per file; closed in the finally block below so
    # workers do not leak file descriptors (the originals were never closed)
    bigwig_handles = [pyBigWig.open(fname) for fname in bigWigFiles]

    # Build the list of regions to score. Each region is a list of
    # (start, end) exons so that BED12/exon input is handled uniformly.
    regions_to_consider = []
    if bedRegions:
        for reg in bedRegions:
            regions_to_consider.append([(exon[0], exon[1]) for exon in reg[1]])
    else:
        for i in range(start, end, stepSize):
            if (i + binLength) > end:
                regions_to_consider.append([(i, end)])  # last bin (may be smaller)
            else:
                regions_to_consider.append([(i, i + binLength)])

    if save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file = None
        _file_name = ''
    warnings.simplefilter("default")
    try:
        for reg in regions_to_consider:
            avgReadsArray = []

            for idx, bwh in enumerate(bigwig_handles):
                if chrom not in bwh.chroms():
                    unmod_name = chrom
                    if chrom.startswith('chr'):
                        # remove the chr part from chromosome name
                        chrom = chrom[3:]
                    else:
                        # prefix with 'chr' the chromosome name
                        chrom = 'chr' + chrom
                    if chrom not in bwh.chroms():
                        sys.exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

                weights = []
                scores = []
                for exon in reg:
                    weights.append(exon[1] - exon[0])
                    score = bwh.stats(chrom, exon[0], exon[1])

                    # regions without coverage yield None / [None]; record NaN
                    if score is None or score == [None] or np.isnan(score[0]):
                        score = [np.nan]
                    scores.extend(score)
                # length-weighted mean of fragment coverage for the region
                avgReadsArray.append(np.average(scores, weights=weights))

            sub_score_per_bin.extend(avgReadsArray)
            rows += 1
            if save_data:
                starts = ",".join(str(exon[0]) for exon in reg)
                ends = ",".join(str(exon[1]) for exon in reg)
                _file.write("\t".join(map(str, [chrom, starts, ends])) + "\t")
                _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")
    finally:
        # release resources even when a worker fails midway
        if _file is not None:
            _file.close()
        for bwh in bigwig_handles:
            bwh.close()
    warnings.resetwarnings()

    # the output is a matrix having as many rows as the variable 'row'
    # and as many columns as bigwig files. The rows correspond to
    # each of the regions processed by the worker.
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]]
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name


def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from bigWig file with pyBigWig

    Returns a tuple (sorted list of common (name, size) pairs,
    set of (name, size) pairs not shared by all files).

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))
    """
    def print_chr_names_and_size(chr_set):
        # pretty-print a set of (name, size) pairs to stderr
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    # defensive copy: do not mutate the caller's list
    bigwigFilesList = bigwigFilesList[:]

    # start from the union of all (name, size) pairs ...
    common_chr = set()
    for fname in bigwigFilesList:
        fh = pyBigWig.open(fname)
        common_chr = common_chr.union(set(fh.chroms().items()))
        fh.close()

    # ... then intersect it with each file, collecting mismatches
    non_common_chr = set()
    for bw in bigwigFilesList:
        fh = pyBigWig.open(bw)
        _names_and_size = set(fh.chroms().items())
        # close the handle (it was leaked in the original implementation)
        fh.close()
        if len(common_chr & _names_and_size) == 0:
            #  try to add remove 'chr' from the chromosme name
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and same assemblies?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                exit(1)
            else:
                _names_and_size = _corr_names_size

        # symmetric difference accumulates pairs unique to either side
        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # get the list of common chromosome names and sizes
    return sorted(common_chr), non_common_chr


def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=None,
                   out_file_for_raw_data=None,
                   allArgs=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Parameters
    ----------
    bigWigFiles : list of str
        bigwig file paths; they must share chromosome names/sizes.
    binLength : int
        width of each sampled bin.
    numberOfProcessors : int
        workers used by mapReduce.
    verbose : bool
        print the effective step size.
    region : str, optional
        restrict the computation to this region ("chrom:start:end").
    bedFile, blackListFileName : optional
        forwarded to mapReduce for region selection/filtering.
    stepSize : int, optional
        distance between bin starts; defaults to binLength (adjacent bins).
    chrsToSkip : list of str, optional
        chromosome names to exclude (was a mutable default argument).
    out_file_for_raw_data : str, optional
        when given, the per-bin raw scores are concatenated into this file.
    allArgs : argparse.Namespace, optional
        source of the GTF-related options.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files
    # if too long, some processors end up free.
    # the following is a heuristic

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy  in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used; integer (floor)
    # division keeps chunkSize an int (true division produced a float
    # after the Python 3 port)
    chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)
    # mapReduce( (staticArgs), func, chromSize, etc. )
    save_file = bool(out_file_for_raw_data)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files into the requested output
        with open(out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    with open(tempFileName, 'r') as f:
                        shutil.copyfileobj(f, ofile)
                    os.remove(tempFileName)

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin


class Tester(object):
    """Holds the paths and chromosome name of the bundled test bigwigs."""

    def __init__(self):
        """
        The two bigWig files correspond to the following bedgraphs:
        $ cat /tmp/testA.bg
        3R      0       100     1
        3R      100     200     2

        $ cat /tmp/testB.bg
        3R      0       150     1
        3R      150     200     3

        They cover 200 bp:

              0              50              100            150            200
              |------------------------------------------------------------|
            A  111111111111111111111111111111122222222222222222222222222222


            B  111111111111111111111111111111111111111111111333333333333333

        """
        # directory that ships alongside this module with the test data
        test_data_dir = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        self.root = test_data_dir
        self.bwFile1 = test_data_dir + "testA.bw"
        self.bwFile2 = test_data_dir + "testB.bw"
        self.bwFile_PE = test_data_dir + "test_paired2.bw"
        self.chrom = '3R'