File: SES_scaleFactor.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import os
import sys

import numpy as np

# own packages
from deeptools import bamHandler
import deeptools.countReadsPerBin as countR

old_settings = np.seterr(all='ignore')
debug = 0


def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', blackListFileName=None, numberOfProcessors=1,
                        verbose=False, chrsToSkip=None, mappingStatsList=None):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions, then collects and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, used to normalize the data.
        For a value of 1, on average
        one read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        names of the chromosomes to be excluded from the
        scale factor estimation. Typically chrX is included in this list.
    blackListFileName : str
        BED file containing blacklisted regions
    mappingStatsList : list
        List of the number of mapped reads per file

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples,  1)
    >>> _dict['size_factors']
    array([1. , 0.5])
    >>> _dict['size_factors_based_on_mean']
    array([1. , 0.5])
    """

    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    # guard against shared mutable default arguments
    chrsToSkip = chrsToSkip if chrsToSkip is not None else []
    mappingStatsList = mappingStatsList if mappingStatsList is not None else []

    if len(mappingStatsList) == len(bamFilesList):
        mappedReads = mappingStatsList
    else:
        mappedReads = []
        for fname in bamFilesList:
            mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])

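    # size factors from sequencing depth: the sample with the fewest
    # mapped reads is scaled to 1.0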
    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

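    # sample 'numberOfSamples' genomic positions and count the reads
    # overlapping each bin of 'binLength' bp in every BAM file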
    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # transpose so that each sample's counts form a row,
    # making it easy to iterate per sample
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the totals (and thus the maximum values
    # of the cumulative sums); p and q are each normalized by them
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # fewer peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases maxIndex maps to a zero value;
        # in that case, the next index is used until
        # a non-zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # maxIndex may be too close to the signal regions, which is why
    # the more conservative (lower) rank was chosen above (maxIndex * 0.8)

    # SES size factors: the sample with the smaller cumulative sum
    # at maxIndex is scaled to 1.0 (cumSum comes from the loop above)
    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only read counts below the 90th percentile
    # to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

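    # SES factors are returned as the default 'size_factors'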
    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
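

# The following helper is not part of the original module; it is a minimal,
# self-contained sketch of the SES computation above on synthetic count
# vectors (no BAM files involved). The name `_ses_sketch` is hypothetical.
def _ses_sketch():
    rng = np.random.default_rng(0)
    # synthetic per-bin counts: 'chip' carries extra signal in a few bins
    chip = rng.poisson(5, 1000).astype(float)
    chip[:20] += rng.poisson(200, 20)
    input_ = rng.poisson(5, 1000).astype(float)
    # sort the counts, take cumulative sums and normalize them to [0, 1]
    p = np.sort(chip).cumsum()
    q = np.sort(input_).cumsum()
    diff = np.abs(p / p[-1] - q / q[-1])
    # the rank of maximal difference separates background from signal;
    # move to a lower rank, as done above, to stay in the background
    k = int(np.flatnonzero(diff == diff.max())[0] * 0.8)
    cumSum = np.array([float(p[k]), float(q[k])])
    # SES size factors: the smaller cumulative sum is scaled to 1.0
    return cumSum.min() / cumSum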


class Tester(object):

    def __init__(self):
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        global debug
        debug = 0
        self.chrom = '3R'
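

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes the
    # deeptools test data referenced by Tester are present on disk.
    test = Tester()
    factors = estimateScaleFactor([test.bamFile1, test.bamFile2],
                                  binLength=50, numberOfSamples=4,
                                  normalizationLength=1)
    print("size factors:", factors['size_factors'])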