File: guess_baits.py

#!/usr/bin/python3
"""Guess the coordinates of captured regions from sample read depths.

Two approaches available:

- (Faster) Scan a given list of exons and/or other potentially targeted regions.
  The script checks each region and drops those with very low coverage
  indicating they were not captured.
- (Slower) Scan the entire genome, or the given sequencing-accessible regions,
  for regions with elevated coverage. Choose reasonable boundaries for each
  apparently captured region.

Use multiple BAMs for greater robustness in detecting targeted regions, as a
single sample may have poor coverage at some targets by chance.
Even so, this script is not guaranteed to detect all targets correctly.

See also: https://github.com/brentp/goleft
"""
import argparse
import collections
import logging
import subprocess
import sys

import numpy as np
import pandas as pd

import cnvlib
from cnvlib import parallel
from cnvlib.descriptives import modal_location
from skgenome import tabio, GenomicArray as GA

logging.basicConfig(level=logging.INFO, format="%(message)s")


# ___________________________________________
# Guided method: guess from potential targets

def filter_targets(target_bed, sample_bams, procs, fasta):
    """Check if each potential target has significant coverage."""
    try:
        baits = tabio.read(target_bed, 'bed4')
    except Exception as exc:
        raise RuntimeError("Targets must be in BED format; "
                           "try skg_convert.py") from exc
    logging.info("Loaded %d candidate regions from %s", len(baits), target_bed)
    # Loop over BAMs to average each bin's coverage depth across all samples
    total_depths = np.zeros(len(baits), dtype=np.float64)
    for bam_fname in sample_bams:
        logging.info("Evaluating targets in %s", bam_fname)
        sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs,
                                    fasta=fasta)
        assert len(sample) == len(baits), \
                "%d != %d" % (len(sample), len(baits))
        total_depths += sample['depth'].values
    baits['depth'] = total_depths / len(sample_bams)
    logging.info("Average candidate-target depth:\n%s",
                 baits['depth'].describe())
    return baits


# _________________________________________
# Unguided method: guess from raw depths

def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length,
                 procs):
    """Estimate baited regions from a genome-wide, per-base depth profile."""
    bait_chunks = []
    # ENH: context manager to call rm on bed chunks? with to_chunks as pool, ck?
    logging.info("Scanning for enriched regions in:\n  %s",
                 '\n  '.join(sample_bams))
    #  with futures.ProcessPoolExecutor(procs) as pool:
    with parallel.pick_pool(procs) as pool:
        args_iter = ((bed_chunk, sample_bams, min_depth, min_gap, min_length)
                     for bed_chunk in parallel.to_chunks(access_bed))
        for bed_chunk_fname, bait_chunk in pool.map(_scan_depth, args_iter):
            bait_chunks.append(bait_chunk)
            parallel.rm(bed_chunk_fname)
    baits = GA(pd.concat(bait_chunks))
    baits['depth'] /= len(sample_bams)
    return baits


def _scan_depth(args):
    """Wrapper for parallel map"""
    bed_fname, bam_fnames, min_depth, min_gap, min_length = args
    regions = list(drop_small(merge_gaps(scan_depth(bed_fname, bam_fnames,
                                                    min_depth),
                                         min_gap),
                              min_length))
    # Guard against chunks with no enriched regions, where regions[0]
    # would raise IndexError
    columns = (regions[0]._fields if regions
               else ('chromosome', 'start', 'end', 'depth'))
    result = pd.DataFrame.from_records(regions, columns=columns)
    return bed_fname, result


def scan_depth(bed_fname, bam_fnames, min_depth):
    """Locate sub-regions with enriched read depth in the given regions.

    Yields
    ------
    Region : namedtuple
        Coordinates of each enriched sub-region (0-indexed, half-open) plus
        its mean depth: chromosome, start, end, depth
    """
    Region = collections.namedtuple('Region', 'chromosome start end depth')

    def trim_region(chrom, start, depths):
        """Trim a run's flanks to where depth reaches half of its maximum."""
        darr = np.array(depths)
        half_depth = 0.5 * darr.max()
        ok_dp_idx = np.nonzero(darr >= half_depth)[0]
        start_idx = ok_dp_idx[0]
        end_idx = ok_dp_idx[-1] + 1
        return Region(chrom, start + start_idx, start + end_idx,
                      darr[start_idx:end_idx].mean())

    nsamples = len(bam_fnames)
    if nsamples == 1:
        def get_depth(depths):
            return int(depths[0])
    else:
        min_depth *= nsamples
        # NB: samtools emits additional BAMs' depths as trailing columns
        def get_depth(depths):
            return sum(map(int, depths))
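        # e.g. with 3 BAMs and min_depth=5, a position counts as enriched
        # only if its depths summed across all samples reach 15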

    proc = subprocess.Popen(['samtools', 'depth',
                             '-Q', '1',  # Min. mapping quality 1: skip
                                         # ambiguously mapped reads, e.g.
                                         # pseudogene alignments
                             '-b', bed_fname,
                            ] + bam_fnames,
                            stdout=subprocess.PIPE,
                            universal_newlines=True,  # Stream text, not bytes
                            shell=False)
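    # Each line of `samtools depth` output is tab-separated:
    #   <chromosome>  <1-based position>  <depth per BAM, one column each>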

    # Detect runs of >= min_depth; emit their coordinates
    chrom = start = depths = None
    for line in proc.stdout:
        fields = line.split('\t')
        depth = get_depth(fields[2:])
        is_enriched = (depth >= min_depth)
        if start is None:
            if is_enriched:
                # Entering a new captured region
                chrom = fields[0]
                start = int(fields[1]) - 1
                depths = [depth]
            else:
                # Still not in a captured region
                continue
        elif is_enriched and fields[0] == chrom:
            # Still in a captured region -- extend it
            depths.append(depth)
        else:
            # Exiting a captured region -- trim its flanks and emit it
            yield trim_region(chrom, start, depths)
            if is_enriched:
                # Already entering another run (e.g. on a new chromosome)
                chrom = fields[0]
                start = int(fields[1]) - 1
                depths = [depth]
            else:
                chrom = start = depths = None
    if start is not None:
        # Emit the last run if the depth profile ended inside one
        yield trim_region(chrom, start, depths)


def merge_gaps(regions, min_gap):
    """Merge same-chromosome regions separated by gaps smaller than min_gap."""
    prev = next(regions, None)
    if prev is None:
        # No enriched regions in this chunk
        return
    for reg in regions:
        if (reg.chromosome == prev.chromosome
                and reg.start - prev.end < min_gap):
            # Merge
            prev = prev._replace(end=reg.end)
        else:
            # Done merging; move on
            yield prev
            prev = reg
    # Residual
    yield prev


def drop_small(regions, min_length):
    """Merge small gaps and filter by minimum length."""
    return (reg for reg in regions
            if reg.end - reg.start >= min_length)
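# For example, with the defaults below (min_gap=25, min_length=50), enriched
# runs at 1000-1100 and 1110-1130 merge across their 10-bp gap into 1000-1130,
# while an isolated 40-bp run is dropped as too short.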


# ___________________________________________
# Shared

def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1):
    """Normalize bin depths to the modal depth; drop bins below enrich_ratio."""
    # Normalize depths to a neutral value of 1.0
    dp_mode = modal_location(baits.data.loc[baits['depth'] > min_depth,
                                            'depth'].values)
    norm_depth = baits['depth'] / dp_mode
    # Drop low-coverage targets
    keep_idx = (norm_depth >= enrich_ratio)
    logging.info("Keeping %d/%d bins with coverage depth >= %f, modal depth %f",
                 keep_idx.sum(), len(keep_idx), dp_mode * enrich_ratio, dp_mode)
    return baits[keep_idx]
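# For example, if the modal per-bin depth is 200x and enrich_ratio is 0.1,
# bins averaging less than 20x are treated as uncaptured and dropped.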



if __name__ == '__main__':
    AP = argparse.ArgumentParser(description=__doc__)
    AP.add_argument('sample_bams', nargs='+',
                    help="""Sample BAM file(s) to test for target coverage.""")
    AP.add_argument('-o', '--output', metavar='FILENAME',
                    help="""The inferred targets, in BED format.""")
    AP.add_argument('-c', '--coverage', metavar='FILENAME',
                    help="""Filename to output average coverage depths in .cnn
                    format.""")
    AP.add_argument('-p', '--processes', metavar='CPU',
                    nargs='?', type=int, const=0, default=1,
                    help="""Number of subprocesses to segment in parallel.
                    If given without an argument, use the maximum number
                    of available CPUs. [Default: use 1 process]""")
    AP.add_argument('-f', '--fasta', metavar="FILENAME",
                    help="""Reference genome, FASTA format (e.g. UCSC
                    hg19.fa)""")

    AP_x = AP.add_mutually_exclusive_group(required=True)
    AP_x.add_argument('-t', '--targets', metavar='TARGET_BED',
                    help="""Potentially targeted genomic regions, e.g. all known
                    exons in the reference genome, in BED format. Each of these
                    regions will be tested as a whole for enrichment. (Faster
                    method)""")
    AP_x.add_argument('-a', '--access', metavar='ACCESS_BED',
                    # default="../data/access-5k-mappable.grch37.bed",
                    help="""Sequencing-accessible genomic regions (e.g. from
                    'cnvkit.py access'), or known genic regions in the reference
                    genome, in BED format. All bases will be tested for
                    enrichment. (Slower method)""")

    AP_target = AP.add_argument_group("With --targets only")
    AP_target.add_argument('-d', '--min-depth', metavar='DEPTH',
                    type=int, default=5,
                    help="""Minimum sequencing read depth to accept as captured.
                    [Default: %(default)s]""")

    AP_access = AP.add_argument_group("With --access only")
    AP_access.add_argument('-g', '--min-gap', metavar='GAP_SIZE',
                    type=int, default=25,
                    help="""Merge regions separated by gaps smaller than this.
                    [Default: %(default)s]""")
    AP_access.add_argument('-l', '--min-length', metavar='TARGET_SIZE',
                    type=int, default=50,
                    help="""Minimum region length to accept as captured.
                    [Default: %(default)s]""")

    args = AP.parse_args()

    # ENH: can we reserve multiple cores for htslib?
    if args.processes < 1:
        args.processes = None

    if args.targets:
        baits = filter_targets(args.targets, args.sample_bams, args.processes,
                               args.fasta)
    else:
        baits = scan_targets(args.access, args.sample_bams,
                             0.5 * args.min_depth,  # More sensitive 1st pass
                             args.min_gap, args.min_length, args.processes)
    baits = normalize_depth_log2_filter(baits, args.min_depth)
    tabio.write(baits, args.output or sys.stdout, 'bed')
    if args.coverage:
        baits['log2'] = np.log2(baits['depth'] / baits['depth'].median())
        tabio.write(baits, args.coverage, 'tab')