File: bcbio_fastq_umi_prep.py

package info
bcbio 1.2.9-4
  • links: PTS, VCS
  • area: contrib
  • in suites: sid
  • size: 36,792 kB
  • sloc: python: 45,766; sh: 209; makefile: 159; xml: 129
#!/usr/bin/python3
"""Convert fastq inputs into paired inputs with UMIs in read names.

Handles three cases:

- Separate UMI read files (read 1, read 2, UMI)

  Usage:
    bcbio_fastq_umi_prep.py single <out basename> <read 1 fastq> <read 2 fastq> <umi fastq>
  or:
    bcbio_fastq_umi_prep.py autopair [<list> <of> <fastq> <files>]

- Duplex barcodes where the tags are incorporated into read 1 and read 2

  Usage:
    bcbio_fastq_umi_prep.py single --tag1 5 --tag2 5 <out basename> <read 1 fastq> <read 2 fastq>
  or:
    bcbio_fastq_umi_prep.py autopair --tag1 5 --tag2 5 [<list> <of> <fastq> <files>]

- Dragen UMI output, where the UMI is already the last colon-separated field of
  the read name: prepends UMI_ to that field in both reads of each pair

  Usage:
    bcbio_fastq_umi_prep.py dragen [<list> <of> <fastq> <files>]

Creates two fastq files with embedded UMIs: <out_basename>_R1.fq.gz and
<out_basename>_R2.fq.gz, or a directory of fastq files with UMIs added to the read names.

autopair assumes 3 sets of reads based on the way bcl2fastq exports index reads:

- R1 -- The first read pair
- R2 -- The UMI barcode reads
- R3 -- The second read pair

If you're using a different approach to generate the UMIs, please
maintain the same R1/R2/R3 naming scheme.
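
For example (hypothetical file names), an autopair input of

  Sample1_R1.fastq.gz  -- read 1
  Sample1_R2.fastq.gz  -- UMI reads
  Sample1_R3.fastq.gz  -- read 2

produces with_umis/Sample1_R1.fq.gz and with_umis/Sample1_R2.fq.gz with the
UMIs embedded in the read names.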
"""
from __future__ import print_function
import argparse
import math
import os
import sys
import gzip
import itertools

from bcbio import utils
from bcbio.bam import fastq
from bcbio.provenance import do
from bcbio.distributed.multi import run_multicore, zeromq_aware_logging

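# JSON regex transforms consumed by `umis fastqtransform`: named groups pull the
# read name, sequence and quality from reads 1/2, and the MB (molecular barcode)
# group captures the UMI from the separate UMI read.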
transform_json = r"""{
    "read1": "(?P<name>@.*)\\n(?P<seq>.*)\\n\\+(.*)\\n(?P<qual>.*)\\n",
    "read2": "(?P<name>@.*)\\n(?P<seq>.*)\\n\\+(.*)\\n(?P<qual>.*)\\n",
    "read3": "(@.*)\\n(?P<MB>.*)\\n\\+(.*)\\n(.*)\\n"
}
"""

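# Transforms for duplex/inline UMIs: the %s placeholders are filled with the tag
# lengths, so the leading tag bases (CB1/CB2 or MB groups) are captured from the
# start of each read and excluded from the reported sequence and quality.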
duplex_transform = r"""{
    "read1": "(?P<name>@.*)\\n(?P<CB1>.{%s})(?P<seq>.*)\\n\\+(.*)\\n(.{%s})(?P<qual>.*)\\n",
    "read2": "(?P<name>@.*)\\n(?P<CB2>.{%s})(?P<seq>.*)\\n\\+(.*)\\n(.{%s})(?P<qual>.*)\\n"
}"""

r1_transform = r"""{
    "read1": "(?P<name>@.*)\\n(?P<MB>.{%s})(?P<seq>.*)\\n\\+(.*)\\n(.{%s})(?P<qual>.*)\\n",
    "read2": "(?P<name>@.*)\\n(?P<seq>.*)\\n\\+(.*)\\n(?P<qual>.*)\\n",
}"""

r2_transform = r"""{
    "read1": "(?P<name>@.*)\\n(?P<seq>.*)\\n\\+(.*)\\n(?P<qual>.*)\\n",
    "read2": "(?P<name>@.*)\\n(?P<MB>.{%s})(?P<seq>.*)\\n\\+(.*)\\n(.{%s})(?P<qual>.*)\\n"
}"""

def run_single(args):
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if args.umi_fq and not tags:
        add_umis_to_fastq(args.out_base, args.read1_fq, None, args.umi_fq, tags, cores=args.cores)
        add_umis_to_fastq(args.out_base, None, args.read2_fq, args.umi_fq, tags, cores=args.cores)
    else:
        add_umis_to_fastq(args.out_base, args.read1_fq, args.read2_fq, args.umi_fq, tags, cores=args.cores)

@utils.map_wrap
@zeromq_aware_logging
def add_umis_to_fastq_parallel(out_base, read1_fq, read2_fq, umi_fq, tags, cores, config):
    add_umis_to_fastq(out_base, read1_fq, read2_fq, umi_fq, tags, cores)

def add_umis_to_fastq(out_base, read1_fq, read2_fq, umi_fq, tags=None, cores=1):
    print("Adding UMIs from", umi_fq, "to read headers in", " ".join([x for x in [read1_fq, read2_fq] if x]))
    out1_fq = out_base + "_R1.fq.gz"
    out2_fq = out_base + "_R2.fq.gz"
    if umi_fq and not tags:
        if read1_fq:
            assert not read2_fq
            return _add_umis_with_fastp(read1_fq, umi_fq, out1_fq, cores)
        else:
            assert read2_fq
            return _add_umis_with_fastp(read2_fq, umi_fq, out2_fq, cores)

    transform_json_file = out_base + "-transform.json"
    with open(transform_json_file, "w") as out_handle:
        if tags:
            tag1, tag2 = tags
            if tag1 and tag2:
                out_handle.write(duplex_transform % (tag1, tag1, tag2, tag2))
            elif tag1:
                out_handle.write(r1_transform % (tag1, tag1))
            else:
                out_handle.write(r2_transform % (tag2, tag2))
        else:
            out_handle.write(transform_json)
    with utils.open_gzipsafe(read1_fq) as in_handle:
        ex_name = in_handle.readline().split(" ")
        fastq_tags_arg = "--keep_fastq_tags" if len(ex_name) == 2 else ""
    tag_arg = "--separate_cb" if tags else ""
    cmd = ("umis fastqtransform {fastq_tags_arg} {tag_arg} "
           "--fastq1out >(bgzip --threads {cores} -c > {out1_fq}) "
           "--fastq2out >(bgzip --threads {cores} -c > {out2_fq}) "
           "{transform_json_file} {read1_fq} "
           "{read2_fq}")
    if umi_fq:
        cmd += " {umi_fq}"
    do.run(cmd.format(**locals()), "Add UMIs to paired fastq files")

    os.remove(transform_json_file)

def _add_umis_with_fastp(read_fq, umi_fq, out_fq, cores):
    """Add UMIs to reads from separate UMI file using fastp.
    """
    with utils.open_gzipsafe(umi_fq) as in_handle:
        in_handle.readline()  # name
        umi_size = len(in_handle.readline().strip())
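    # -Q/-A/-L/-G disable quality filtering, adapter trimming, length filtering
    # and polyG trimming, so reads pass through unchanged apart from the UMI transfer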
    cmd = ("fastp -Q -A -L -G -w 1 --in1 {read_fq} --in2 {umi_fq} "
           "--umi --umi_prefix UMI --umi_loc read2 --umi_len {umi_size} "
           "--out1 >(bgzip --threads {cores} -c > {out_fq}) --out2 /dev/null "
           "-j /dev/null -h /dev/null")
    do.run(cmd.format(**locals()), "Add UMIs to fastq file with fastp")

def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if not tags:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
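        # e.g. with 8 cores this gives process_cores = 4 and overall_processes = 2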
        target_processes = 2
        process_cores = max(1, (args.cores // target_processes) + (args.cores % target_processes))
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))
    else:
        process_cores = 1
        overall_processes = args.cores
    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            r3 = None
            for test_r3 in extras:
                if (_commonprefix([r1, test_r3]) == target and _commonprefix([r2, test_r3]) == target):
                    r3 = test_r3
                    break
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            ready_to_run.append([base_name, r1, None, umi, None, process_cores, {"algorithm": {}, "resources": {}}])
            ready_to_run.append([base_name, None, r2, umi, None, process_cores, {"algorithm": {}, "resources": {}}])
        else:
            ready_to_run.append([base_name, r1, r2, umi, tags, process_cores, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)

def _add_umi_str(fq_header):
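    """Prepend UMI_ to the last colon-separated field of a Dragen read name.

    e.g. (illustrative header)
    '@A00001:11:FLOW:1:1101:1000:1000:ATTCCG 1:N:0:ACGT'
    -> '@A00001:11:FLOW:1:1101:1000:1000:UMI_ATTCCG 1:N:0:ACGT'
    """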
    ar = fq_header.split(" ")
    ar1 = ar[0].split(":")
    prefix = ar1[:-1]
    suffix = ar1[-1]
    new_suffix = "UMI_" + suffix
    ending = ar[1].strip()
    new_header = ":".join(prefix) + ":" + new_suffix + " " + ending
    return new_header

def run_dragen(args):
    to_run = []
    outdir = utils.safe_makedir(args.outdir)
    for fnames in fastq.combine_pairs(sorted(args.files)):
        to_run.append(fnames)
    for r1, r2 in to_run:
        out1_fq = os.path.join(outdir, os.path.basename(r1))
        out2_fq = os.path.join(outdir, os.path.basename(r2))
        n = 0
        with utils.open_gzipsafe(r1) as r1_handle, \
             utils.open_gzipsafe(r2) as r2_handle, \
             gzip.open(out1_fq, "wb") as out1_handle, \
             gzip.open(out2_fq, "wb") as out2_handle:
            for line1, line2 in itertools.zip_longest(r1_handle, r2_handle):
                if line1 is not None:
                    if n % 4 == 0:
                        # parse header line
                        new_header1 = _add_umi_str(line1) + "\n"
                        new_header2 = _add_umi_str(line2) + "\n"
                        out1_handle.write(new_header1.encode())
                        out2_handle.write(new_header2.encode())
                    else:
                        out1_handle.write(line1.encode())
                        out2_handle.write(line2.encode())
                    n += 1

def _find_umi(files):
    """Find UMI file using different naming schemes.

    R1/R2/R3 => R1/R3 with R2 UMI
    R1/R2/I1 => R1/R2 with I1 UMI
    """
    base = os.path.basename(_commonprefix(files))

    def _file_ext(f):
        exts = utils.splitext_plus(os.path.basename(f).replace(base, ""))[0].split("_")
        exts = [x for x in exts if x]
        return exts[0]

    exts = dict([(_file_ext(f), f) for f in files])
    if "I1" in exts:
        return exts["R1"], exts["R2"], exts["I1"]
    else:
        assert "R3" in exts, exts
        return exts["R1"], exts["R3"], exts["R2"]

def _commonprefix(files):
    """Retrieve a common prefix for files without extra _R1 _I1 extensions.

    Allows alternative naming schemes (R1/R2/R3) (R1/R2/I1).
    """
    out = os.path.commonprefix(files)
    # str.rstrip treats its argument as a character set, so remove the trailing
    # _R/_I markers explicitly to avoid also stripping sample name characters
    if out.endswith("_R") or out.endswith("_I"):
        out = out[:-2]
    return out.rstrip("_")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Add UMIs to fastq read names")
    sp = parser.add_subparsers(title="[sub-commands]")

    p = sp.add_parser("autopair", help="Automatically pair R1/R2 (and maybe R3) fastq inputs")
    p.add_argument("-c", "--cores", default=1, type=int, help="Number of cores, allowing running samples in parallel")
    p.add_argument("--outdir", default="with_umis", help="Output directory to write UMI prepped fastqs")
    p.add_argument("--tag1", help="Duplex read 1 tag -- bases to trim from 5' end")
    p.add_argument("--tag2", help="Duplex read 2 tag -- bases to trim from 5' end")
    p.add_argument("files", nargs="*", help="All fastq files to pair and process")
    p.set_defaults(func=run_autopair)

    p = sp.add_parser("single", help="Run single set of fastq files with UMIs/duplexes")
    p.add_argument("-c", "--cores", default=1, type=int, help="Number of cores to use for parallel bgzip")
    p.add_argument("--tag1", help="Duplex read 1 tag -- bases to trim from 5' end", type=int)
    p.add_argument("--tag2", help="Duplex read 2 tag -- bases to trim from 5' end", type=int)
    p.add_argument("out_base", help="Base name for output files -- you get <base_name>_R1.fq.gz")
    p.add_argument("read1_fq", help="Input fastq, read 1")
    p.add_argument("read2_fq", help="Input fastq, read 2")
    p.add_argument("umi_fq", help="Input fastq, UMIs", nargs="?")
    p.set_defaults(func=run_single)

    p = sp.add_parser("dragen", help="Make Dragen UMI input suitable for bcbio")
    p.add_argument("--outdir", default="with_umis", help="Output directory to write UMI prepped fastqs")
    p.add_argument("files", nargs="*", help="All fastq files to pair and process")
    p.set_defaults(func=run_dragen)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    args = parser.parse_args()
    args.func(args)