File: subsample.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (71 lines) | stat: -rwxr-xr-x 2,345 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/python3

import argparse
import gzip
import os
import os.path
import subprocess
import sys

parser = argparse.ArgumentParser(description="Subsample reads for highly-covered organisms")

parser.add_argument("file", type=str, help=".info file with chosen samples")
parser.add_argument("reads", type=str, help="Input reads directory")
parser.add_argument("output", type=str, help="Output directory")

args = parser.parse_args()

MAX_COV = 100

def reads_file(sample, suffix=""):
    return os.path.join(args.reads, sample + suffix + ".fastq.gz")

with open(args.file) as samples_info:
    samples = []
    total = 0
    for line in samples_info:
        sample_data = line.split()
        if sample_data[0][0] == "+":
            sample = sample_data[0][1:]
            if not os.path.exists(reads_file(sample, "_1")):
                print("\033[33mWarning: {} contains no reads\033[0m".format(sample))
                continue
            cov = float(sample_data[1])
            samples.append(sample)
            total += cov

frac = MAX_COV / total
if frac > 1:
    frac = 1.0

output = open(os.path.join(args.output, "reads.info"), "w")

subtotal = total * frac
print(subtotal, file=output)

if frac < 1:
    print("Subsampling reads to reduce ", total, " to ", subtotal, "(", frac, "x)", sep="")

    filenames = tuple(os.path.join(args.output, basename + ".fq.gz") for name in ("left", "right"))

    def open_gzipped(filename):
        return subprocess.Popen(["gzip"], stdin=subprocess.PIPE, stdout=open(fullpath, "wb"))

    left_gz, right_gz = [open_gzipped(name) for name in filenames]

    def run_subsampler(file, reads, frac):
        params = ["seqtk", "sample", "-s100", reads, str(frac)]
        print("Calling", " ".join(params))
        subprocess.check_call(params, stdout=file.stdin, stderr=sys.stdout)

    for sample in samples:
        for file, reads in [(left_gz, os.path.join(args.reads, "{}_1.fastq.gz".format(sample))),
                            (right_gz, os.path.join(args.reads, "{}_2.fastq.gz".format(sample)))]:
            run_subsampler(file, reads, frac)

    files = [filenames]
else: #Nothing to subsample; reuse old files
    files = [(reads_file(sample, "_1"), reads_file(sample, "_2")) for sample in samples]

for left_file, right_file in files:
    print(left_file, right_file, file=output)