File: choose_samples.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (81 lines) | stat: -rwxr-xr-x 2,780 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/python3

import argparse
from operator import itemgetter
import os.path
import subprocess
import sys
import yaml

parser = argparse.ArgumentParser(description="Choose samples for bin reassembly")

parser.add_argument("-r", "--reads", type=str, help="Directory with reads")
parser.add_argument("-o", "--output", type=str, help="Output path")
parser.add_argument("prof", type=str, help="File with bin profiles")
parser.add_argument("-f", "--filtered", type=str, help="File with filtered bins")
parser.add_argument("--min-sample", type=float, default=4, help="Minimal coverage in a single sample to be used")
parser.add_argument("--min-total", type=float, default=1, help="Minimal total coverage of the bin to be reassembled")

args = parser.parse_args()

BINS = set()
if args.filtered:
    with open(args.filtered) as input:
        for line in input:
            bin = line.split()[0]
            BINS.add(bin)

prof_dict = dict()

make_excluded = True
excluded_dir = os.path.join(args.output, "excluded")

input = open(args.prof)
for line in input:
    exclude = False
    samples = []
    params = line.split()
    bin = params[0]
    profile = list(map(float, params[1:]))
    print("Profile of", bin, ":", profile)
    if BINS and bin not in BINS:
        print(bin, "was excluded from the reassembly")
        exclude = True
    total = 0

    #Current strategy: choose all samples larger than soft threshold.
    #If there's not enough, use the lower hard threshold.
    #Sort samples by their abundancies
    weighted_profile = list((i, ab)
        for i, ab in enumerate(profile, start=1) if ab >= args.min_sample)
    weighted_profile.sort(key=itemgetter(1), reverse=True)

    for i, cov in weighted_profile:
        if not os.path.exists(os.path.join(args.reads, bin, "sample{}_1.fastq.gz".format(i))):
            print("WARNING: sample", i, "does not contain reads for", bin)
            continue
        total += cov
        samples.append(i)

    print("Chosen samples are", samples, "with total mean abundance", total)
    prof_dict[bin] = total

    if total < args.min_total:
        print(bin, "is too scarce; excluding from the reassembly")
        exclude = True

    config_dir = args.output
    if exclude:
        if make_excluded and not os.path.isdir(excluded_dir):
            os.mkdir(excluded_dir)
        make_excluded = False
        config_dir = excluded_dir
    config_path = os.path.join(config_dir, bin + ".info")
    print("Dumping config into", config_path)
    with open(config_path, "w") as out:
        print("total", sum(profile), file=out)
        for i, ab in enumerate(profile, start=1):
            line = "sample" + str(i)
            if i in samples:
                line = "+" + line
            print(line, ab, file=out)