File: quast_all.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (73 lines) | stat: -rw-r--r-- 3,005 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/python3

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################


# Usage: quast_all.py <input_folder> <output_folder>
import sys
import os
import getopt
import math

def DefaultZero(d, key):
    """Return d[key] if key is present in d, otherwise the string "0"."""
    return d.get(key, "0")
    
# Metric whose value is split on spaces to obtain two integer counts
# (tokens 0 and 2) -- presumably of the form "<full> + <partial> part";
# confirm against the tool that produces the reports.
_UNALIGNED_KEY = "# unaligned contigs"
# Synthetic summary row fed by the third token of the value above.
_PARTIALLY_KEY = "#partially unaligned"
# Columns that hold labels rather than numbers and are never summed.
_LABEL_KEYS = ("index", "Assembly")


def _parse_number(text, cast):
    """Return cast(text), or 0 when text is not a valid number."""
    try:
        return cast(text)
    except ValueError:
        # Non-numeric cells (e.g. "-" or "") contribute nothing, the same
        # way a metric missing from a report is treated as "0".
        return 0


def _load_reports(input_folder):
    """Read every "*.tsv" file in input_folder.

    Returns a pair (rows, names): rows holds one dict per report mapping
    metric name to its raw string value (plus "index", the file name
    without ".tsv"); names lists the metric names in first-seen order.
    """
    rows = []
    names = []
    # Sorted so that the output does not depend on directory listing order.
    stems = sorted(entry[:-4] for entry in os.listdir(input_folder)
                   if entry.endswith(".tsv"))
    for stem in stems:
        row = {"index": stem}
        report_path = os.path.join(input_folder, stem + ".tsv")
        if os.path.isfile(report_path):
            with open(report_path) as report:
                for line in report:
                    fields = line.rstrip("\n").split("\t")
                    if len(fields) < 2:
                        # Blank or tab-less line: nothing to record.
                        continue
                    key = fields[0]
                    row[key] = fields[1].strip()
                    if key not in names:
                        names.append(key)
        rows.append(row)
    return rows, names


def _write_table(rows, names, table_path):
    """Write a header line and one line per report; missing cells are "0"."""
    with open(table_path, "w") as table:
        table.write("\t".join(names) + "\n")
        for row in rows:
            table.write("\t".join(row.get(name, "0") for name in names) + "\n")


def _sum_metrics(rows, names):
    """Return {metric: total over all rows}, including the partial count."""
    totals = dict((name, 0) for name in names)
    totals.setdefault(_PARTIALLY_KEY, 0)
    for row in rows:
        for name in names:
            value = row.get(name, "0")
            if name == _UNALIGNED_KEY:
                tokens = value.split(" ")
                # Token 0 is counted even when the value is a bare number.
                totals[_UNALIGNED_KEY] += _parse_number(tokens[0], int)
                if len(tokens) >= 3:
                    totals[_PARTIALLY_KEY] += _parse_number(tokens[2], int)
            elif name not in _LABEL_KEYS:
                totals[name] += _parse_number(value, float)
    return totals


def main(argv=None):
    """Merge the per-run reports of a folder into two summary files.

    argv is [input_folder, output_folder]; when None, sys.argv[1:] is used.
    Writes <output_folder>/table.tsv (one row per report, one column per
    metric) and <output_folder>/results.tsv (per metric: name, integer
    total, mean over all reports).

    Returns 0 on success, 2 when the two folder arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.stderr.write("Usage: quast_all.py <input_folder> <output_folder>\n")
        return 2
    input_folder, output_folder = args[0], args[1]

    rows, names = _load_reports(input_folder)
    # The table only contains the columns found in the reports, so that the
    # header and every data row have the same number of fields.
    _write_table(rows, names, os.path.join(output_folder, "table.tsv"))

    totals = _sum_metrics(rows, names)
    summary_names = list(names)
    if _PARTIALLY_KEY not in summary_names:
        summary_names.append(_PARTIALLY_KEY)

    report_count = len(rows)
    with open(os.path.join(output_folder, "results.tsv"), "w") as results:
        for name in summary_names:
            total = totals[name]
            # No reports at all: report a mean of 0.0 instead of dividing by 0.
            mean = total / report_count if report_count else 0.0
            results.write(name + "\t" + str(int(total)) + "\t" + str(mean) + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())