File: get_misassembled_contigs.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (60 lines) | stat: -rwxr-xr-x 1,874 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python3 -O

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################


import sys
import os
import re

if len(sys.argv) < 2:
    print("Misassembled contigs getter: prints IDs of misassembled contigs or save these contigs in fasta (if input contigs file and output file specified)")
    print("Usage: " + sys.argv[0] + " <plantagota output> [input_contigs.fasta misassembled_contigs.fasta]")        
    sys.exit()

in_file = open(sys.argv[1], "r")

mis_contigs_ids = []

#skipping prologue
for line in in_file:
    if line.startswith("Analyzing contigs..."):
        break

# main part of plantagora output
cur_contig_id = ""
for line in in_file:
    if line.startswith("	CONTIG:"):
        cur_contig_id = line.split("	CONTIG:")[1].strip()
    if (line.find("Extensive misassembly") != -1) and (cur_contig_id != ""):
        mis_contigs_ids.append(cur_contig_id.split()[0])
        cur_contig_id = ""
    if line.startswith("Analyzing coverage..."):
        break
            
# printing IDs of misassembled contigs
print("Misassembled contigs:")
for contig_id in mis_contigs_ids:
    print(contig_id) 

in_file.close()

if (len(sys.argv) == 4):
    import fastaparser
    input_contigs = fastaparser.read_fasta(sys.argv[2])
    mis_contigs = open(sys.argv[3], "w")    

    for (name, seq) in input_contigs:
        corr_name = re.sub(r'\W', '', re.sub(r'\s', '_', name))

        if mis_contigs_ids.count(corr_name) != 0:
            mis_contigs.write(name + '\n')
            for i in xrange(0, len(seq), 60):
                mis_contigs.write(seq[i:i+60] + '\n')

    mis_contigs.close()