File: debug_1seq_tip_removal.py

package info (click to toggle)
minia 3.2.6-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,024 kB
  • sloc: cpp: 1,377; sh: 395; python: 208; makefile: 11
file content (32 lines) | stat: -rw-r--r-- 1,065 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# take an assembly and a reference genome.
# determines whether each unitig is either fully erroneous or fully genomic, but that'd be an assembly error if it was both
import sys
fasta = open(sys.argv[1])
genome = [
"CTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGC",
"GCCGAACGGCCCTGGTCACCGTCTGGCAGGTTAAAGGAAACGGTAGCTCGTTAGCCAGAGCGTGGTCCGCCCCACAATTCCCCGCGACAG"
]
id=""
k=31
for line in fasta:
    if line.startswith('>'):
        #print abundance
        id=line
    else:
        seq = line.strip()
        is_error = False
        is_genome = False
        for i in range(len(seq)-k+1):
            kmer = seq[i:i+k]
            if kmer not in genome[0] and kmer not in genome[1]:
                is_error = True
            else:
                is_genome = True

        if is_error and is_genome:
            print("mix of erroneous kmers and true genomic kmers inside the same assembled contig!!")
            print(id)
            print(seq)
            exit(1)
        else:
            print(id,is_error,is_genome)