File: compare_fasta.py

package info (click to toggle)
minia 3.2.6-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,024 kB
  • sloc: cpp: 1,377; sh: 395; python: 208; makefile: 11
file content (36 lines) | stat: -rw-r--r-- 1,041 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#very basic, one seq per line only, no pysam for max portability
#doesn't care if sequences are in revcomp or forward order
import sys

fasta1 = sys.argv[1]
fasta2 = sys.argv[2]

revcomp = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A'}[B] for B in x][::-1])
def normalize(seq):
    rev = revcomp(seq)
    return min(rev,seq)

def read_seqs(fasta):
    seqs = set()
    for  line in open(fasta):
        if line[0] == '>': continue
        seqs.add(normalize(line.strip()))
    return seqs

s1 = read_seqs(fasta1)
s2 = read_seqs(fasta2)

if s1 == s2:
    sys.exit(0)

if len(s1) == len(s2) and len(s1) == 1:
    # special case, e.g. for read50 and genome10k, let's see if one is included in the other
    seq1= s1.copy().pop()
    seq2= s2.copy().pop()
    if seq1 in seq2 or seq2 in seq1:
        print("one seq is perfectly included in the other")
    else:
        print("BAD: one sequence doesn't exactly match the other")

print(("NOT EQUAL: %d sequence(s) in %s not in %s" % (len(s1.difference(s2)), fasta1, fasta2)))
sys.exit(1)