1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
|
#very basic, one seq per line only, no pysam for max portability
#doesn't care if sequences are in revcomp or forward order
import sys
fasta1 = sys.argv[1]
fasta2 = sys.argv[2]
revcomp = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A'}[B] for B in x][::-1])
def normalize(seq):
rev = revcomp(seq)
return min(rev,seq)
def read_seqs(fasta):
seqs = set()
for line in open(fasta):
if line[0] == '>': continue
seqs.add(normalize(line.strip()))
return seqs
s1 = read_seqs(fasta1)
s2 = read_seqs(fasta2)
if s1 == s2:
sys.exit(0)
if len(s1) == len(s2) and len(s1) == 1:
# special case, e.g. for read50 and genome10k, let's see if one is included in the other
seq1= s1.copy().pop()
seq2= s2.copy().pop()
if seq1 in seq2 or seq2 in seq1:
print("one seq is perfectly included in the other")
else:
print("BAD: one sequence doesn't exactly match the other")
print(("NOT EQUAL: %d sequence(s) in %s not in %s" % (len(s1.difference(s2)), fasta1, fasta2)))
sys.exit(1)
|