1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#!/usr/bin/env python
"""
Compare two gfa files, reporting differences but ignoring some trivial ones
---------------------------------------------------------------------------
:Author: Bob Harris (rsharris@bx.psu.edu)
"""
import sys,re
def usage(s=None):
    """Exit the program, showing the command-line synopsis.

    s is an optional error message;  when given it is shown ahead of the
    synopsis.  The text is passed to sys.exit, so this function never
    returns.
    """
    message = """
gfa_compare [--sort] gfa_file1 gfa_file2
"""

    # None is a singleton, so test identity rather than equality
    if (s is None): sys.exit (message)
    else:           sys.exit ("%s\n%s" % (s,message))
def main():
    """Parse the command line, compare the two gfa files, and report.

    Usage is "gfa_compare [--sort] gfa_file1 gfa_file2".  With --sort the
    files are compared by compare_unsorted_files (which sorts the lines of
    each file first);  otherwise they are compared in file order by
    compare_sorted_files.

    A SUCCESS or FAILURE message is written to stderr.  Exits with status 1
    if the files differ, and exits through usage() on a bad command line.
    """

    # parse the command line

    sortEm = False
    # check the length first, so that running with no arguments reaches the
    # usage message below instead of raising IndexError
    if (len(sys.argv) > 1) and (sys.argv[1] == "--sort"):
        sortEm = True
        del sys.argv[1]

    if (len(sys.argv) < 3):
        usage("you must specify two gfa files")
    elif (len(sys.argv) > 3):
        usage("wrong number of arguments")

    gfa1Filename = sys.argv[1]
    gfa2Filename = sys.argv[2]

    # compare the files;  open() replaces the python2-only file() builtin,
    # and the with statements make sure both files get closed

    with open(gfa1Filename,"rt") as gfa1:
        with open(gfa2Filename,"rt") as gfa2:
            if (sortEm): (different,lineNum) = compare_unsorted_files(gfa1,gfa2)
            else:        (different,lineNum) = compare_sorted_files  (gfa1,gfa2)

    # report the result;  sys.stderr.write is used rather than
    # "print >>sys.stderr" so the same code runs under python 2 and python 3

    if (different):
        sys.stderr.write("FAILURE: %s and %s are different (line %d)\n"
                         % (gfa1Filename,gfa2Filename,lineNum))
        sys.exit(1)

    sys.stderr.write("SUCCESS: %s and %s are equivalent\n"
                     % (gfa1Filename,gfa2Filename))
# compare files that we expect are in the same order

def compare_sorted_files(gfa1,gfa2):
    """Compare two open gfa files line by line, in file order.

    Trailing whitespace is ignored.  Lines whose first field is "d" are
    only required to pair up with another "d" line (their contents, the
    command line, may differ).  Lines whose first field is "h" are compared
    after being normalized by header_strip.

    Returns (different,lineNum).  When the files differ, different is True
    and lineNum is the 1-based line at which they first differ.  When they
    are equivalent, different is False and lineNum is one more than the
    number of lines read.
    """
    lineNum = 0
    while (True):
        lineNum += 1
        line1 = gfa1.readline()
        line2 = gfa2.readline()
        if (line1 == "") and (line2 == ""):
            return (False,lineNum)
        if (line1 == "") or (line2 == ""):
            # one file ended before the other;  that is a difference (this
            # case used to fall through and raise IndexError below)
            return (True,lineNum)

        line1 = line1.rstrip()
        line2 = line2.rstrip()

        # a blank line has no fields;  treat its stanza as "" rather than
        # indexing into an empty list
        stanza  = (line1.split() or [""])[0]
        stanza2 = (line2.split() or [""])[0]
        if (stanza2 != stanza):
            return (True,lineNum)

        if (stanza == "d"):
            continue  # ignore command line differences
        elif (stanza == "h"):
            line1 = " ".join(header_strip(line1))
            line2 = " ".join(header_strip(line2))

        if (line1 != line2):
            return (True,lineNum)
# compare files that we suspect might not be in the same order

def compare_unsorted_files(gfa1,gfa2):
    """Compare two open gfa files after sorting the lines of each.

    Both files are read completely (by read_lines), their lines are sorted,
    and the sorted lists are compared pairwise.  Lines whose first field is
    "d" are only required to pair up with another "d" line (their contents,
    the command line, may differ).  Lines whose first field is "h" are
    compared after being normalized by header_strip.

    Returns (different,compareNum).  compareNum is a 1-based position in
    the sorted lists, not a line number in either input file.  When the
    files are equivalent it is one more than the number of lines compared.
    """
    lines1 = read_lines(gfa1)
    lines1.sort()
    lines2 = read_lines(gfa2)
    lines2.sort()

    compareNum = 0
    while (True):
        # detect the end of each list by position, not by an empty-string
        # sentinel;  blank lines are also "" and sort to the front, which
        # used to end the comparison before anything had been compared
        exhausted1 = (compareNum >= len(lines1))
        exhausted2 = (compareNum >= len(lines2))
        ix = compareNum
        compareNum += 1

        if (exhausted1) and (exhausted2):
            return (False,compareNum)
        if (exhausted1) or (exhausted2):
            # the files have different numbers of lines
            return (True,compareNum)

        line1 = lines1[ix][0]
        line2 = lines2[ix][0]

        # a blank line has no fields;  treat its stanza as "" rather than
        # indexing into an empty list
        stanza  = (line1.split() or [""])[0]
        stanza2 = (line2.split() or [""])[0]
        if (stanza2 != stanza):
            return (True,compareNum)

        if (stanza == "d"):
            continue  # ignore command line differences
        elif (stanza == "h"):
            line1 = " ".join(header_strip(line1))
            line2 = " ".join(header_strip(line2))

        if (line1 != line2):
            return (True,compareNum)
# read all lines from a file, and return a list of (line,lineNumber)

def read_lines(gfa):
    """Read every remaining line of gfa;  return a list of (line,lineNumber).

    gfa is an open file (anything with a readline method that returns ""
    at end of file).  Each line has its trailing whitespace, including the
    newline, removed.  lineNumber counts from 1 in the order the lines
    were read.
    """
    # iter(readline,"") keeps calling readline until it returns the empty
    # string, which is how readline signals end of file
    return [(line.rstrip(),lineNum)
            for (lineNum,line) in enumerate(iter(gfa.readline,""),1)]
headerRe = re.compile("^(?P<stanza>.+) +\"(?P<name1>.+)\" +\"(?P<name2>.+)\"$")
def header_strip(s):
m = headerRe.match(s)
if (m == None):
return s
stanza = m.group("stanza")
name1 = m.group("name1").strip()
name2 = m.group("name2").strip()
if (name1.startswith(">")): name1 = name1[1:].strip()
if (name2.startswith(">")): name2 = name2[1:].strip()
return [stanza,name1,name2]
# run the comparison only when this file is executed as a script
if __name__ == "__main__":
    main()
|