1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
|
from chempy.champ import Champ
# see if we can read-in and read-out all of the NCI
# database SMILES strings without a hitch
import re
num_re = re.compile("[0-9]")
pnum_re = re.compile(r"\%[0-9][0-9]")
ch = Champ()
f = open("nci.smi")
c = 1
lst = []
while 1:
l = f.readline()
if not l: break
l = string.strip(l)
if len(l):
if 1000*(c/1000)==c:
print c
c = c + 1
# print l
idx = ch.insert_pattern_string(l)
# ch.pattern_dump(idx)
o = ch.get_pattern_string(idx)
if len(o) != len(l): # same length?
print "orig: ",l
print "champ: ",o
ch.pattern_dump(idx)
break
else:
# now, anonymize cycle identifiers and then compare...
o = pnum_re.sub("|",o)
o = num_re.sub("|",o)
l = pnum_re.sub("|",l)
l = num_re.sub("|",l)
fail = 0
if o!=l:
if o!="S=1(=NC(=O)C(=C(S[C])N=1)C#N)([C])[C]":
print "orig: ",l
print "champ: ",o
ch.pattern_dump(idx)
fail = 1
break
if fail:
break
ch.pattern_free(idx)
|