1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
|
import urllib2
import urllib
from sgmllib import SGMLParser
#"http://online-old.ectaco.com/online/diction.php3?lang=7&q=2&refid=316&pagelang=23&word=test&direction=1"
# Fetches the HTML result page for the given word.
# NOTE: from_lang/to_lang are accepted but currently ignored; the query parameters are hard-coded.
def fetchData(word, from_lang, to_lang):
    """Download the ECTACO online dictionary result page for ``word``.

    Parameters:
        word      -- the word to look up; sent as the ``word`` query parameter.
        from_lang -- accepted for interface compatibility but not used: the
                     language/direction query parameters are hard-coded below.
        to_lang   -- accepted for interface compatibility but not used.

    Returns the response body decoded from cp1250 as a unicode string.

    Any exception raised by ``urllib2.urlopen`` or by the decode step
    propagates to the caller unchanged.
    """
    url = "http://online-old.ectaco.com/online/diction.php3"
    # NOTE(review): the sample URL in the comment above this function uses
    # refid=316 while 31 is sent here -- confirm which value is intended.
    params = [
        ("lang", 7),
        ("q", 2),
        ("refid", 31),
        ("pagelang", 23),
        ("word", word),
        ("direction", 1),
    ]
    request_url = url + "?" + urllib.urlencode(params)
    results = urllib2.urlopen(request_url)
    try:
        raw = results.read()
    finally:
        # Release the connection even when read() fails; the response is
        # closed explicitly rather than relying on garbage collection.
        results.close()
    return raw.decode("cp1250")
#parses data and returns the parser object (that contains the translations/langpairs found)
def parseData(data):
    """Feed ``data`` to a fresh ``myParser`` and return the finished parser.

    The returned object carries whatever the parser collected while
    processing the document (e.g. its ``words`` list).
    """
    parser = myParser()
    parser.feed(data)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser
# called by Parley to translate the word
def fetchTranslation(word, from_lang, to_lang):
    """Return the list of translations found for ``word``.

    Fetches the result page via ``fetchData``, parses it with ``parseData``
    and returns the parser's ``words`` list.
    """
    html = fetchData(word, from_lang, to_lang)
    return parseData(html).words
# Called by Parley to retrieve the language pairs provided by this script.
# Should return e.g. [("en","fr"),("en","de")] for translation from English to French and English to German.
# (Currently disabled: the implementation below is commented out.)
#def getLanguagePairs():
#data = fetchData("ignorethis","en","fr")
#parser = parseData(data)
#return map(split_langpair,parser.langpairs)
# function to split a language pair string into a tuple
#def split_langpair(s):
#[f,t] = s.split("|",1)
#return (f,t)
# ------------ HTML Parser ----------- #
class myParser(SGMLParser):
#for every start_tagname function you add you have to make sure the tag is added to the self.tags_stack
def reset(self):
SGMLParser.reset(self)
self.words = [] #translated words found in html
self.langpairs = [] #language pairs found in html file
self.tags_stack = []
self.next_table_contains_translations = False
self.inside_translation_table = False
self.td_count = 0
def unknown_starttag(self,tag,attrs):
self.tags_stack.append(tag)
#print "unknown : ", tag, " ", len(self.tags_stack)
def start_table(self,attrs):
if self.next_table_contains_translations == True:
#print "next is translation table"
self.inside_translation_table = True
self.next_table_contains_translations = False
self.td_count = 0
self.tags_stack.append("table")
def start_td(self,attrs):
if self.inside_translation_table == True:
#print "inside translation table (tdcount = ", self.td_count, ")"
if self.td_count % 2 == 1:
self.tags_stack.append("<!translation!>")
else:
self.tags_stack.append("td")
self.td_count = self.td_count + 1
else:
self.tags_stack.append("td")
def end_table(self):
if self.inside_translation_table == True:
self.inside_translation_table = False
def handle_data(self,data):
if len(self.tags_stack) > 0 and self.tags_stack[len(self.tags_stack)-1] == "b":
if data.startswith("Results:"):
#print "-", data, "-","hi"
self.next_table_contains_translations = True
if len(self.tags_stack) > 0 and self.tags_stack[len(self.tags_stack)-1] == "<!translation!>":
#print "data: ", data
Words = data.split(",")
for w in Words:
#print w.strip()
self.words.append(w.strip())
def unknown_endtag(self,tag):
myParser.remove_not_closed_tags(self,tag)
if len(self.tags_stack) > 0 and self.tags_stack[len(self.tags_stack)-1] == tag:
#print "end_tag : ", tag, " ", len(self.tags_stack)
self.tags_stack.pop()
#removes all the tags from the stack that have no closed tags (don't modify)
def remove_not_closed_tags(self,tag):
while len(self.tags_stack) > 0 and self.tags_stack[len(self.tags_stack)-1] != tag:
self.tags_stack.pop()
|