File: etymology.py

package info (click to toggle)
phenny 2~hg28-3
  • links: PTS, VCS
  • area: main
  • in suites: buster, stretch
  • size: 448 kB
  • ctags: 190
  • sloc: python: 6,520; sh: 607; makefile: 15
file content (103 lines) | stat: -rwxr-xr-x 2,975 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
"""
etymology.py - Phenny Etymology Module
Copyright 2007, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

http://inamidst.com/phenny/
"""

import re
import web
from tools import deprecated

# URL templates for etymonline.com; the '%s' is filled in with the word.
# etyuri is the page etymology() fetches and quotes back in its result;
# etysearch is only handed to the user as a "try this" link by f_etymology()
# when no definition could be extracted.
etyuri = 'http://etymonline.com/?term=%s'
etysearch = 'http://etymonline.com/?search=%s'

# One complete <dd ...>...</dd> element. Flags: case-insensitive, multiline,
# and dot-matches-newline; the body is non-greedy so each element matches
# on its own.
r_definition = re.compile(r'(?ims)<dd[^>]*>.*?</dd>')
# Any '<...>' tag, except those whose first character after '<' is '!'.
r_tag = re.compile(r'<(?!!)[^>]+>')
# A run of one or more tabs, carriage returns, newlines or spaces.
r_whitespace = re.compile(r'[\t\r\n ]+')

# Abbreviations whose trailing full stop must NOT be taken as the end of a
# sentence. Matching is case-sensitive and purely suffix-based: a '.' is
# skipped whenever the text just before it ends with one of these strings.
abbrs = [
   'cf', 'lit', 'etc', 'Ger', 'Du', 'Skt', 'Rus', 'Eng', 'Amer.Eng', 'Sp',
   'Fr', 'N', 'E', 'S', 'W', 'L', 'Gen', 'J.C', 'dial', 'Gk',
   '19c', '18c', '17c', '16c', 'St', 'Capt', 'obs', 'Jan', 'Feb', 'Mar',
   'Apr', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'c', 'tr'
]
# Template for "the first sentence": the shortest prefix that does not end
# in an abbreviation and is followed either by a '.' that precedes a space
# plus capital letter/digit (or the end of the text), or by the end of the
# text itself. The single '%s' sits inside a negative lookbehind.
t_sentence = r'^.*?(?<!%s)(?:\.(?= [A-Z0-9]|\Z)|\Z)'
# Joining with ')(?<!' turns every abbreviation into its own lookbehind.
# Each one is escaped first so that the '.' in entries such as 'J.C' is a
# literal dot rather than a match-anything wildcard.
r_sentence = re.compile(t_sentence % ')(?<!'.join(map(re.escape, abbrs)))

def unescape(s):
   """Decode the entities &gt;, &lt; and &amp; in s and return the result.

   &amp; is replaced last, so input such as '&amp;lt;' yields the literal
   text '&lt;' instead of being decoded a second time.
   """
   for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
      s = s.replace(entity, char)
   return s

def text(html):
   """Reduce an HTML fragment to plain text.

   Tags matched by r_tag are removed, every whitespace run matched by
   r_whitespace becomes a single space, the entities handled by unescape()
   are decoded, and the result is stripped at both ends.
   """
   without_tags = r_tag.sub('', html)
   collapsed = r_whitespace.sub(' ', without_tags)
   plain = unescape(collapsed)
   return plain.strip()

def etymology(word):
   """Return the first sentence of the etymonline.com entry for word.

   The page at etyuri % word is fetched with web.get(), the first <dd>
   element is reduced to plain text, and its first sentence (as defined by
   r_sentence) is extracted, shortened if necessary, and quoted.

   Returns a string of the form '"<sentence>" - <url>', or None when the
   page has no <dd> element or no sentence could be matched.

   Raises ValueError if word is longer than 25 characters. Errors raised
   by web.get() are not handled here and propagate to the caller.
   """
   # @@ <nsh> sbp, would it be possible to have a flag for .ety to get 2nd/etc
   # entries? - http://swhack.com/logs/2006-07-19#T15-05-29

   if len(word) > 25:
      raise ValueError("Word too long: %s[...]" % word[:10])
   # Special-case lookup term; presumably the site lists "axe" under the
   # combined headword "ax/axe" - TODO confirm.
   word = {'axe': 'ax/axe'}.get(word, word)

   page = web.get(etyuri % word)
   definitions = r_definition.findall(page)
   if not definitions:
      return None

   m = r_sentence.match(text(definitions[0]))
   if not m:
      return None
   sentence = m.group(0)

   # Best-effort recode from ISO-8859-1 to UTF-8. unicode() raises TypeError
   # when the value is already a unicode object; in that case, or on any
   # codec error, the sentence is deliberately left as it is. Only those
   # errors are ignored, so KeyboardInterrupt/SystemExit are not swallowed.
   try:
      sentence = unicode(sentence, 'iso-8859-1')
      sentence = sentence.encode('utf-8')
   except (TypeError, UnicodeError):
      pass

   maxlength = 275
   if len(sentence) > maxlength:
      # Cut to maxlength, give up five more characters, then drop the last
      # (possibly partial) word so the ' [...]' marker follows a whole word.
      sentence = sentence[:maxlength]
      words = sentence[:-5].split(' ')
      words.pop()
      sentence = ' '.join(words) + ' [...]'

   # Inner double quotes become single quotes so the outer pair stays
   # unambiguous.
   sentence = '"' + sentence.replace('"', "'") + '"'
   return sentence + ' - ' + (etyuri % word)

@deprecated
def f_etymology(self, origin, match, args):
   """Look up the word in match.group(2) and send the outcome via self.msg.

   Three outcomes are reported to origin.sender: the etymology itself, a
   "can't connect" notice when etymology() raises IOError, or a "can't
   find" notice with a search link when etymology() returns None. The one
   exception is nick 'nsh' in '#esp', who receives the etymology privately
   plus a short pointer in the channel. The args parameter is not used.
   """
   word = match.group(2)

   try:
      result = etymology(word.encode('utf-8'))
   except IOError:
      failure = "Can't connect to etymonline.com (%s)" % (etyuri % word)
      self.msg(origin.sender, failure)
      return

   # Nothing could be extracted: point the user at the site's search page.
   if result is None:
      uri = etysearch % word
      apology = 'Can\'t find the etymology for "%s". Try %s' % (word, uri)
      self.msg(origin.sender, apology)
      return

   # Hard-coded special case: this one user in this one channel gets the
   # answer by private message.
   if (origin.sender == '#esp') and (origin.nick == 'nsh'):
      self.msg(origin.nick, result)
      note = 'nsh: see privmsg (yes, this only happens for you)'
      self.msg(origin.sender, note)
      return

   self.msg(origin.sender, result)
# @@ Cf. http://swhack.com/logs/2006-01-04#T01-50-22
# Metadata attached to the handler; it is consumed by code outside this file.
# rule pairs the command name 'ety' with a pattern accepting one or more
# letters, digits, apostrophes, spaces, dots or hyphens. NOTE(review): the
# handler reads match.group(2), so the framework presumably puts the command
# name in group 1 - confirm against the dispatcher.
f_etymology.rule = (['ety'], r"([A-Za-z0-9' .-]+)")
# presumably asks the framework to run the handler in its own thread - verify
f_etymology.thread = True
# presumably a dispatch-order hint for the framework - verify
f_etymology.priority = 'high'

if __name__ == "__main__":
   # Command-line use: look up the first argument and print the result
   # (prints None when nothing was found).
   import sys
   query = sys.argv[1]
   print(etymology(query))