File: __init__.py

#### PATTERN | VECTOR | WORDLIST ###################################################################
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern

####################################################################################################

import os

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # __file__ is undefined in some environments (e.g., interactive interpreter, frozen builds).
    MODULE = ""

class Wordlist:

    def __init__(self, name, data=None):
        """ Lazy read-only list of words, read from <name>.txt on first access.
        """
        self._name = name
        self._data = data if data is not None else []  # avoid a shared mutable default list

    def _load(self):
        # Read the comma-separated wordlist file once, the first time the list is accessed.
        if not self._data:
            with open(os.path.join(MODULE, self._name + ".txt")) as f:
                self._data = f.read().split(", ")

    def __repr__(self):
        self._load(); return repr(self._data)
    def __iter__(self):
        self._load(); return iter(self._data)
    def __len__(self):
        self._load(); return len(self._data)
    def __contains__(self, w):
        self._load(); return w in self._data
    def __add__(self, iterable):
        self._load(); return Wordlist(None, data=sorted(self._data + list(iterable)))
    def __getitem__(self, i):
        self._load(); return self._data[i]
    def __setitem__(self, i, v):
        self._load(); self._data[i] = v
    def insert(self, i, v):
        self._load(); self._data.insert(i, v)
    def append(self, v):
        self._load(); self._data.append(v)
    def extend(self, v):
        self._load(); self._data.extend(v)

ACADEMIC  = Wordlist("academic")  # English academic words.
BASIC     = Wordlist("basic")     # The 850 words of Basic English, said to cover ~90% of everyday concepts.
PROFANITY = Wordlist("profanity") # English swear words.
TIME      = Wordlist("time")      # English time and date words.
STOPWORDS = Wordlist("stopwords") # English stop words ("a", "the", ...).
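
# Usage sketch: importing this module is cheap, because each .txt file is only read
# the first time its wordlist is accessed. Membership tests, iteration, indexing and
# len() all trigger _load() transparently (example values are illustrative):
# >>> "the" in STOPWORDS      # reads stopwords.txt, then tests membership => True
# >>> len(BASIC)              # ~850 entries
# >>> ACADEMIC[:10]           # first ten academic words in file order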

# Note: if used for lookups, performance can be increased by using a dict:
# blacklist = dict.fromkeys(PROFANITY+TIME, True)
# for i in range(1000):
#    corpus.append(Document(src[i], exclude=blacklist))
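
# A minimal filtering sketch along the same lines, using plain Python only
# (the token list is a made-up example; Document/corpus above come from pattern.vector):
# blacklist = set(PROFANITY + TIME)   # Wordlist.__add__ merges both lists; a set gives O(1) lookups.
# tokens    = "see you tomorrow night".split()
# clean     = [w for w in tokens if w not in blacklist]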