File: 8e239e278103c7ce0c1bcba1fc9939533b865241.patch

package info (click to toggle)
python-pattern 2.6%2Bgit20180818-4
links: PTS
area: main
in suites: bookworm
size: 95,148 kB
sloc: python: 28,136; xml: 15,085; javascript: 5,810; makefile: 194
file content (201 lines) | stat: -rw-r--r-- 7,912 bytes
parent folder | download | duplicates (2)
From 8e239e278103c7ce0c1bcba1fc9939533b865241 Mon Sep 17 00:00:00 2001
From: James Powell <james@dutc.io>
Date: Sun, 3 Jul 2022 02:42:33 -0400
Subject: [PATCH] ensure unit tests succeed; small clean-up (incl. context
 managers to reduce test noise)

---
 pattern/text/__init__.py             | 38 +++++++++++++++-----------
 pattern/text/en/wordlist/__init__.py |  3 +-
 pattern/text/en/wordnet/__init__.py  |  3 +-
 pattern/text/ru/wordlist/__init__.py |  3 +-
 pattern/text/search.py               | 41 ++++++----------------------
 5 files changed, 36 insertions(+), 52 deletions(-)

diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py
index aa77af0b..aaeda064 100644
--- a/pattern/text/__init__.py
+++ b/pattern/text/__init__.py
@@ -589,24 +589,28 @@ def _read(path, encoding="utf-8", comment=";;;"):
     """ Returns an iterator over the lines in the file at the given path,
         strippping comments and decoding each line to Unicode.
     """
+    def process(lines):
+        for i, line in enumerate(lines):
+            line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line
+            line = line.strip()
+            line = decode_utf8(line, encoding)
+            if not line or (comment and line.startswith(comment)):
+                continue
+            yield line
     if path:
         if isinstance(path, str) and os.path.exists(path):
             # From file path.
-            f = open(path, "r", encoding="utf-8")
+            with open(path, "r", encoding="utf-8") as f:
+                yield from process(f)
         elif isinstance(path, str):
             # From string.
             f = path.splitlines()
+            yield from process(f)
         else:
             # From file or buffer.
             f = path
-        for i, line in enumerate(f):
-            line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line
-            line = line.strip()
-            line = decode_utf8(line, encoding)
-            if not line or (comment and line.startswith(comment)):
-                continue
-            yield line
-    raise StopIteration
+            yield from process(f)
+    # No trailing "raise StopIteration": inside a generator it becomes RuntimeError (PEP 479).
 
 
 class Lexicon(lazydict):
@@ -1851,7 +1855,11 @@ def commandline(parse=Parser().parse):
         print(__version__)
         sys.path.pop(0)
     # Either a text file (-f) or a text string (-s) must be supplied.
-    s = o.file and codecs.open(o.file, "r", o.encoding).read() or o.string
+    if o.file:
+        with codecs.open(o.file, "r", o.encoding) as f:
+            s = f.read()
+    else:
+        s = o.string
     # The given text can be parsed in two modes:
     # - implicit: parse everything (tokenize, tag/chunk, find relations, lemmatize).
     # - explicit: define what to parse manually.
@@ -2594,9 +2602,8 @@ def save(self, path):
                              "label=\"%s\""   % self.labeler.get(w, "")
                     ))
         a.append("</sentiment>")
-        f = open(path, "w", encoding="utf-8")
-        f.write(BOM_UTF8 + encode_utf8("\n".join(a)))
-        f.close()
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(BOM_UTF8 + encode_utf8("\n".join(a)))
 
 #### SPELLING CORRECTION ###########################################################################
 # Based on: Peter Norvig, "How to Write a Spelling Corrector", http://norvig.com/spell-correct.html
@@ -2640,9 +2647,8 @@ def train(self, s, path="spelling.txt"):
             model[w] = w in model and model[w] + 1 or 1
         model = ("%s %s" % (k, v) for k, v in sorted(model.items()))
         model = "\n".join(model)
-        f = open(path, "w", encoding="utf-8")
-        f.write(model)
-        f.close()
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(model)
 
     def _edit1(self, w):
         """ Returns a set of words with edit distance 1 from the given word.
diff --git a/pattern/text/en/wordlist/__init__.py b/pattern/text/en/wordlist/__init__.py
index 047a805f..8f859b34 100644
--- a/pattern/text/en/wordlist/__init__.py
+++ b/pattern/text/en/wordlist/__init__.py
@@ -33,7 +33,8 @@ def __init__(self, name, data=[]):
 
     def _load(self):
         if not self._data:
-            self._data = open(os.path.join(MODULE, self._name + ".txt")).read().split(", ")
+            with open(os.path.join(MODULE, self._name + ".txt")) as f:
+                self._data = f.read().split(", ")
 
     def __repr__(self):
         self._load()
diff --git a/pattern/text/en/wordnet/__init__.py b/pattern/text/en/wordnet/__init__.py
index 022285fd..5345d32d 100644
--- a/pattern/text/en/wordnet/__init__.py
+++ b/pattern/text/en/wordnet/__init__.py
@@ -411,7 +411,8 @@ def map32(id, pos=NOUN):
     """
     global _map32_cache
     if not _map32_cache:
-        _map32_cache = open(os.path.join(MODULE, "dict", "index.32"), encoding="latin-1").readlines()
+        with open(os.path.join(MODULE, "dict", "index.32"), encoding="latin-1") as f:
+            _map32_cache = f.readlines()
         _map32_cache = (x for x in _map32_cache if x[0] != ";") # comments
         _map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
     k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
diff --git a/pattern/text/ru/wordlist/__init__.py b/pattern/text/ru/wordlist/__init__.py
index 01e5b6dc..a5cc4d42 100644
--- a/pattern/text/ru/wordlist/__init__.py
+++ b/pattern/text/ru/wordlist/__init__.py
@@ -33,7 +33,8 @@ def __init__(self, name, data=[]):
 
     def _load(self):
         if not self._data:
-            self._data = open(os.path.join(MODULE, self._name + ".txt")).read().split("\n")
+            with open(os.path.join(MODULE, self._name + ".txt")) as f:
+                self._data = f.read().split("\n")
 
     def __repr__(self):
         self._load()
diff --git a/pattern/text/search.py b/pattern/text/search.py
index 6efd5199..e7291e0e 100644
--- a/pattern/text/search.py
+++ b/pattern/text/search.py
@@ -19,8 +19,7 @@
 
 import re
 import itertools
-
-from functools import cmp_to_key
+from itertools import product, compress, combinations
 
 #--- TEXT, SENTENCE AND WORD -----------------------------------------------------------------------
 # The search() and match() functions work on Text, Sentence and Word objects (see pattern.text.tree),
@@ -150,35 +149,6 @@ def find(function, iterable):
             return x
 
 
-def combinations(iterable, n):
-    # Backwards compatibility.
-    return product(iterable, repeat=n)
-
-
-def product(*args, **kwargs):
-    """ Yields all permutations with replacement:
-        list(product("cat", repeat=2)) => 
-        [("c", "c"), 
-         ("c", "a"), 
-         ("c", "t"), 
-         ("a", "c"), 
-         ("a", "a"), 
-         ("a", "t"), 
-         ("t", "c"), 
-         ("t", "a"), 
-         ("t", "t")]
-    """
-    p = [[]]
-    for iterable in map(tuple, args) * kwargs.get("repeat", 1):
-        p = [x + [y] for x in p for y in iterable]
-    for p in p:
-        yield tuple(p)
-
-try:
-    from itertools import product
-except:
-    pass
-
 
 def variations(iterable, optional=lambda x: False):
     """ Returns all possible variations of a sequence with optional items.
@@ -200,8 +170,13 @@ def variations(iterable, optional=lambda x: False):
         v = tuple(iterable[i] for i in range(len(v)) if not v[i])
         a.add(v)
     # Longest-first.
-    f = lambda x, y: len(y) - len(x)
-    return sorted(a, key=cmp_to_key(f))
+    return sorted(a, key=len, reverse=True)
+
+def variations(iterable, optional=lambda x: False):
+    """ Returns all unique variations of a sequence with optional items, longest-first. """
+    iterable = tuple(iterable)  # Materialize: a one-shot iterator is traversed more than once.
+    masks = product(*([False, True] if optional(x) else [True] for x in iterable))
+    return sorted(dict.fromkeys((*compress(iterable, m),) for m in masks), key=len, reverse=True)
 
 #### TAXONOMY ######################################################################################