File: 06-optional.py

package info (click to toggle)
python-pattern 2.6%2Bgit20180818-2
  • links: PTS
  • area: main
  • in suites:
  • size: 93,888 kB
  • sloc: python: 28,119; xml: 15,085; makefile: 194
file content (46 lines) | stat: -rw-r--r-- 1,782 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.search import search
from pattern.en import parsetree

# Constraints ending in "?" are optional, matching one or no word.
# Pattern.search() uses a "greedy" approach:
# it will attempt to include as many optional constraints as possible.

# The following pattern scans for words whose part-of-speech tag is NN (i.e. nouns).
# A preceding adjective, adverb or determiner is picked up as well.
# Example sentences; the trailing comments are the expected part-of-speech tags.
sentences = (
    "the cat",              # DT NN
    "the very black cat",   # DT RB JJ NN
    "tasty cat food",       # JJ NN NN
    "the funny black cat",  # JJ NN
    "very funny",           # RB JJ => no match, since there is no noun.
    "my cat is black and your cat is white",  # NN + NN
)

# Optional determiner, adverb and adjective, followed by one or more nouns.
query = "DT? RB? JJ? NN+"

for sentence in sentences:
    tree = parsetree(sentence)
    matches = search(query, tree)
    print("")
    print(tree)
    print(matches)
    if not matches:
        # Nothing matched this sentence, so there are no words to report.
        continue
    # Only the first match is broken down word by word.
    first = matches[0]
    for word in first.words:
        print("%s matches %s" % (word, first.constraint(word)))

# Before Pattern 2.4, "( )" was used instead of "?".
# For example: "(JJ)" instead of "JJ?".
# The syntax was changed to resemble regular expressions, which use "?".
# The old syntax "(JJ)" still works in Pattern 2.4, but it may change later.

# Note: the above pattern could also be written as "DT|RB|JJ?+ NN+"
# to include multiple adverbs/adjectives.
# By combining "*", "?" and "+", patterns can become quite complex.
# Optional constraints are useful for very specific patterns, but slow.
# Also, depending on which parser you use (e.g. MBSP), words can be tagged differently
# and may not match in the way you expect.
# Consider using a simple, robust "NP" search pattern.