from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern by using uppercase:
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
print(t)
print(m)
print("")

# Another example:
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print(taxonomy.parents("chicken"))
print(taxonomy.children("animal", recursive=True))
print(search("FOOD", "I'm eating chicken."))
print("")

# The advantage is that the taxonomy can hold an entire hierarchy.
# For example, "flower" could be classified as "organism".
# Other organisms could be defined as well (insects, trees, mammals, ...)
# The ORGANISM constraint then matches everything that is an organism.

# A taxonomy entry can also be a proper name containing spaces
# (e.g. "windows vista", case insensitive).
# It will be detected as long as it is contained in a single chunk:
taxonomy.append("windows vista", type="operating system")
taxonomy.append("ubuntu", type="operating system")

t = parsetree("Which do you like more, Windows Vista, or Ubuntu?")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
print(m[0].constituents())
print("")

# Taxonomy entries cannot have wildcards (*),
# but you can use a classifier to simulate this.
# Classifiers are quite slow but useful in many ways.
# For example, a classifier could be written to dynamically
# retrieve word categories from WordNet.


def find_parents(word):
    if word.startswith(("mac os", "windows", "ubuntu")):
        return ["operating system"]
c = Classifier(parents=find_parents)
taxonomy.classifiers.append(c)

t = parsetree("I like Mac OS X 10.5 better than Windows XP or Ubuntu.")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
print(m[0].constituents())
print(m[1].constituents())
print("")
