# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""
Given a trie, find all occurrences of a word in the trie in a string.

Like searching a string for a substring, except that the substring is
any word in a trie.

Functions:
match         Find longest key in a trie matching the beginning of the string.
match_all     Find all keys in a trie matching the beginning of the string.
find          Find keys in a trie matching anywhere in a string.
find_words    Find keys in a trie matching whole words in a string.

"""

import string
import re


def match(string, trie):
    """match(string, trie) -> longest key or None

    Find the longest key in the trie that matches the beginning of the
    string.

    """
    longest = None
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            longest = substr
    return longest


def match_all(string, trie):
    """match_all(string, trie) -> list of keys

    Find all the keys in the trie that matches the beginning of the
    string.

    """
    matches = []
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            matches.append(substr)
    return matches


def find(string, trie):
    """find(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match anywhere in the string.

    """
    results = []
    start = 0     # index to start the search
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            results.append((key, start, start + len(key)))
        start += 1
    return results

DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace


def find_words(string, trie):
    """find_words(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match full words in the string.
    Word boundaries are defined as any punctuation or whitespace.

    """
    _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))

    results = []
    start = 0     # index of word boundary
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            l = len(key)
            # Make sure it ends at a boundary.
            if start + l == len(string) or \
               _boundary_re.match(string[start + l]):
                results.append((key, start, start + l))
        # Move forward to the next boundary.
        m = _boundary_re.search(string, start)
        if m is None:
            break
        start = m.end()
    return results
