File: triefind.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 46,860 kB
  • ctags: 13,237
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (102 lines) | stat: -rw-r--r-- 2,882 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""
Given a trie, find all occurrences of a word in the trie in a string.

Like searching a string for a substring, except that the substring is
any word in a trie.

Functions:
match         Find longest key in a trie matching the beginning of the string.
match_all     Find all keys in a trie matching the beginning of the string.
find          Find keys in a trie matching anywhere in a string.
find_words    Find keys in a trie matching whole words in a string.

"""

import string
import re


def match(string, trie):
    """match(string, trie) -> longest key or None

    Find the longest key in the trie that matches the beginning of the
    string.

    """
    longest = None
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            longest = substr
    return longest


def match_all(string, trie):
    """match_all(string, trie) -> list of keys

    Find all the keys in the trie that matches the beginning of the
    string.

    """
    matches = []
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            matches.append(substr)
    return matches


def find(string, trie):
    """find(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match anywhere in the string.

    """
    results = []
    start = 0     # index to start the search
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            results.append((key, start, start + len(key)))
        start += 1
    return results

DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace


def find_words(string, trie):
    """find_words(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match full words in the string.
    Word boundaries are defined as any punctuation or whitespace.

    """
    _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))

    results = []
    start = 0     # index of word boundary
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            l = len(key)
            # Make sure it ends at a boundary.
            if start + l == len(string) or \
               _boundary_re.match(string[start + l]):
                results.append((key, start, start + l))
        # Move forward to the next boundary.
        m = _boundary_re.search(string, start)
        if m is None:
            break
        start = m.end()
    return results