File: triefind.py

package info (click to toggle)
python-biopython 1.73%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 57,852 kB
  • sloc: python: 169,977; xml: 97,539; ansic: 15,653; sql: 1,208; makefile: 159; sh: 63
file content (114 lines) | stat: -rw-r--r-- 3,694 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Copyright 2002-2003 Jeff Chang.  All rights reserved.
# Revisions copyright 2012 by Christian Brueffer.  All rights reserved.
# Revisions copyright 2012-2017 by Peter Cock.  All rights reserved.
# Revisions copyright 2015 by Brian Osborne.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
#

"""Given a trie, find all occurrences of a word in the trie in a string.

Like searching a string for a substring, except that the substring is
any word in a trie.

Functions:
 - match         Find longest key in a trie matching the beginning of the string.
 - match_all     Find all keys in a trie matching the beginning of the string.
 - find          Find keys in a trie matching anywhere in a string.
 - find_words    Find keys in a trie matching whole words in a string.

This module is DEPRECATED. We encourage users to switch to alternative libraries
implementing a trie data structure, for example pygtrie.
"""

import string
import re

from Bio import BiopythonDeprecationWarning
import warnings
warnings.warn("This module has been deprecated. We encourage users to switch "
              "to alternative libraries implementing a trie data structure, "
              "for example pygtrie.", BiopythonDeprecationWarning)


def match(string, trie):
    """Find longest key, or return None.

    Find the longest key in the trie that matches the beginning of the
    string.
    """
    longest = None
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            longest = substr
    return longest


def match_all(string, trie):
    """Find and return a list of keys.

    Find all the keys in the trie that matches the beginning of the
    string.
    """
    matches = []
    for i in range(len(string)):
        substr = string[:i + 1]
        if not trie.has_prefix(substr):
            break
        if substr in trie:
            matches.append(substr)
    return matches


def find(string, trie):
    """Find all the keys in the trie that match anywhere in the string.

    Returns a list of tuples (key, start, end).
    """
    results = []
    start = 0     # index to start the search
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            results.append((key, start, start + len(key)))
        start += 1
    return results


DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace


def find_words(string, trie):
    """Find all the keys in the trie that match full words in the string.

    Find all the keys in the trie that match full words in the string.
    Word boundaries are defined as any punctuation or whitespace.

    Returns a list of tuples (key, start, end).
    """
    _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))

    results = []
    start = 0     # index of word boundary
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            length = len(key)
            # Make sure it ends at a boundary.
            if start + length == len(string) or \
               _boundary_re.match(string[start + length]):
                results.append((key, start, start + length))
        # Move forward to the next boundary.
        m = _boundary_re.search(string, start)
        if m is None:
            break
        start = m.end()
    return results