File: fsa.py

package info (click to toggle)
python-pynlpl 1.2.9-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,900 kB
  • sloc: python: 25,677; sh: 73; makefile: 3
file content (128 lines) | stat: -rw-r--r-- 4,785 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#---------------------------------------------------------------
# PyNLPl - Finite State Automata
#   by Maarten van Gompel
#   Centre for Language Studies
#   Radboud University Nijmegen
#   http://proycon.github.com/folia
#   http://www.github.com/proycon/pynlpl
#   proycon AT anaproy DOT nl
#
# Partially based/inspired on code by Xiayun Sun (https://github.com/xysun/regex)
#
#   Licensed under GPLv3
#
#----------------------------------------------------------------
from __future__ import print_function, unicode_literals, division, absolute_import
import sys


class State(object):
    def __init__(self, **kwargs):
        if 'epsilon' in kwargs:
            self.epsilon = kwargs['epsilon'] # epsilon-closure (lis of states)
        else:
            self.epsilon = [] # epsilon-closure
        if 'transitions' in kwargs:
            self.transitions = kwargs['transitions']
        else:
            self.transitions = [] #(matchitem, matchfunction(value), state)
        if 'final' in kwargs:
            self.final = bool(kwargs['final']) # ending state
        else:
            self.final = False
        self.transitioned = None #will be a tuple (state, matchitem) indicating how this state was reached



class NFA(object):
    """Non-deterministic finite state automaton. Can be used to model DFAs as well if your state transitions are not ambiguous and epsilon is empty."""

    def __init__(self, initialstate):
        self.initialstate = initialstate

    def run(self, sequence, mustmatchall=False,debug=False):
        def add(state, states):
            """add state and recursively add epsilon transitions"""
            assert isinstance(state, State)
            if state in states:
                return
            states.add(state)
            for eps in state.epsilon: #recurse into epsilon transitions
                add(eps, states)

        current_states = set()
        add(self.initialstate, current_states)
        if debug: print("Starting run, current states: ", repr(current_states),file=sys.stderr)

        for offset, value in enumerate(sequence):
            if not current_states: break
            if debug: print("Value: ", repr(value),file=sys.stderr)
            next_states = set()
            for state in current_states:
                for matchitem, matchfunction, trans_state in state.transitions:
                    if matchfunction(value):
                        trans_state.transitioned = (state, matchitem)
                        add(trans_state, next_states)

            current_states = next_states
            if debug: print("Current states: ", repr(current_states),file=sys.stderr)
            if not mustmatchall:
                for s in current_states:
                    if s.final:
                        if debug: print("Final state reached",file=sys.stderr)
                        yield offset+1

        if mustmatchall:
            for s in current_states:
                if s.final:
                    if debug: print("Final state reached",file=sys.stderr)
                    yield offset+1


    def match(self, sequence):
        try:
            return next(self.run(sequence,True)) == len(sequence)
        except StopIteration:
            return False

    def find(self, sequence, debug=False):
        l = len(sequence)
        for i in range(0,l):
            for length in self.run(sequence[i:], False, debug):
                yield sequence[i:i+length]

    def __iter__(self):
        return iter(self._states(self.initialstate))

    def _states(self, state, processedstates=[]): #pylint: disable=dangerous-default-value
        """Iterate over all states in no particular order"""
        processedstates.append(state)

        for nextstate in state.epsilon:
            if not nextstate in processedstates:
                self._states(nextstate, processedstates)

        for _, nextstate in state.transitions:
            if not nextstate in processedstates:
                self._states(nextstate, processedstates)

        return processedstates

    def __repr__(self):
        out = []
        for state in self:
            staterep = repr(state)
            if state is self.initialstate:
                staterep += " (INITIAL)"
            for nextstate in state.epsilon:
                nextstaterep = repr(nextstate)
                if nextstate.final:
                    nextstaterep += " (FINAL)"
                out.append( staterep + " -e-> " + nextstaterep )
            for item, _, nextstate in state.transitions:
                nextstaterep = repr(nextstate)
                if nextstate.final:
                    nextstaterep += " (FINAL)"
                out.append( staterep + " -(" + repr(item) + ")-> " + nextstaterep )

        return "\n".join(out)