File: run_automaton.ml

package info (click to toggle)
hlins 0.40-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 484 kB
  • sloc: ml: 1,215; makefile: 148; sh: 4
file content (343 lines) | stat: -rw-r--r-- 11,527 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
(***************************************************************************)
(*  HLins: insert http-links into HTML documents.                          *)
(*  See http://www.lri.fr/~treinen/hlins                                   *)
(*                                                                         *)
(*  Copyright (C) 1999-2024 Ralf Treinen <treinen@irif.fr>                 *)
(*                                                                         *)
(*  This program is free software; you can redistribute it and/or modify   *)
(*  it under the terms of the GNU General Public License as published by   *)
(*  the Free Software Foundation; either version 2 of the License, or (at  *)
(*  your option) any later version.                                        *)
(*                                                                         *)
(*  This program is distributed in the hope that it will be useful, but    *)
(*  WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU      *)
(*  General Public License for more details.                               *)
(*                                                                         *)
(*  You should have received a copy of the GNU General Public License      *)
(*  along with this program; if not, write to the Free Software            *)
(*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307    *)
(*  USA                                                                    *)
(*                                                                         *)
(***************************************************************************)

(*
    HLins: insert http-links into HTML documents.
    See http://www.lri.fr/~treinen/hlins

    Copyright (C) 1999 Ralf Treinen <treinen@lri.fr>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*)


open Array;;
open Cyclic_buffer;;
open Lexing;;
open Automaton;;
open Read_html;;


let isblank c = c=' ' || c='\t' || c='\n' || c='\r' ;;


let run
  {max_path_length=m;level=level;tree=tree;board=board;suf=suf;
   found=found;expand=expand} inbuf subst outc =

  let s = fresh (m+1)
	    (* s serves as additional input buffer that
	       takes priority over the lex buffer inbuf, that is when
	       taking the next character we check first with s (see
	       function  read). In some cases we have to put symbols
	       already read back into s. However, the length of s is
	       at most the length of the longest search pattern.  *)
	    
  and ext = fresh (m+1)
	      (* the buffer ext contains the part of the input that we
		 have already seen but that is not yet known to be a
		 match. Except in case of the function ex_lock this is
		 identical with the window (see below). The length of
		 ext is at most the length of the longest search
		 pattern. *)

  in let rec read () =
      (* read () tries to read first from the buffer s,
	 then from inbuf using lexer. If it succeeds then it
	 returns (c,vs,false), otherwise it returns (' ',"",true) .
	 In case of success:
	 - if a character token has been read then c is this character
	   and vs = ""
	 - if a verb token has been read then vs is the string of this
           verb token (always non-empty) and c is ' '
      *)
      if is_empty s
      then
	try
	  (match next_html inbuf with
	       (CHAR c) -> (c,"", false)
	     | (VERB s) -> ' ',s,false)
	with End_of_file -> (' ',"",true)
      else (getc s,"",false)

     and gettrans q c =
      (* get the new state from q with letter c, taking tree and
	 board into consideration *)
      try get_transition tree.(q) (if isblank c then ' ' else c)
      with Not_found ->
	if q=0 then 0 else gettrans board.(q) c

     and gettree q c =
      (* return the state obtained from node q with character c with
	 tree transition, 0 when a tree transition is not possible *)
      try get_transition tree.(q) c
      with Not_found -> 0
	  

(***************************************************************************)

(*

  At every moment, the search engine has stored in a "window" the part
  of past input that still is under consideration. The search engine
  tries to find the earliest position in the string
  window^rest_of_input such that some search pattern is prefix of the
  substring of window^rest_of_input starting at that position. Once
  this position fixed, the search engine tries to find the longest
  such prefix.

  In the window, multiple white space is compressed to one white space
  character. In any case, the window is a prefix of some search
  pattern.

  The engine can be in one of three possible states, realised by the
  three main functions:
  - go: no factor of the window is a search pattern.
  - lock: some prefix of the window is a search pattern. In this case
    we just try to extend the prefix to an ever longer prefix that is a
    search pattern.
  - try: some factor that is not a prefix of the window is a search
    pattern (that is the search patterns starts at a later position in
    the window). In this case there is still hope that we might find a
    earlier position in the window where a search patterns starts.

  The search engine uses of course the automaton (see explications in
  automaton.mli). For each of the three functions, we have that
  - q is a node of the automaton, and path(q) = window
  - lastblank = true iff the last character of window is white
    space. If the window is empty then lastblank has no significance.

*)

(*****************************************************************************)


  in let
  rec run_go q lastblank  =
	 (* This is the initial function called. We execute the
	    automaton, changing into "lock" when we find a final
	    state and into "try" when we find an internal final state.
	 *)
   let (c,vs,stop) = read ()
   in
     if stop
     then 
       output_string outc (getall ext)
     else
       if vs <> ""
       then (* reset the automaton *)
	 begin
	   output_string outc (getall ext);
	   output_string outc vs;
	   run_go 0 false
	 end
       else
	 if isblank(c) && lastblank
	 then
	   begin
	     if q=0 then output_char outc c;
	     run_go q true
	   end
	 else let nq = gettrans q c
	 in
	   if nq = 0 then
	     begin
	       output_string outc (getall ext);
	       output_char outc c;
	       run_go 0 (isblank c);
	     end
	   else
	     begin
	       addc ext c;
	       output_string outc (gets ext (level.(q)-level.(nq)+1));
	       if suf.(nq) = nq
	       then (* ext is a pattern *)
		 run_lock nq nq (getall ext) false
	       else if suf.(nq) <> 0
	       then (* some proper suffix of ext that is a pattern. *)
		 run_try nq suf.(nq) (level.(nq)-level.(suf.(nq))) (isblank c)
	       else (* no factor of ext is a pattern *)
		 run_go nq (isblank c)
	     end
	     

(******************************************************************************)


 and run_lock q foundstate foundname lastblank =
    (* window = foundname ^ (contents_of ext)
       foundname is the longest prefix of window that is a pattern.
       path(foundstate) = foundname
       
       We hence just try to extend found by tree transitions. We don't
       care for the board or internal final states here.
       If we cannot proceed with tree transitions we output foundname
       and start over with what we have buffered in ext.
    *)
    let (c,vs,stop) = read ()
    in
      if stop
      then
	(* no more input. print what we have found so far and start over *)
	begin
	  output_string outc
	     (subst found.(foundstate) expand.(foundstate) foundname);
	  push s (getall ext);
	  run_go 0 false
	end
      else
	if vs <> ""
	then (* reset the automaton *)
	  begin
	    output_string outc
	      (subst found.(foundstate) expand.(foundstate) foundname);
	    output_string outc vs;
	    push s (getall ext);
	    run_go 0 false
	  end
	else 
	  if isblank(c) && lastblank
	  then (* q can not be 0 in go_lock *)
	    run_lock q foundstate foundname true
	  else
	    begin 
	      addc ext c;
	      let nq = gettree q c
	      in
		if nq = 0
		then
		  (* No more tree transition possible. Print what we have
		     so far in foundname, put the contents of ext plus c
		     back into the input buffer, and start over in state 0.
		  *)
		  begin
		    output_string outc
		      (subst found.(foundstate) expand.(foundstate) foundname);
		    push s (getall ext);
		    run_go 0 false
		  end
		else
		  if suf.(nq)=nq
		  then (* nq is again a final state, extend foundname *)
		    run_lock nq nq (foundname^(getall ext)) false  
		  else (* nq is not a final state  *)
		    run_lock nq foundstate foundname (isblank c)
	    end
	    
	    

(**************************************************************************)
  
and run_try q bq off lastblank =
	 (* This is the most complicated case.
	    ext is the window, and path(q) = window
	    off is the earliest position of the window such that some
	    search pattern is a prefix of the sub-string of the window
	    starting at that position. path(bq) is this search pattern.
	 *)
  let (c,vs,stop) = read ()
  in
    if stop
    then
      begin
	(let fo = gets ext (level.(bq))
	 in output_string outc (subst found.(bq) expand.(bq) fo));
	push s (getall ext);
	run_go 0 false
      end
    else
      if vs <> ""
      then (* reset the automaton *)
	begin
	  (let fo = gets ext (level.(bq))
	   in output_string outc (subst found.(bq) expand.(bq) fo));
	  output_string outc vs;
	  push s (getall ext);
	  run_go 0 false
	end
      else 
	if isblank(c) && lastblank
	then (* q can not be 0 in go_try *)
	  run_try q bq off true
	else
	  begin
	    addc ext c;
	    let nq = gettrans q c 
	    in let offset = level.(q) - level.(nq) + 1 
	    in
	      if offset < off
	      then (* we can do the transition *) 
		begin
		  output_string outc (gets ext offset);
		  if suf.(nq) = nq
		  then (* final state *)
		    run_lock nq nq (getall ext) false
		  else if suf.(nq) <> 0
		  then (* internal final state *)
		    let newoff = level.(nq) - level.(suf.(nq))
		    in
		      if newoff < off - offset
		      then 
			run_try nq suf.(nq) (off-offset-newoff) (isblank c)
		      else
			run_try nq bq (off-offset) (isblank c)
		  else (* nq not final and not internal final *)
		    run_try nq bq (off-offset) (isblank c)
		end
	      else (* offset >= off *)
		(* we can not advance the start position of the window 
		   since this would cut the pattern that we already
		   have found in the interior of ext. hence we just commit
		   to the factor that we have found.
		*)
		begin
		  output_string outc (gets ext off);
		  let foundname = gets ext level.(bq)
		  in
		    begin
		      push s (getall ext);
		      run_lock bq bq foundname false
		    end
		end
		
	  end
	  
in
  run_go 0 false
;;