1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
|
(***************************************************************************)
(* HLins: insert http-links into HTML documents. *)
(* See http://www.lri.fr/~treinen/hlins *)
(* *)
(* Copyright (C) 1999-2024 Ralf Treinen <treinen@irif.fr> *)
(* *)
(* This program is free software; you can redistribute it and/or modify *)
(* it under the terms of the GNU General Public License as published by *)
(* the Free Software Foundation; either version 2 of the License, or (at *)
(* your option) any later version. *)
(* *)
(* This program is distributed in the hope that it will be useful, but *)
(* WITHOUT ANY WARRANTY; without even the implied warranty of *)
(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)
(* General Public License for more details. *)
(* *)
(* You should have received a copy of the GNU General Public License *)
(* along with this program; if not, write to the Free Software *)
(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)
(* USA *)
(* *)
(***************************************************************************)
(*
HLins: insert http-links into HTML documents.
See http://www.lri.fr/~treinen/hlins
Copyright (C) 1999 Ralf Treinen <treinen@lri.fr>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*)
open Array;;
open Cyclic_buffer;;
open Lexing;;
open Automaton;;
open Read_html;;
let isblank c = c=' ' || c='\t' || c='\n' || c='\r' ;;
let run
{max_path_length=m;level=level;tree=tree;board=board;suf=suf;
found=found;expand=expand} inbuf subst outc =
let s = fresh (m+1)
(* s serves as additional input buffer that
takes priority over the lex buffer inbuf, that is when
taking the next character we check first with s (see
function read). In some cases we have to put symbols
already read back into s. However, the length of s is
at most the length of the longest search pattern. *)
and ext = fresh (m+1)
(* the buffer ext contains the part of the input that we
have already seen but that is not yet known to be a
match. Except in case of the function ex_lock this is
identical with the window (see below). The length of
ext is at most the length of the longest search
pattern. *)
in let rec read () =
(* read () tries to read first from the buffer s,
then from inbuf using lexer. If it succeeds then it
returns (c,vs,false), otherwise it returns (' ',"",true) .
In case of success:
- if a character token has been read then c is this character
and vs = ""
- if a verb token has been read then vs is the string of this
verb token (always non-empty) and c is ' '
*)
if is_empty s
then
try
(match next_html inbuf with
(CHAR c) -> (c,"", false)
| (VERB s) -> ' ',s,false)
with End_of_file -> (' ',"",true)
else (getc s,"",false)
and gettrans q c =
(* get the new state from q with letter c, taking tree and
board into consideration *)
try get_transition tree.(q) (if isblank c then ' ' else c)
with Not_found ->
if q=0 then 0 else gettrans board.(q) c
and gettree q c =
(* return the state obtained from node q with character c with
tree transition, 0 when a tree transition is not possible *)
try get_transition tree.(q) c
with Not_found -> 0
(***************************************************************************)
(*
At every moment, the search engine has stored in a "window" the part
of past input that still is under consideration. The search engine
tries to find the earliest position in the string
window^rest_of_input such that some search pattern is prefix of the
substring of window^rest_of_input starting at that position. Once
this position fixed, the search engine tries to find the longest
such prefix.
In the window, multiple white space is compressed to one white space
character. In any case, the window is a prefix of some search
pattern.
The engine can be in one of three possible states, realised by the
three main functions:
- go: no factor of the window is a search pattern.
- lock: some prefix of the window is a search pattern. In this case
we just try to extend the prefix to an ever longer prefix that is a
search pattern.
- try: some factor that is not a prefix of the window is a search
pattern (that is the search patterns starts at a later position in
the window). In this case there is still hope that we might find a
earlier position in the window where a search patterns starts.
The search engine uses of course the automaton (see explications in
automaton.mli). For each of the three functions, we have that
- q is a node of the automaton, and path(q) = window
- lastblank = true iff the last character of window is white
space. If the window is empty then lastblank has no significance.
*)
(*****************************************************************************)
in let
rec run_go q lastblank =
(* This is the initial function called. We execute the
automaton, changing into "lock" when we find a final
state and into "try" when we find an internal final state.
*)
let (c,vs,stop) = read ()
in
if stop
then
output_string outc (getall ext)
else
if vs <> ""
then (* reset the automaton *)
begin
output_string outc (getall ext);
output_string outc vs;
run_go 0 false
end
else
if isblank(c) && lastblank
then
begin
if q=0 then output_char outc c;
run_go q true
end
else let nq = gettrans q c
in
if nq = 0 then
begin
output_string outc (getall ext);
output_char outc c;
run_go 0 (isblank c);
end
else
begin
addc ext c;
output_string outc (gets ext (level.(q)-level.(nq)+1));
if suf.(nq) = nq
then (* ext is a pattern *)
run_lock nq nq (getall ext) false
else if suf.(nq) <> 0
then (* some proper suffix of ext that is a pattern. *)
run_try nq suf.(nq) (level.(nq)-level.(suf.(nq))) (isblank c)
else (* no factor of ext is a pattern *)
run_go nq (isblank c)
end
(******************************************************************************)
and run_lock q foundstate foundname lastblank =
(* window = foundname ^ (contents_of ext)
foundname is the longest prefix of window that is a pattern.
path(foundstate) = foundname
We hence just try to extend found by tree transitions. We don't
care for the board or internal final states here.
If we cannot proceed with tree transitions we output foundname
and start over with what we have buffered in ext.
*)
let (c,vs,stop) = read ()
in
if stop
then
(* no more input. print what we have found so far and start over *)
begin
output_string outc
(subst found.(foundstate) expand.(foundstate) foundname);
push s (getall ext);
run_go 0 false
end
else
if vs <> ""
then (* reset the automaton *)
begin
output_string outc
(subst found.(foundstate) expand.(foundstate) foundname);
output_string outc vs;
push s (getall ext);
run_go 0 false
end
else
if isblank(c) && lastblank
then (* q can not be 0 in go_lock *)
run_lock q foundstate foundname true
else
begin
addc ext c;
let nq = gettree q c
in
if nq = 0
then
(* No more tree transition possible. Print what we have
so far in foundname, put the contents of ext plus c
back into the input buffer, and start over in state 0.
*)
begin
output_string outc
(subst found.(foundstate) expand.(foundstate) foundname);
push s (getall ext);
run_go 0 false
end
else
if suf.(nq)=nq
then (* nq is again a final state, extend foundname *)
run_lock nq nq (foundname^(getall ext)) false
else (* nq is not a final state *)
run_lock nq foundstate foundname (isblank c)
end
(**************************************************************************)
and run_try q bq off lastblank =
(* This is the most complicated case.
ext is the window, and path(q) = window
off is the earliest position of the window such that some
search pattern is a prefix of the sub-string of the window
starting at that position. path(bq) is this search pattern.
*)
let (c,vs,stop) = read ()
in
if stop
then
begin
(let fo = gets ext (level.(bq))
in output_string outc (subst found.(bq) expand.(bq) fo));
push s (getall ext);
run_go 0 false
end
else
if vs <> ""
then (* reset the automaton *)
begin
(let fo = gets ext (level.(bq))
in output_string outc (subst found.(bq) expand.(bq) fo));
output_string outc vs;
push s (getall ext);
run_go 0 false
end
else
if isblank(c) && lastblank
then (* q can not be 0 in go_try *)
run_try q bq off true
else
begin
addc ext c;
let nq = gettrans q c
in let offset = level.(q) - level.(nq) + 1
in
if offset < off
then (* we can do the transition *)
begin
output_string outc (gets ext offset);
if suf.(nq) = nq
then (* final state *)
run_lock nq nq (getall ext) false
else if suf.(nq) <> 0
then (* internal final state *)
let newoff = level.(nq) - level.(suf.(nq))
in
if newoff < off - offset
then
run_try nq suf.(nq) (off-offset-newoff) (isblank c)
else
run_try nq bq (off-offset) (isblank c)
else (* nq not final and not internal final *)
run_try nq bq (off-offset) (isblank c)
end
else (* offset >= off *)
(* we can not advance the start position of the window
since this would cut the pattern that we already
have found in the interior of ext. hence we just commit
to the factor that we have found.
*)
begin
output_string outc (gets ext off);
let foundname = gets ext level.(bq)
in
begin
push s (getall ext);
run_lock bq bq foundname false
end
end
end
in
run_go 0 false
;;
|