File: gnatcoll-boyer_moore.adb

package info (click to toggle)
libgnatcoll 18-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 5,068 kB
  • sloc: ada: 40,393; python: 354; ansic: 310; makefile: 245; sh: 31
file content (376 lines) | stat: -rw-r--r-- 12,701 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
------------------------------------------------------------------------------
--                             G N A T C O L L                              --
--                                                                          --
--                     Copyright (C) 2001-2017, AdaCore                     --
--                                                                          --
-- This library is free software;  you can redistribute it and/or modify it --
-- under terms of the  GNU General Public License  as published by the Free --
-- Software  Foundation;  either version 3,  or (at your  option) any later --
-- version. This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY;  without even the implied warranty of MERCHAN- --
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE.                            --
--                                                                          --
--                                                                          --
--                                                                          --
--                                                                          --
--                                                                          --
-- You should have received a copy of the GNU General Public License and    --
-- a copy of the GCC Runtime Library Exception along with this program;     --
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
-- <http://www.gnu.org/licenses/>.                                          --
--                                                                          --
------------------------------------------------------------------------------

--  The search is done right-to-left. In the best case (the text doesn't
--  contain any character from the pattern), this results in only
--  string_length / pattern_length characters being examined, instead of
--  string_length characters.
--
--  Compiling the pattern generates two lookup-tables:
--
--  The Last Occurrence Function
--  ============================
--
--  The Last-Occurrence-Function returns, for each character of the
--  alphabet, its right-most location in the pattern.
--  When a character is seen in the searched string, this array will suggest
--  the offset by which we should move the pattern:
--      string:   "revolution in the treatment of"
--      pattern:  "     reminiscence"
--                                ^ when we see the 'h', we can move to:
--                "                reminiscence"
--
--      string:   "written notice that"
--      pattern:  "  reminiscence"
--                            ^  when we see the 'i', we can move to:
--                "      reminiscence"
--
--      string:   "golden fleece of"
--      pattern:  " reminiscence"
--                           ^   when we see the 'e', no move can be suggested,
--                               since 'e' appears at the right-most position
--                               in the pattern.
--
--  The Good Suffix Function
--  ========================
--
--  This function reports the least amount that guarantees that any pattern
--  characters that align with the good suffix previously found in the text
--  will match those suffix characters.
--  For instance:
--
--      string:   "written notice that"
--      pattern:  "  reminiscence"
--                            ^   The pattern would be moved so that the "ce"
--                             vv we have already found match some text.
--                "     reminiscence"
--
--  Combination
--  ===========
--
--  The two functions above can be computed statically based only on the
--  pattern, and without any knowledge of the text.
--  When we try to match a pattern with a text, these two functions are
--  combined, and the pattern is moved forward by the maximum amount reported
--  by the two functions.

with Unchecked_Deallocation;
with Ada.Text_IO;             use Ada.Text_IO;
with Ada.Integer_Text_IO;     use Ada.Integer_Text_IO;
with GNAT.Case_Util;          use GNAT.Case_Util;

package body GNATCOLL.Boyer_Moore is

   Debug     : constant Boolean := False;
   --  When True, Compile prints the tables it has computed and Search calls
   --  Dump after every failed alignment.

   Debug_Run : constant Boolean := False;
   --  When True, Dump additionally prints a detailed trace of the current
   --  alignment (offsets, candidate shifts, and the text/pattern layout).

   procedure Dump_Str (Str : String);
   --  Print Str followed by a line terminator, writing a space in place of
   --  each ASCII.LF so that the whole string stays on a single output line.

   procedure Dump
      (M, Shift, J : Natural;
       Num_Comp  : in out Natural;
       Motif     : Pattern;
       In_String : String);
   --  Print the current state of the search.
   --  The parameters are the internal state in Search (M is the pattern
   --  length, Shift the current offset of the pattern in In_String, J the
   --  index in the pattern where the comparison stopped). Num_Comp is a
   --  running count of comparisons and is increased by M - J + 1.
   --  We do not use a nested subprogram for efficiency reasons.

   -------------
   -- Compile --
   -------------

   procedure Compile
     (Motif          : in out Pattern;
      From_String    : String;
      Case_Sensitive : Boolean := True)
   is
      --  Build in Motif the lookup tables that Search needs for the pattern
      --  From_String. When Case_Sensitive is False, the pattern is stored
      --  in lower case.
      --
      --  An empty From_String compiles to a null motif, for which Search
      --  returns -1.
      --
      --  NOTE(review): storage already designated by Motif.Motif or
      --  Motif.Good_Suffix is not released here; presumably the caller is
      --  expected to call Free before compiling again -- confirm against
      --  the package spec.
      --
      --  Prefix contains the following:
      --    Prefix (J) is the length of the longest prefix of Motif
      --    which is also a suffix of
      --    Motif (Motif'First .. Motif'First + J - 1)
      --    ie of the motif made of the J first characters of Motif
      --
      --  Reverse_Prefix is the Prefix function applied to the reverse of Motif
      --
      --  Motif.Last_Occurrence contains the index of the last occurrence of
      --    the character in the motif. This is in the range 1 .. Motif'Length,
      --    or 0 for a character that does not appear in the motif.
      --
      --  Good_Suffix at index J:
      --    If a mismatch occurs in the j-th character of the pattern, we
      --    can safely advance by good_suffix (j).
      --       m = Motif'Length
      --       GS(J) = m
      --          - Max (k; 0<=k<m
      --                    and (Motif (J+1 .. m) suffix of Motif (1 .. k)
      --                         or Motif (1 .. k) suffix of Motif (J+1 .. m)))
      --
      --    For instance:  Motif="AABBA"
      --       GS(1) = 5 - 1 = 4
      --          since Motif (1 .. 1)="A" is a suffix of "ABBA"
      --       GS(4) = 5 - 2 = 3
      --          since "A" is a suffix of Motif (1 .. 2) = "AA"
      --       GS(5) = 5 - 1 = 4
      --          since "" is a suffix of Motif (1 .. 4) = "AABB"

      Prefix          : Offset_Array (1 .. From_String'Length);
      Reverse_Prefix  : Offset_Array (1 .. From_String'Length);
      K, K2           : Natural := 0;
      --  Running values of the prefix (K) and reverse prefix (K2) functions;
      --  each one is carried over from one iteration of the main loop to
      --  the next. K is reused afterwards as an index in Good_Suffix.

      Tmp             : Natural;
      --  Candidate shift while refining Good_Suffix

   begin
      --  Nothing to compile for an empty pattern

      if From_String = "" then
         Motif.Motif := null;
         return;
      end if;

      --  Store a private copy of the pattern, always indexed from 1
      --  (Search asserts this), lower-cased for case-insensitive searches
      --  since Search lower-cases the text characters it compares with.

      Motif.Case_Sensitive  := Case_Sensitive;
      Motif.Last_Occurrence := (others => 0);
      Motif.Motif           := new String (1 .. From_String'Length);
      Motif.Motif.all       := From_String;
      if not Case_Sensitive then
         To_Lower (Motif.Motif.all);
      end if;

      --  Values for the first character; the loop below starts at index 2

      Prefix (Prefix'First)                   := 0;
      Reverse_Prefix (Reverse_Prefix'First)   := 0;
      Motif.Last_Occurrence (Motif.Motif (1)) := 1;

      for Q in 2 .. Motif.Motif'Last loop
         --  Compute Last occurrence. Later positions overwrite earlier
         --  ones, so the right-most index is the one that remains.

         Motif.Last_Occurrence (Motif.Motif (Q)) := Q;

         --  Compute prefix function: fall back to shorter and shorter
         --  candidate prefixes until one can be extended by Motif (Q).

         while K > 0
           and then Motif.Motif (K + 1) /= Motif.Motif (Q)
         loop
            K := Prefix (K);
         end loop;

         if Motif.Motif (K + 1) = Motif.Motif (Q) then
            K := K + 1;
         end if;

         Prefix (Q) := K;

         --  Compute the reverse prefix function. This is the same
         --  computation as above, reading the motif from its last
         --  character backwards (index I is mirrored to Last + 1 - I).

         while K2 > 0
           and then Motif.Motif (Motif.Motif'Last - K2) /=
           Motif.Motif (Motif.Motif'Last + 1 - Q)
         loop
            K2 := Reverse_Prefix (K2);
         end loop;

         if Motif.Motif (Motif.Motif'Last - K2) =
           Motif.Motif (Motif.Motif'Last + 1 - Q)
         then
            K2 := K2 + 1;
         end if;

         Reverse_Prefix (Q) := K2;
      end loop;

      --  Compute the good suffix function.
      --  Every entry (indices 0 .. pattern length) starts with the same
      --  default shift: the pattern length minus Prefix at the last index.

      K := From_String'Length - Prefix (From_String'Length);
      Motif.Good_Suffix := new Offset_Array'(0 .. From_String'Length => K);

      --  Then lower the entry at index (length - Reverse_Prefix (L))
      --  whenever L - Reverse_Prefix (L) is a smaller shift.

      for L in Motif.Motif'Range loop
         K   := From_String'Length - Reverse_Prefix (L);
         Tmp := L - Reverse_Prefix (L);

         if Motif.Good_Suffix (K) > Tmp then
            Motif.Good_Suffix (K) := Tmp;
         end if;
      end loop;

      --  Optional trace of all the tables computed above, one row each

      if Debug then
         Put ("   i   =  ");
         for J in Motif.Motif'Range loop
            Put (Item => J, Width => 3);
         end loop;
         New_Line;

         Put ("  Pat[i]=  ");
         for J in Motif.Motif'Range loop
            Put ("  " & Motif.Motif (J));
         end loop;
         New_Line;

         Put ("  Pre[i]=  ");
         for J in Prefix'Range loop
            Put (Item => Prefix (J), Width => 3);
         end loop;
         New_Line;

         Put ("RevPre[i]=  ");
         for J in Reverse_Prefix'Range loop
            Put (Item => Reverse_Prefix (J), Width => 3);
         end loop;
         New_Line;

         Put ("GoodSu[i]=  ");
         for J in Motif.Good_Suffix'Range loop
            Put (Item => Motif.Good_Suffix (J), Width => 3);
         end loop;
         New_Line;
      end if;
   end Compile;

   ----------
   -- Free --
   ----------

   procedure Free (Motif : in out Pattern) is
      --  Release the two heap objects owned by Motif: the good-suffix table
      --  and the stored copy of the pattern text. Both components are null
      --  on return, as Unchecked_Deallocation resets its argument.

      procedure Release_Offsets is new Unchecked_Deallocation
        (Object => Offset_Array, Name => Offset_Array_Access);

      procedure Release_Text is new Unchecked_Deallocation
        (Object => String, Name => String_Access);

   begin
      Release_Offsets (Motif.Good_Suffix);
      Release_Text (Motif.Motif);
   end Free;

   --------------
   -- Dump_Str --
   --------------

   procedure Dump_Str (Str : String) is
      --  Write Str on the current output, one character at a time, showing
      --  each line feed as a blank, then terminate the line.

      function Flattened (C : Character) return Character;
      --  Return a space for ASCII.LF, and C itself for anything else

      function Flattened (C : Character) return Character is
      begin
         if C = ASCII.LF then
            return ' ';
         end if;

         return C;
      end Flattened;

   begin
      for Pos in Str'Range loop
         Put (Flattened (Str (Pos)));
      end loop;

      New_Line;
   end Dump_Str;

   ----------
   -- Dump --
   ----------

   procedure Dump
      (M, Shift, J : Natural;
       Num_Comp  : in out Natural;
       Motif     : Pattern;
       In_String : String)
   is
      --  Account for the comparisons of the alignment that just ended and,
      --  when Debug_Run is set, trace that alignment on the current output.
      --  M is the pattern length, Shift the offset of the pattern in
      --  In_String and J the pattern index at which comparison stopped.

      Display_Limit : constant := 400;
      --  Texts or offsets at or beyond this size are not laid out

      function Blanks (Count : Integer) return String;
      --  Return a string made of Count spaces (empty if Count < 1)

      function Blanks (Count : Integer) return String is
      begin
         return (1 .. Count => ' ');
      end Blanks;

   begin
      --  Positions J .. M of the pattern were compared in this alignment

      Num_Comp := Num_Comp + M - J + 1;

      if Debug_Run then
         New_Line;

         declare
            Text_Pos : constant Natural   := Shift + J;
            Seen     : constant Character := In_String (Text_Pos);
            Bad_Char : constant Integer   :=
              J - Motif.Last_Occurrence (Seen);
         begin
            --  Summary line: where we stopped and the two candidate shifts

            Put_Line ("Offset : Shift+j="
                      & Integer'Image (Text_Pos)
                      & " J=" & J'Img
                      & " Last_Occ=" & Seen
                      & " Max ("
                      & Motif.Good_Suffix (J)'Img
                      & ","
                      & Integer'Image (Bad_Char)
                      & ")");

            --  The text (only when short enough), then the pattern. The
            --  padding before the pattern is printed only with the text.

            if In_String'Length < Display_Limit then
               Dump_Str (In_String);
               Put (Blanks (Shift - In_String'First + 1));
            end if;

            Dump_Str (Motif.Motif.all);

            --  Marker under the text position where comparison stopped

            if Text_Pos - In_String'First < Display_Limit then
               Put (Blanks (Text_Pos - In_String'First));
               Put_Line ("^");
            end if;
         end;
      end if;

      --  This report does not depend on Debug_Run

      if J = 0 then
         Put_Line
           ("Matched at position" & Natural'Image (Shift + 1)
            & " after" & Num_Comp'Img & " comparisons");
      end if;
   end Dump;

   ------------
   -- Search --
   ------------

   function Search (Motif : Pattern; In_String : String) return Integer is
      --  Return the index in In_String of the first character of the first
      --  occurrence of the compiled pattern Motif, or -1 if the pattern does
      --  not occur or if Motif holds no pattern (null motif).
      --
      --  The pattern is aligned at In_String (Window + 1 ..) and compared
      --  from its last character down to its first. On a mismatch the
      --  window advances by the larger of the good-suffix shift and the
      --  last-occurrence shift for the offending text character.
      --
      --  The loop is written out twice, so that the case-sensitive path
      --  never calls To_Lower.

      Not_Found   : constant Integer := -1;

      Window      : Natural := In_String'First - 1;
      --  Number of text positions before the current alignment

      Pat_Len     : Natural;
      Cursor      : Natural;
      --  Pattern index being compared; 0 once the whole pattern matched

      Seen        : Character := ASCII.NUL;
      --  Text character facing Motif.Motif (Cursor), already lower-cased
      --  in the case-insensitive loop

      Comparisons : Natural := 0;
      --  Only maintained by Dump, for the debug traces

   begin
      if Motif.Motif = null then
         return Not_Found;
      end if;

      Pat_Len := Motif.Motif'Length;
      pragma Assert (Motif.Motif'First = 1);

      if Motif.Case_Sensitive then
         while Window <= In_String'Last - Pat_Len loop
            Cursor := Pat_Len;

            loop
               exit when Cursor = 0;
               Seen := In_String (Window + Cursor);
               exit when Motif.Motif (Cursor) /= Seen;
               Cursor := Cursor - 1;
            end loop;

            if Cursor = 0 then
               return Window + 1;
            end if;

            if Debug then
               Dump (Pat_Len, Window, Cursor, Comparisons, Motif, In_String);
            end if;

            Window := Window +
              Natural'Max (Motif.Good_Suffix (Cursor),
                           Cursor - Motif.Last_Occurrence (Seen));
         end loop;

      else
         --  Same loop; the pattern was lower-cased by Compile, so only the
         --  text characters need converting.

         while Window <= In_String'Last - Pat_Len loop
            Cursor := Pat_Len;

            loop
               exit when Cursor = 0;
               Seen := To_Lower (In_String (Window + Cursor));
               exit when Motif.Motif (Cursor) /= Seen;
               Cursor := Cursor - 1;
            end loop;

            if Cursor = 0 then
               return Window + 1;
            end if;

            if Debug then
               Dump (Pat_Len, Window, Cursor, Comparisons, Motif, In_String);
            end if;

            Window := Window +
              Natural'Max (Motif.Good_Suffix (Cursor),
                           Cursor - Motif.Last_Occurrence (Seen));
         end loop;
      end if;

      return Not_Found;
   end Search;

end GNATCOLL.Boyer_Moore;