File: indexer1.ml

package info (click to toggle)
mldonkey 3.1.2-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 17,524 kB
  • sloc: ml: 149,175; cpp: 11,805; ansic: 8,780; sh: 4,226; asm: 3,870; xml: 1,092; perl: 102; makefile: 95
file content (504 lines) | stat: -rw-r--r-- 16,474 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
(* Copyright 2001, 2002 b8_bavard, b8_fee_carabine, INRIA *)
(*
    This file is part of mldonkey.

    mldonkey is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    mldonkey is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with mldonkey; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*)

open Printf2

open Indexer
  
module Make(Doc : Doc) = struct

    type doc = Doc.t
    
    type tree_rec = {
        mutable tree_expr : node;
      }
    
    and node =
      List of suffix * (char * tree_rec) list
    | String of string * suffix
    | Suffix of suffix
    | Filtered of tree_rec
    
    and index = {
        mutable next_doc : int;
        mutable next_word : int;  
        index_tree : tree_rec;
      }
    
    and suffix = {
        suffix_num : int;
        suffix_value : string;
        mutable suffix_word : word option;
        mutable suffix_words : word list;
      }
    
    and word ={
        word_string : string;
        mutable word_filtered : bool;
        mutable word_docs : (Doc.t * int) list; 
      }

(* pour chaque mot, on obtient une liste de documents.
Pour chaque document, un entier indique les champs dans lesquels
le mot apparait.
  *)
    
    let nsuffixes = ref 0
    let new_word s =
      incr nsuffixes;  
      { 
        suffix_num = !nsuffixes;
        suffix_value = s;
        suffix_word = None;
        suffix_words = [] ;
      }

    let stats _ = 0
      
    let clear index = 
      index.next_doc <- 0;
      index.next_word <- 0;
      index.index_tree.tree_expr <- String ("", new_word  "")



(************ ADDING DOCUMENTS ***************)
    
    
    
    let add_suffix idx s =
      let len = String.length s in
      let rec iter pos tree filtered =
        if pos = len then
          match tree.tree_expr with
          | Suffix _ -> assert false
          | List (num, _) -> num, filtered
          | String (str,num) ->
              if str = "" then num, filtered
              else
              let new_num = new_word (String.sub s 0 pos) in
              tree.tree_expr <- List (new_num, 
                [str.[0], 
                  { tree_expr = String (
                      String.sub str 1 (String.length str - 1), num); }]);
              new_num, filtered
          | Filtered tree ->
              iter pos tree true
        else
        match tree.tree_expr with
        | Suffix _ -> assert false
        | String (str, num) ->
            let suffix = String.sub s pos (len - pos) in
            if str = suffix then num, filtered else
            if str = "" then
              let new_num = new_word (String.sub s 0 pos) in
              tree.tree_expr <- List (num, 
                [s.[pos], 
                  { tree_expr = String (
                      String.sub s (pos+1) (String.length s -1-pos), new_num); 
                  }]);
              new_num, filtered
            else 
              begin
                tree.tree_expr <- List (new_word (String.sub s 0 pos), 
                  [str.[0], 
                    { tree_expr = String (
                        String.sub str 1 (String.length str - 1), num); }]);
                iter pos tree filtered; 
              end
        | Filtered tree ->
            iter pos tree true
        | List (num, list) -> 
            let c = s.[pos] in
            try 
              iter (pos+1) (List.assoc c list) filtered
            with Not_found ->
                let new_num = new_word s in
                tree.tree_expr <- List (num, (c, { tree_expr = String (
                        String.sub s (pos+1) (String.length s -pos-1), new_num); 
                    }) :: list);
                new_num, filtered
      in
      iter 0 idx.index_tree false
    
    let add_doc w doc field filtered =
      if filtered || w.word_filtered then begin
(*      lprintf "FILTER ON DOC"; lprint_newline (); *)
          Doc.filter doc true;
        end;
      w.word_docs <- (match w.word_docs with
          (d,f) :: tail when d = doc -> 
            (doc, f lor field) :: w.word_docs
        | _ -> (doc, field) :: w.word_docs)
    
    let add idx s doc field =
      let suffix, filtered = add_suffix idx s in
(*  if filtered then begin
      lprintf "SUFFIX %s IS FILTERED..." s; 
      lprint_newline (); 
    end; *)
      match suffix.suffix_word with
      | Some w -> (* the word is already inside *)
(*        lprintf "Word already exists"; lprint_newline (); *)
          add_doc w doc field filtered
      | None ->
(*      lprintf "New word added"; lprint_newline (); *)
          let w = { word_docs = [doc, field]; 
              word_string = s; 
              word_filtered = filtered;
            } in
          if filtered then begin
(*          lprintf "FILTER ON DOC"; lprint_newline (); *)
              Doc.filter doc true;
            end;
          suffix.suffix_word <- Some w;
          let len = String.length s in
          let rec iter pos =
(*        lprintf "iter %d/%d" pos len; lprint_newline (); *)
            if pos < len then
              let suff = String.sub s pos (len - pos) in
              let suffix, filtered = add_suffix idx suff in
(*          lprintf "added suffix [%s]" suffix.suffix_value; 
          lprint_newline (); *)
(*          if filtered then begin
              lprintf "NEXT SUFFIX %s IS FILTERED..." suff; 
              lprint_newline ();
            end; *)
              match suffix.suffix_word with
              | Some w -> add_doc w doc field filtered
              | None -> 
                  if filtered then begin
                      w.word_filtered <- true;
                      Doc.filter doc true;
                    end;
                  (match suffix.suffix_words with
                      ww :: _ when w == ww -> ()
                    | _ -> 
                        suffix.suffix_words <- w :: suffix.suffix_words
                  );
                  iter (pos+1)
          in
          iter 1




(*************** MISC FUNCTIONS ****************)  
    
    
    
    let find idx s =
      try
        let len = String.length s in
        let rec iter pos tree =
          if pos = len then tree.tree_expr else
          match tree.tree_expr with
          | Suffix _ -> assert false
          | String (str, num) ->
              let max_len = len - pos in
              let len1 = String.length str in
              if len1 < max_len then raise Not_found;
              let str2 = String.sub s pos max_len in
              let str1 = String.sub str 0 max_len in
(*        lprintf "Check [%s] with [%s]" str1 str2; lprint_newline (); *)
              if str1 = str2 then begin
                  Suffix num 
                end else
                raise Not_found
          | Filtered _ -> raise Not_found
          | List (num, list) ->
              if pos = len then Suffix num else
              let c = s.[pos] in
(*        lprintf "Multiplex on [%c]" c; lprint_newline (); *)
              iter (pos+1) (List.assoc c list)
        in
        iter 0 idx.index_tree
      with _ ->
          Suffix { suffix_num = 0;
            suffix_value = "";
            suffix_word = None;
            suffix_words = [];
          } 
    
    
    let rec spaces n=
      if n > 0 then begin
          lprint_char ' ';
          spaces (n-1)
        end
    
    let print idx =
      lprintf_nl "INDEX";
      let rec iter pos tree =
        match tree.tree_expr with
        | Suffix s ->
            spaces pos;
            lprintf_nl "SUFFIX %d" s.suffix_num;
        | String (s, num) ->
            spaces pos;
            lprintf "[%s]  ---> [%d]" s num.suffix_num;
            spaces pos;
            lprintf "  ";
            List.iter (fun w ->
                lprintf_nl "%s " w.word_string) num.suffix_words;
        | List(num, list) ->
            spaces pos;
            lprintf_nl "List [%d]" num.suffix_num;
            List.iter (fun (c, tree) ->
                spaces (pos+2);
                lprintf_nl "Char [%c]" c;
                iter (pos+4) tree
            ) list;
        | Filtered tree ->
            lprintf_nl "FILTERED";
            iter (pos+2) tree
      in
      iter 0 idx.index_tree
    
    let create () =
      {
        next_doc = 0;
        index_tree = { tree_expr = String ("", new_word  "") };
        next_word = 0;
      }
    
    let print_word_docs w =
      lprintf_nl "Word [%s]" w.word_string;
      List.iter (fun (doc, _) ->
          lprintf_nl "    %d" (Doc.num doc)) w.word_docs
    
    let rec print_suffix_docs suffix = 
      List.iter (fun w ->
          print_word_docs w
      ) (match suffix.suffix_word with
          None ->  suffix.suffix_words
        | Some w -> w :: suffix.suffix_words)
    
    let rec print_all_docs s =
      match s with
        Suffix suffix ->  print_suffix_docs suffix
      | String (_,s) -> print_suffix_docs s
      | List(s, list) ->
          print_suffix_docs s;
          List.iter (fun (c,t) -> print_all_docs t.tree_expr) list
      | Filtered t ->
          lprintf_nl "filtered";
          print_all_docs t.tree_expr




(**************** FILTERS *****************)
    
    
    
    
    let rec filter_tree tree = 
      match tree.tree_expr with
        List (s, list) ->
          List.iter (fun (c, tree) -> filter_tree tree) list;
          filter_suffix s
      | String (_,s) 
      | Suffix s -> filter_suffix s
      | Filtered t ->
          filter_tree t
    
    and filter_suffix s =
      List.iter (fun w ->
          if not w.word_filtered then begin
(*          lprintf "FILTER ON WORD [%s]" w.word_string; 
  lprint_newline (); *)
              w.word_filtered <- true;
              List.iter (fun (doc, where) ->
                  if not (Doc.filtered doc) then
                    Doc.filter doc true) w.word_docs
            end)
      (match s.suffix_word with
          None -> s.suffix_words | Some w -> w :: s.suffix_words)
    
    let add_filter idx s =
(*  lprintf "ADD FILTER ON %s" s; lprint_newline (); *)
      let len = String.length s in
      let rec iter pos tree =
        if pos = len then begin
(*        lprintf "FILTER AT FINAL POS"; lprint_newline (); *)
            tree.tree_expr <- Filtered { tree_expr = tree.tree_expr };
            filter_tree tree
          end
        else
        match tree.tree_expr with
        | Suffix _ -> 
(*        lprintf "FILTER ON SUFFIX ???"; lprint_newline (); *)
            assert false
        | String (str, num) ->
(*        lprintf "FILTER ON STRING "; lprint_newline (); *)
            if str = "" then
              let new_num = new_word (String.sub s 0 pos) in
              tree.tree_expr <- List (num, 
                [s.[pos], 
                  { tree_expr = String (
                      String.sub s (pos+1) (String.length s -1-pos), new_num); 
                  }]);
            else 
              tree.tree_expr <- List (new_word (String.sub s 0 pos), 
                [str.[0], 
                  { tree_expr = String (
                      String.sub str 1 (String.length str - 1), num); }]);
            iter pos tree; 
        | Filtered tree ->
(*        lprintf "FILTER ON FILTER"; lprint_newline (); *)
            iter pos tree
        | List (num, list) -> 
(*        lprintf "FILTER IN LIST"; lprint_newline (); *)
            let c = s.[pos] in
            try 
              iter (pos+1) (List.assoc c list)
            with Not_found ->
                let new_num = new_word s in
                let new_tree = { tree_expr = String (
                      String.sub s (pos+1) (String.length s -pos-1), new_num); 
                  } in
                tree.tree_expr <- List (num, (c, new_tree) :: list);
                iter (pos+1) new_tree
      in
      iter 0 idx.index_tree
    
    let filter_words index list = 
      List.iter (fun s -> ignore (add_filter index s)) list
    
    let clear_filter index = 
      let rec iter tree =
        match tree.tree_expr with
          List (s, list) ->
            List.iter (fun (c, tree) -> iter tree) list;
            iter_suffix s
        | String (_,s) 
        | Suffix s -> iter_suffix s
        | Filtered t ->
(*        lprintf "REMOVE FILTER"; lprint_newline (); *)
            tree.tree_expr <- t.tree_expr;
            iter tree
      
      and iter_suffix s =
        List.iter (fun w ->
            if w.word_filtered then begin
(*            lprintf "REMOVE FILTERED WORD %s" w.word_string;
            lprint_newline (); *)
                w.word_filtered <- false;
                List.iter (fun (doc, where) ->
                    if Doc.filtered doc then begin
(*
                    lprintf "RMOVE FILTER ON DOC"; 
lprint_newline ();
  *)
                        Doc.filter doc false
                      end) w.word_docs
              end)
        (match s.suffix_word with
            None -> s.suffix_words | Some w -> w :: s.suffix_words)
      
      in
      iter index.index_tree
    
    let filtered doc = Doc.filtered doc
    
    
    let or_get_fields map tree fields =
(*  lprintf "get_fields"; lprint_newline (); *)
      let rec iter tree = 
(* retourne une liste de listes triees de documents *)
        match tree with
          List (s, tree_list) ->
            iter_suffix s;
            List.iter (fun (_, tree) ->
                iter tree.tree_expr
            ) tree_list
        | String (_, s) 
        | Suffix s ->
            iter_suffix s
        | Filtered _ -> ()
      
      and iter_suffix s =
(*    lprintf "iter suffix"; lprint_newline (); *)
        List.iter (fun w ->
(*        lprintf "occurences of [%s]" w.word_string; lprint_newline ();  *)
            if not w.word_filtered then
              List.iter (fun (doc, where) ->
(*            lprintf "in doc %d field %d/%d" doc.doc_num where fields; 
            lprint_newline (); *)
                  if where land fields <> 0 && 
                    not (Doc.filtered doc) &&
                    not (
                      Intmap.mem (Doc.num doc) !map) then begin
(*                lprintf "add doc to map"; lprint_newline (); *)
                      map := Intmap.add (Doc.num doc) doc !map
                    end
              ) w.word_docs) (match s.suffix_word with
            None -> s.suffix_words | Some w -> w :: s.suffix_words);    
      in
      iter tree;
      !map
    
    
    let and_get_fields tree fields and_map = 
      let map = ref Intmap.empty in 
      let rec iter tree = 
(* retourne une liste de listes triees de documents *)
        match tree with
          List (s, tree_list) ->
            iter_suffix s;
            List.iter (fun (_, tree) ->
                iter tree.tree_expr
            ) tree_list
        | String (_, s) 
        | Suffix s ->
            iter_suffix s
        | Filtered _ -> ()
      
      and iter_suffix s =
(*    lprintf "ITER AND SUFFIX [%d]" s.suffix_num; lprint_newline (); *)
        List.iter (fun w ->
            if not w.word_filtered then
(*        lprintf "ON WORD [%s]" w.word_string; lprint_newline (); *)
              List.iter (fun (doc, where) ->
(*            lprintf "CHECK %d" doc.doc_num; lprint_newline (); *)
                  if where land fields <> 0 &&
                    not (Doc.filtered doc) &&
                    Intmap.mem (Doc.num doc) and_map &&
                    not (
                      Intmap.mem (Doc.num doc) !map) then begin
(*                lprintf "add doc to AND map"; lprint_newline (); *)
                      map := Intmap.add (Doc.num doc) doc !map
                    end
                  else begin

(*                lprintf "AND failed"; lprint_newline (); *)
                      ()
                    end
              ) w.word_docs)  (match s.suffix_word with
            None -> s.suffix_words | Some w -> w :: s.suffix_words);
      in
      iter tree;
      !map
      
    let size node = 0
  end
  
  
module FullMake (Doc : Indexer.Doc) = Indexer.FullMake (Doc ) (Make)