File: HtWordCodec.cc

package info (click to toggle)
htdig 1%3A3.2.0b6-21
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 21,292 kB
  • sloc: ansic: 49,632; cpp: 46,468; sh: 17,400; xml: 4,180; perl: 2,543; makefile: 888; php: 79; asm: 14
file content (437 lines) | stat: -rw-r--r-- 12,104 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
//
// HtWordCodec.cc
//
// HtWordCodec: Given two lists of paired "words", 'from' and 'to',
//              that define simple one-to-one translations, use those
//              lists to translate.  The only restrictions are that no
//              null (0) characters may be used in "words", and that
//              there is a "joiner" character that does not appear in
//              any word.  One-to-one consistency may be checked at
//              construction.
//
// Part of the ht://Dig package   <https://htdig.sourceforge.net/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later 
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: HtWordCodec.cc,v 1.9 2004/05/28 13:15:21 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "HtWordCodec.h"

// Reserved byte values for the internal encodings.
//
// Do not use 0, so we can use "normal" string routines.
// Values 1..4 are used to describe how many bytes are used to
// keep the number.  Use only control characters as the first
// character of internal encodings, so the user can use
// "international" characters (128 .. 255) for cute encodings
// to use across different configuration files and databases.

// Separator used when joining a word list into a StringMatch pattern.
#define JOIN_CHAR 5
// Prefix that turns a "to" string into its quoted form.
#define QUOTE_CHAR 6
// Range of the one-byte internal encodings, which are handed out
// first to the frequent substrings.
#define FIRST_INTERNAL_SINGLECHAR 7
#define LAST_INTERNAL_SINGLECHAR 31


// Construct an empty codec: no translation lists and no matchers.
HtWordCodec::HtWordCodec()
{
  myFrom = myTo = 0;
  myFromMatch = myToMatch = 0;
}


// Release the translation lists and the matchers held by this codec.
HtWordCodec::~HtWordCodec()
{
  // Deleting a null pointer is a no-op, so no guards are needed.
  delete myFrom;
  delete myTo;
  delete myFromMatch;
  delete myToMatch;
}


// Straightforward filling of the encoding-lists: the supplied lists
// are stored as they are, without any consistency checking.
//
// Note that the pointers themselves are kept in myFrom and myTo, and
// the destructor deletes those members, so the lists passed in here
// are released together with the codec.
HtWordCodec::HtWordCodec(StringList *from, StringList *to, char joiner)
{
  myFromMatch = new StringMatch;
  myToMatch = new StringMatch;

  myFrom = from;
  myTo = to;

  // A string initialized with Join is not null-terminated by itself;
  // that is taken care of through "operator char*" when the pattern
  // is handed over.
  String joined_to(myTo->Join(joiner));
  myToMatch->Pattern(joined_to, joiner);

  String joined_from(myFrom->Join(joiner));
  myFromMatch->Pattern(joined_from, joiner);
}


// This constructor is the most complicated function in this class.
// It builds the translation lists and handles consistency checking
// for the supplied code-lists.
//
//   requested_encodings  alternating "from", "to" strings; the count
//                        must be even.
//   frequent_substrings  strings that are given an internally
//                        generated encoding.
//   errmsg               receives a description of the problem on
//                        failure, and is cleared (assigned 0) on
//                        success.
//
// No explicit cleanup is needed on the error returns: whatever has
// been stored in the members is deleted by the destructor.  The
// member myFromMatch is used as a sanity check for member functions
// to see that the constructor was successful in case the programmer
// forgets to check errmsg; it is therefore only allocated once all
// checks have passed.
HtWordCodec::HtWordCodec(StringList &requested_encodings,
                         StringList &frequent_substrings,
                         String &errmsg)
{
  // Give every member a defined value before anything can fail, so
  // that the destructor and the myFromMatch sanity check are
  // well-defined after any of the early error returns below.
  myFrom = 0;
  myTo = 0;
  myFromMatch = 0;
  myToMatch = 0;

  if ((requested_encodings.Count() % 2) != 0)
  {
    errmsg =
      "Expected pairs, got odd number of strings";

    return;
  }

  myFrom = new StringList;
  myTo = new StringList;

  // Go through requested_encodings and fill myTo and myFrom.
  // Check that the strings look remotely sane regarding
  // reserved characters.

  // Iteration temporaries.
  String *from;
  String *to;

  int n_of_pairs = requested_encodings.Count() / 2;

  requested_encodings.Start_Get();
  while ((from = (String *) requested_encodings.Get_Next()) != NULL)
  {
    // Sanity check: Reserve empty strings as we cannot do
    // anything sane with them.

    int templen = from->length();
    if (templen == 0)
    {
      errmsg = "Empty strings are not allowed";
      return;
    }

    myFrom->Add(new String(*from));

    // This must be non-null since we checked "oddness" above.
    to = (String *) requested_encodings.Get_Next();

    templen = to->length();
    if (templen == 0)
    {
      errmsg = "Empty strings are not allowed";
      return;
    }

    // Check that there's no JOIN_CHAR in the "to" string.  Since
    // no "to" is allowed to be part of any other "to", there will
    // be no ambiguity, even if one would contain a QUOTE_CHAR
    // (which is documented as invalid anyway).
    if (strchr(to->get(), JOIN_CHAR) != NULL)
    {
      errmsg =
        form("(\"%s\" =>) \"%s\" contains a reserved character (number %d)",
             from->get(), to->get(), int(JOIN_CHAR));
      return;
    }

    // The "from" strings are joined with JOIN_CHAR into a search
    // pattern further down as well, so they must not contain it
    // either.
    if (strchr(from->get(), JOIN_CHAR) != NULL)
    {
      errmsg =
        form("\"%s\" (=> \"%s\") contains a reserved character (number %d)",
             from->get(), to->get(), int(JOIN_CHAR));
      return;
    }

    // Loop over the other "to"-strings and check that this
    // string is not a substring of any other "to", or vice versa.
    // Return in error if it is so.
    int i;
    int count = myTo->Count();
    for (i = 0; i < count; i++)
    {
      String *ith = (String *) myTo->Nth(i);

      // Just check if the shorter string is part of the
      // longer string.
      if (to->length() < ith->length()
          ? ith->indexOf(to->get()) != -1
          : to->indexOf(ith->get()) != -1)
      {
        // Pass character data, not the String objects themselves,
        // to the "%s" conversions.
        errmsg =
          form("\"%s\" => \"%s\" collides with (\"%s\" => \"%s\")",
               from->get(), to->get(), (*myFrom)[i], ith->get());

        return;
      }
    }

    // All ok, just add this one.
    myTo->Add(new String(*to));
  }

  // Check that none of the "to"-strings is a substring of any
  // of the "from" strings, since that's hard to support and
  // most probably is a user mistake anyway.

  StringMatch req_tos;
  String req_to_pattern(myTo->Join(JOIN_CHAR));
  int which, length;

  // The StringMatch functions want the strings
  // zero-terminated, which is done through "operator char*".
  req_tos.Pattern(req_to_pattern, JOIN_CHAR);

  // Check the requested encodings.
  if (n_of_pairs != 0)
  {
    int i;
    for (i = 0; i < n_of_pairs; i++)
    {
      from = (String *) myFrom->Nth(i);
      if (req_tos.FindFirst(from->get(), which, length) != -1)
      {
        if (i != which)
        {
          errmsg =
            form("(\"%s\" => \"%s\") overlaps (\"%s\" => \"%s\")",
                 (*myFrom)[which], (*myTo)[which],
                 from->get(), (*myTo)[i]);
        }
        else
        {
          errmsg =
            form("Overlap in (\"%s\" => \"%s\")",
                 from->get(), (*myTo)[i]);
        }

        return;
      }
    }
  }

  if (frequent_substrings.Count() != 0)
  {
    // Make a temporary search-pattern of the requested
    // from-strings.

    StringMatch req_froms;
    String req_from_pattern(myFrom->Join(JOIN_CHAR));

    req_froms.Pattern(req_from_pattern, JOIN_CHAR);

    // Continue filling "to" and "from" from frequent_substrings and
    // internal encodings.  If a frequent_substring is found in the
    // requested from-strings, it is ignored, but the internal
    // encoding is still ticked up, so that changes in
    // requested_encodings (e.g. url_part_aliases) do not change
    // an existing database (e.g. containing common_url_parts).

    int internal_encoding_no = 0;

    String *common_part;
    frequent_substrings.Start_Get();
    String to;

    for (;
         (common_part = (String *) frequent_substrings.Get_Next()) != NULL;
         internal_encoding_no++)
    {
      int templen = common_part->length();
      if (templen == 0)
      {
        errmsg = "Empty strings are not allowed";
        return;
      }

      // Is a "From" string in it, or is a "To" string in it?
      //  Note that checking if there are *any* requested
      // encodings (n_of_pairs) is not just an "optimization";
      // it is necessary since StringMatch will return 0 (not
      // -1) if the pattern is empty (FIXME: changing that
      // breaks something else in another part of ht://Dig).

      if (n_of_pairs
          && (req_froms.FindFirst(common_part->get()) != -1
              || req_tos.FindFirst(common_part->get()) != -1))
        continue;

      to = 0;                   // Clear previous run.

      // Dream up an encoding without zeroes.
      // Use FIRST_INTERNAL_SINGLECHAR .. LAST_INTERNAL_SINGLECHAR
      // for the first encodings, as much as possible.

      long int number_to_store =
        internal_encoding_no + FIRST_INTERNAL_SINGLECHAR;

      if (number_to_store <= LAST_INTERNAL_SINGLECHAR)
      {
        to << char(number_to_store);
      }
      else
      {
        // Use <number-of-bytes-in-length>
        // <number-as-nonzero-bytes> to code the rest.
        //  Note that we assume eight-bit chars here, which
        // should be ok for all systems you run htdig on.
        // At least it helps clarity here.

        number_to_store -= LAST_INTERNAL_SINGLECHAR;

        // Make sure highest bit in every byte is "1" by
        // inserting one there.
        char to_store[sizeof(number_to_store)+1];
        int j = 1;

        while (number_to_store > 0x7f)
        {
          number_to_store = ((number_to_store & ~0x7f) << 1)
            | 0x80 | (number_to_store & 0x7f);

          to_store[j++] = char(number_to_store);
          number_to_store >>= 8;
        }

        // Finally, store the highest byte.  It too shall have
        // the highest bit set.  This is the easiest way to
        // adjust it not to be QUOTE_CHAR.
        to_store[0] = j;
        to_store[j] = char(number_to_store | 0x80);

        to.append(to_store, j+1);
      }

      // Add to replacement pairs.
      myFrom->Add(new String(*common_part));
      myTo->Add(new String(to));
    }
  }

  // Now, add the quoted "to":s to the "to"-list, with the unquoted
  // "to":s to the "from"-list.  This way we do not have to
  //  check for quoting separately.  Like this:
  // From  To
  // foo : !
  // bar : >
  // baz : $
  // !   : \!
  // >   : \>
  // $   : \$
  //
  // Since we checked that none of the "To":s are in a "From" we
  // can do this.

  myTo->Start_Get();
  int to_count = myTo->Count();
  String *current;
  String temp;

  int i;
  for (i = 0; i < to_count; i++)
  {
    // It works to append *and* iterate through a
    // StringList, despite not having an iterator class.
    current = (String *) myTo->Nth(i);

    myFrom->Add(new String(*current));

    temp = 0; // Reset any previous round.
    temp.append(char(QUOTE_CHAR));
    temp.append(*current);

    myTo->Add(new String(temp));
  }

  String to_pattern(myTo->Join(JOIN_CHAR));
  String from_pattern(myFrom->Join(JOIN_CHAR));

  // StringMatch class has unchecked limits, better check them.
  // The summed length of the strings in the pattern (that is, the
  // pattern length minus the separators) is an upper limit of the
  // needs.
  if (to_pattern.length() - (myTo->Count() - 1) > 0xffff
      || from_pattern.length() - (myFrom->Count() - 1) > 0xffff)
  {
    errmsg = "Limit reached; use fewer encodings";
    return;
  }

  // Everything checked out; only now create the matchers, so that a
  // non-null myFromMatch really means a successfully built codec.
  myFromMatch = new StringMatch;
  myToMatch = new StringMatch;

  myToMatch->Pattern(to_pattern, JOIN_CHAR);
  myFromMatch->Pattern(from_pattern, JOIN_CHAR);

  errmsg = 0;
}


// We only need one "coding" function, since quoting and unquoting is
// handled through the to- and from-lists.
//
// Returns a copy of orig_string in which every occurrence found by
// "match" is replaced by the entry of "replacements" with the same
// index as the matched pattern.  An empty string is returned if the
// codec was not successfully constructed, and an unchanged copy of
// orig_string is returned if "replacements" is empty.
String
HtWordCodec::code(const String &orig_string, StringMatch &match,
                  StringList &replacements) const
{
  String retval;

  // Sanity check.  If bad use, just return empty strings.
  if (myFromMatch == NULL)
  {
    return retval;
  }

  // Need to check if "replacements" is empty; that is, if no
  // transformations should be done.  FindFirst() does not return
  // -1 in this case, it returns 0.
  if (replacements.Count() == 0)
    return orig_string;

  // Get a null-terminated string, usable for FindFirst to look at.
  const char *orig = orig_string.get();
  int offset, which, length;

  // Find the encodings and replace them.
  while ((offset = match.FindFirst(orig, which, length)) != -1)
  {
    // Append the previous part that was not part of a code.
    retval.append(orig, offset);

    // Replace with the corresponding string from "replacements".
    retval.append(replacements[which]);

    // Continue searching right after the match.
    orig += offset + length;
  }

  // Add the final non-matched part.
  retval.append(orig);

  return retval;
}


// Translate an encoded string back: search for the "to" strings and
// replace them with the corresponding "from" strings.  The asymmetry
// with respect to encode() is caused by swapping both the matching
// and replacement lists.
String
HtWordCodec::decode(const String &orig) const
{
  // Do not form references from null pointers; an unusable codec
  // yields an empty string, just as the sanity check in code() does.
  if (myToMatch == NULL || myFrom == NULL)
    return String();

  return code(orig, *myToMatch, *myFrom);
}


// Translate a string: search for the "from" strings and replace them
// with the corresponding "to" strings.
String
HtWordCodec::encode(const String &orig) const
{
  // Do not form references from null pointers; an unusable codec
  // yields an empty string, just as the sanity check in code() does.
  if (myFromMatch == NULL || myTo == NULL)
    return String();

  return code(orig, *myFromMatch, *myTo);
}

// End of HtWordCodec.cc