File: regexpreprocessor.cpp

package info (click to toggle)
source-highlight 3.1.8-1.2~deb9u1
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 10,224 kB
  • sloc: sh: 11,709; cpp: 10,226; ansic: 9,521; makefile: 1,902; lex: 1,200; yacc: 1,021; php: 213; perl: 211; awk: 98; erlang: 94; lisp: 90; java: 75; ruby: 69; python: 61; asm: 43; ada: 36; ml: 29; haskell: 27; xml: 23; cs: 11; sql: 8; tcl: 7; sed: 4
file content (429 lines) | stat: -rw-r--r-- 13,776 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
//
// C++ Implementation: RegexPreProcessor
//
// Description: performs operations or inspections on a string representing
// a valid regular expression
//
// Author: Lorenzo Bettini <http://www.lorenzobettini.it>, (C) 1999-2007
//
// Copyright: See COPYING file that comes with this distribution
//
//

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "regexpreprocessor.h"
#include <boost/regex.hpp>
#include <sstream>
#include <ctype.h>
#include <iostream>

using namespace std;

namespace srchilite {

// IMPORTANT: the following regular expressions assume that the
// regular expression they try to match is a valid regular expression

/// matches character sets in a regular expression
const boost::regex char_set_exp("\\[([^\\|]*)\\]");

/// substitute a "(" with "(?:" if it's not followed by a ? and not preceeded by \\ char
const boost::regex from("(\\\\\\()|(\\((?!\\?))");
/// substitute a "(" with "(?:" if it's not followed by a ? and not preceeded by \\ char
const string into = "(?1\\\\\\()(?2\\(\\?\\:)";

/// found actual marking parenthesis, i.e., not preceeded by \\ and not followed by ?
const boost::regex paren("(?<!\\\\)\\((?!\\?)");

/// regular expression matching a backreference, e.g., \1 or inside a conditional (?(1)...)
const boost::regex
        backreference("(\\\\([[:digit:]]))|(\\(\\?\\(([[:digit:]]))");

/// index of the subexpression matching a slashed backreference
#define SLASHED_BACKREFERENCE 1
/// index of the subexpression matching the number in a slashed backreference
#define SLASHED_BACKREFERENCE_NUM 2
/// index of the subexpression matching a conditional backreference
#define CONDITIONAL_BACKREFERENCE 3
/// index of the subexpression matching the number in a conditional backreference
#define CONDITIONAL_BACKREFERENCE_NUM 4

/** regular expression matching a reference, e.g., @{digit} the @$ must not be preceeded
 by an escape char */
const boost::regex reference_exp("(?<!\\\\)@\\{([[:digit:]])\\}");

/// substitute a special char (i.e., .[{()\*+?|^$)...
const boost::regex special_char("(\\.|\\[|\\]|\\{|\\}|\\(|\\)|\\\\|\\*|\\+|\\?|\\||\\^|\\$)");
/// ...with its escaped version
const string special_char_escape = "(?1\\\\\\1)";


/// index of the subexpression matching the number backreference
#define BACKREFERENCE_NUM 1

const string
        subexpressions_info::ERR_OUTER_UNMARKED = "unmarked subexpressions are allowed only inside marked subexpressions";
const string
        subexpressions_info::ERR_NESTED_SUBEXP = "subexpressions of subexpressions are not allowed";
const string
        subexpressions_info::ERR_UNBALANCED_PAREN = "unbalanced parenthesis";
const string
        subexpressions_info::ERR_OUTSIDE_SUBEXP = "parts outside marked subexpressions are not allowed";

RegexPreProcessor::RegexPreProcessor() {
}

RegexPreProcessor::~RegexPreProcessor() {
}

const string RegexPreProcessor::preprocess(const string &s) {
    // substitute a "(" with "(?:" if it's not followed by a ? and not preceeded by \\ char
    return boost::regex_replace(s, from, into, boost::match_default
            | boost::format_all);
}

const string _make_nonsensitive(const string &s) {
    ostringstream result;

    for (string::const_iterator it = s.begin(); it != s.end(); ++it)
        if (isalpha(*it))
            result << "["<< (char)toupper(*it) << (char)tolower(*it) << "]";
        else
            result << *it;

    return result.str();
}

const string RegexPreProcessor::make_nonsensitive(const string &s) {
    boost::sregex_iterator m1(s.begin(), s.end(), char_set_exp);
    boost::sregex_iterator m2;

    if (m1 == m2)
        return _make_nonsensitive(s);

    ostringstream buffer;
    string prefix;
    string suffix;

    for (boost::sregex_iterator it = m1; it != m2; ++it) {
        prefix = it->prefix();
        suffix = it->suffix();

        if (prefix.size()) {
            buffer << _make_nonsensitive(prefix);
        }

        buffer << (*it)[0];
    }

    if (suffix.size()) {
        buffer << _make_nonsensitive(suffix);
    }

    return buffer.str();
}

unsigned int RegexPreProcessor::num_of_subexpressions(const string &s) {
    boost::sregex_iterator m1(s.begin(), s.end(), paren);
    boost::sregex_iterator m2;

    int counter = 0;

    for (boost::sregex_iterator it = m1; it != m2; ++it) {
        ++counter;
    }

    return counter;
}

const subexpressions_strings *RegexPreProcessor::split_marked_subexpressions(
        const string &s) {
    boost::sregex_iterator m1(s.begin(), s.end(), paren);
    boost::sregex_iterator m2;

    // we don't need to parse it (we can use the regex) since we assume that
    // the regular expression represented by s is made up of only
    // marked subexpressions and no nested subexpressions and char outside subexpressions

    subexpressions_strings *split = new subexpressions_strings;

    for (boost::sregex_iterator it = m1; it != m2;) {
        string prefix = it->prefix();
        if (prefix.size())
            split->push_back("(" + prefix);

        string suffix = it->suffix();
        if (++it == m2)
            split->push_back("(" + suffix);
    }

    return split;
}

subexpressions_info RegexPreProcessor::num_of_marked_subexpressions(
        const string &s, bool allow_outer_char, bool allow_outer_nonmarked) {
    subexpressions_info sexps;

    // number of open parenthesis
    int open_paren_num = 0;
    // whether we're inside a marked subexpressions
    bool found_marked_subexp = false;
    // whether we're inside a char sequence [...]
    bool char_sequence_state = false;
    // whether we're inside a char class name, e.g., [:alpha:]
    bool char_class_name_state = false;
    // len of string
    int len = s.size();
    // char we're examining
    char c;

    for (int i = 0; i < len; ++i) {
        c = s[i];
        if (!char_sequence_state && c == '[' && (i == 0 || s[i-1] != '\\') ) {
            char_sequence_state = true;
        } else if (char_sequence_state) {
            // here i cannot be 0
            if (char_class_name_state) {
                if (c == ']' && s[i-1] == ':') {
                    // exit the char class name state
                    char_class_name_state = false;
                }
            } else if (!char_class_name_state && c == '[' && s[i-1] != '\\'
                && (i+1) < len && s[i+1] == ':') {
                char_class_name_state = true;
            } else if (c == ']' && s[i-1] != '\\') {
                // exit the char sequence state
                char_sequence_state = false;
            }
            // else simply go on skipping chars
        } else if (c == '\\' && (i+1) < len && (s[i+1] == '('|| s[i+1] == ')')) {
            // skip the escaped paren
            ++i;
        } else if (c == '(') {
            // we found a subexp
            ++open_paren_num;

            if ((i+1) < len && s[i+1] == '?') {
                if (!found_marked_subexp && !allow_outer_nonmarked) {
                    // outer subexpressions must be marked
                    sexps.errors = subexpressions_info::ERR_OUTER_UNMARKED;
                    return sexps;
                }
            } else {
                // it's a marked subexp
                if (found_marked_subexp) {
                    // we don't allow nested subexpressions
                    sexps.errors = subexpressions_info::ERR_NESTED_SUBEXP;
                    return sexps;
                }

                // we found a marked subexp
                found_marked_subexp = true;
                ++(sexps.marked);
            }
        } else if (c == ')') {
            if (!open_paren_num) {
                // unbalanced parenthesis
                sexps.errors = subexpressions_info::ERR_UNBALANCED_PAREN;
                return sexps;
            }

            --open_paren_num;

            // end of marked subexp
            if (!open_paren_num && found_marked_subexp)
                found_marked_subexp = false;
        } else {
            // we don't allow non marked parts
            if (!found_marked_subexp && !allow_outer_char) {
                sexps.errors = subexpressions_info::ERR_OUTSIDE_SUBEXP;
                return sexps;
            }
        }
    }

    // check that all paren are closed
    if (open_paren_num)
        sexps.errors = subexpressions_info::ERR_UNBALANCED_PAREN;

    return sexps;
}

bool RegexPreProcessor::contains_backreferences(const std::string &s) {
    return boost::regex_search(s, backreference);
}

backreference_info RegexPreProcessor::num_of_backreferences(const string &s) {
    boost::sregex_iterator m1(s.begin(), s.end(), backreference);
    boost::sregex_iterator m2;

    backreference_info info(std::make_pair(0, 0));

    for (boost::sregex_iterator it = m1; it != m2; ++it) {
        ++(info.first);
        // for converting a string into the number
        stringstream buffer;
        int backreference_num;

        if ((*it)[SLASHED_BACKREFERENCE].matched) {
            buffer << (*it)[SLASHED_BACKREFERENCE_NUM];
        } else {
            buffer << (*it)[CONDITIONAL_BACKREFERENCE_NUM];
        }

        // retrieve the number of the backreference
        buffer >> backreference_num;
        if (backreference_num > info.second) {
            // we update the number of the highest backreference
            info.second = backreference_num;
        }
    }

    return info;
}

backreference_info RegexPreProcessor::num_of_references(const string &s) {
    boost::sregex_iterator m1(s.begin(), s.end(), reference_exp);
    boost::sregex_iterator m2;

    backreference_info info(std::make_pair(0, 0));

    for (boost::sregex_iterator it = m1; it != m2; ++it) {
        ++(info.first);
        // for converting a string into the number
        stringstream buffer;
        int backreference_num;

        // the digit of the reference
        buffer << (*it)[BACKREFERENCE_NUM];

        // retrieve the number of the backreference
        buffer >> backreference_num;
        if (backreference_num > info.second) {
            // we update the number of the highest backreference
            info.second = backreference_num;
        }
    }

    return info;
}

const std::string RegexPreProcessor::replace_backreferences(
        const std::string &original, const backreference_replacements &replace) {
    boost::sregex_iterator m1(original.begin(), original.end(), backreference);
    boost::sregex_iterator m2;

    if (m1 == m2) {
        // no backreference in the current string
        return original;
    }

    ostringstream result;

    for (boost::sregex_iterator it = m1; it != m2;) {
        string prefix = it->prefix();
        if (prefix.size())
            result << prefix;

        // for converting a string into the number
        stringstream buffer;
        int backreference_num;

        // substitute the backreference number with the corresponding replace
        // string
        if ((*it)[SLASHED_BACKREFERENCE].matched) {
            buffer << (*it)[SLASHED_BACKREFERENCE_NUM];
            // retrieve the number of the backreference
            buffer >> backreference_num;

            result << replace[backreference_num-1];
        } else {
            buffer << (*it)[CONDITIONAL_BACKREFERENCE_NUM];
            // retrieve the number of the backreference
            buffer >> backreference_num;

            result << "(?("<< replace[backreference_num-1];
        }

        string suffix = it->suffix();
        if (++it == m2)
            result << suffix;
    }

    return result.str();
}

const std::string RegexPreProcessor::replace_backreferences(
        const std::string &original, const regex_match_results &results) {
    // build the replacement vector
    backreference_replacements replace(9);

    // fill it with matched results
    for (unsigned i = 1; i < results.size(); ++i) {
        // remember that 0 is the whole expressions
        replace[i-1] = results[i];
    }

    return replace_backreferences(original, replace);
}

const string RegexPreProcessor::replace_references(
        const std::string &original, const backreference_replacements &replace) {
    boost::sregex_iterator m1(original.begin(), original.end(), reference_exp);
    boost::sregex_iterator m2;

    if (m1 == m2) {
        // no backreference in the current string
        return original;
    }

    ostringstream result;

    for (boost::sregex_iterator it = m1; it != m2;) {
        string prefix = it->prefix();
        if (prefix.size())
            result << prefix;

        // for converting a string into the number
        stringstream buffer;
        int backreference_num;

        buffer << (*it)[BACKREFERENCE_NUM];

        // retrieve the number of the backreference
        buffer >> backreference_num;

        // substitute the backreference number with the corresponding replace
        // string, after escaping non-alphanumerical characters
        // this is necessary since this string will be used to create a regular
        // expression
        result << boost::regex_replace(replace[backreference_num-1],
                special_char, special_char_escape, boost::match_default
                | boost::format_all);

        string suffix = it->suffix();
        if (++it == m2)
            result << suffix;
    }

    return result.str();
}

const string RegexPreProcessor::replace_references(
        const std::string &original, const regex_match_results &results) {
    // build the replacement vector
    backreference_replacements replace(9);

    // fill it with matched results
    for (unsigned i = 1; i < results.size(); ++i) {
        // remember that 0 is the whole expressions
        replace[i-1] = results[i];
    }

    return replace_references(original, replace);
}

}