File: mimics_pcre.cc

package info (click to toggle)
re2 20190101%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,832 kB
  • sloc: cpp: 20,111; makefile: 281; python: 281; sh: 246; perl: 224; ansic: 9
file content (187 lines) | stat: -rw-r--r-- 6,057 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Determine whether this library should match PCRE exactly
// for a particular Regexp.  (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
//   * the regexp contains a repetition of an empty string,
//     like (a*)* or (a*)+.  In this case, PCRE will treat
//     the repetition sequence as ending with an empty string,
//     while this library does not.
//   * Perl and PCRE differ on whether \v matches \n.
//     For historical reasons, this library implements the Perl behavior.
//   * Perl and PCRE allow $ in one-line mode to match either the very
//     end of the text or just before a \n at the end of the text.
//     This library requires it to match only the end of the text.
//   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
//     match the end of the text if the last character is a \n.
//     This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.

#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"

namespace re2 {

// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);

// Walker class to compute whether library handles a regexp
// exactly as PCRE would.  See comment at top for conditions.

class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
                 int nchild_args);

  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    return a;
  }
};

// Called after visiting each of re's children and accumulating
// the return values in child_args.  So child_args contains whether
// this library mimics PCRE for those subexpressions.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  // If children failed, so do we.
  for (int i = 0; i < nchild_args; i++)
    if (!child_args[i])
      return false;

  // Otherwise look for other reasons to fail.
  switch (re->op()) {
    // Look for repeated empty string.
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      if (CanBeEmptyString(re->sub()[0]))
        return false;
      break;
    case kRegexpRepeat:
      if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
        return false;
      break;

    // Look for \v
    case kRegexpLiteral:
      if (re->rune() == '\v')
        return false;
      break;

    // Look for $ in single-line mode.
    case kRegexpEndText:
    case kRegexpEmptyMatch:
      if (re->parse_flags() & Regexp::WasDollar)
        return false;
      break;

    // Look for ^ in multi-line mode.
    case kRegexpBeginLine:
      // No condition: in single-line mode ^ becomes kRegexpBeginText.
      return false;

    default:
      break;
  }

  // Not proven guilty.
  return true;
}

// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
  PCREWalker w;
  return w.Walk(this, true);
}


// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate.  For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can.  Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.

class EmptyStringWalker : public Regexp::Walker<bool> {
 public:
  EmptyStringWalker() { }
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    return a;
  }

 private:
  EmptyStringWalker(const EmptyStringWalker&) = delete;
  EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
};

// Called after visiting re's children.  child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string).  Returns whether this clause can match an
// empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:               // never empty
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      return false;

    case kRegexpEmptyMatch:            // always empty
    case kRegexpBeginLine:             // always empty, when they match
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpStar:                  // can always be empty
    case kRegexpQuest:
    case kRegexpHaveMatch:
      return true;

    case kRegexpConcat:                // can be empty if all children can
      for (int i = 0; i < nchild_args; i++)
        if (!child_args[i])
          return false;
      return true;

    case kRegexpAlternate:             // can be empty if any child can
      for (int i = 0; i < nchild_args; i++)
        if (child_args[i])
          return true;
      return false;

    case kRegexpPlus:                  // can be empty if the child can
    case kRegexpCapture:
      return child_args[0];

    case kRegexpRepeat:                // can be empty if child can or is x{0}
      return child_args[0] || re->min() == 0;
  }
  return false;
}

// Returns whether re can match an empty string.
static bool CanBeEmptyString(Regexp* re) {
  EmptyStringWalker w;
  return w.Walk(re, true);
}

}  // namespace re2