Package: glib2.0 / 2.42.1-1

regex-if-PCRE-is-8.34-or-later-disable-auto-possessi.patch Patch series | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
From e0c0836a09cac0cccfbc1c207bed7a7af1fb63fa Mon Sep 17 00:00:00 2001
From: Simon McVittie <simon.mcvittie@collabora.co.uk>
Date: Tue, 22 Jul 2014 09:33:39 +0100
Subject: [PATCH 4/4] regex: if PCRE is 8.34 or later, disable
 auto-possessification for DFA

Normally, recent PCRE behaves as if certain patterns were replaced
by a more "possessive" pattern that gives the same answer for normal
regex matching, but is more efficient. However, the modified pattern
produces fewer results under DFA. If we want the full set of results
we have to apply PCRE_NO_AUTO_POSSESS, and that flag can only be set
when the pattern is compiled, so the DFA path recompiles the regex.

This currently only affects a system PCRE, but would also work fine for
an internal PCRE 8.34 or later if the embedded copy is updated.

Bug: https://bugzilla.gnome.org/show_bug.cgi?id=733325
---
 glib/gregex.c | 122 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 33 deletions(-)

diff --git a/glib/gregex.c b/glib/gregex.c
index 41bf67e..1022dfb 100644
--- a/glib/gregex.c
+++ b/glib/gregex.c
@@ -1267,6 +1267,15 @@ g_regex_unref (GRegex *regex)
     }
 }
 
+/*
+ * @match_options: (inout) (optional):
+ */
+static pcre *regex_compile (const gchar         *pattern,
+                            GRegexCompileFlags   compile_options,
+                            GRegexCompileFlags  *compile_options_out,
+                            GRegexMatchFlags    *match_options,
+                            GError             **error);
+
 /**
  * g_regex_new:
  * @pattern: the regular expression
@@ -1291,12 +1300,8 @@ g_regex_new (const gchar         *pattern,
   GRegex *regex;
   pcre *re;
   const gchar *errmsg;
-  gint erroffset;
-  gint errcode;
   gboolean optimize = FALSE;
   static volatile gsize initialised = 0;
-  unsigned long int pcre_compile_options;
-  GRegexCompileFlags nonpcre_compile_options;
 
   g_return_val_if_fail (pattern != NULL, NULL);
   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
@@ -1325,13 +1330,61 @@ g_regex_new (const gchar         *pattern,
       return NULL;
     }
 
-  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
-
   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
   if (compile_options & G_REGEX_OPTIMIZE)
     optimize = TRUE;
 
+  re = regex_compile (pattern, compile_options, &compile_options,
+                      &match_options, error);
+
+  if (re == NULL)
+    return NULL;
+
+  regex = g_new0 (GRegex, 1);
+  regex->ref_count = 1;
+  regex->pattern = g_strdup (pattern);
+  regex->pcre_re = re;
+  regex->compile_opts = compile_options;
+  regex->match_opts = match_options;
+
+  if (optimize)
+    {
+      regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
+      if (errmsg != NULL)
+        {
+          GError *tmp_error = g_error_new (G_REGEX_ERROR,
+                                           G_REGEX_ERROR_OPTIMIZE,
+                                           _("Error while optimizing "
+                                             "regular expression %s: %s"),
+                                           regex->pattern,
+                                           errmsg);
+          g_propagate_error (error, tmp_error);
+
+          g_regex_unref (regex);
+          return NULL;
+        }
+    }
+
+  return regex;
+}
+
+static pcre *
+regex_compile (const gchar         *pattern,
+               GRegexCompileFlags   compile_options,
+               GRegexCompileFlags  *compile_options_out,
+               GRegexMatchFlags    *match_options,
+               GError             **error)
+{
+  pcre *re;
+  const gchar *errmsg;
+  gint erroffset;
+  gint errcode;
+  GRegexCompileFlags nonpcre_compile_options;
+  unsigned long int pcre_compile_options;
+
+  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
+
   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
    * instead uses UTF-8 only if required with PCRE_UTF8. */
   if (compile_options & G_REGEX_RAW)
@@ -1343,7 +1396,9 @@ g_regex_new (const gchar         *pattern,
     {
       /* enable utf-8 */
       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
-      match_options |= PCRE_NO_UTF8_CHECK;
+
+      if (match_options != NULL)
+        *match_options |= PCRE_NO_UTF8_CHECK;
     }
 
   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
@@ -1408,32 +1463,10 @@ g_regex_new (const gchar         *pattern,
         compile_options |= G_REGEX_DUPNAMES;
     }
 
-  regex = g_new0 (GRegex, 1);
-  regex->ref_count = 1;
-  regex->pattern = g_strdup (pattern);
-  regex->pcre_re = re;
-  regex->compile_opts = compile_options;
-  regex->match_opts = match_options;
+  if (compile_options_out != 0)
+    *compile_options_out = compile_options;
 
-  if (optimize)
-    {
-      regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
-      if (errmsg != NULL)
-        {
-          GError *tmp_error = g_error_new (G_REGEX_ERROR,
-                                           G_REGEX_ERROR_OPTIMIZE,
-                                           _("Error while optimizing "
-                                             "regular expression %s: %s"),
-                                           regex->pattern,
-                                           errmsg);
-          g_propagate_error (error, tmp_error);
-
-          g_regex_unref (regex);
-          return NULL;
-        }
-    }
-
-  return regex;
+  return re;
 }
 
 /**
@@ -1873,6 +1906,7 @@ g_regex_match_all_full (const GRegex      *regex,
 {
   GMatchInfo *info;
   gboolean done;
+  pcre *pcre_re;
 
   g_return_val_if_fail (regex != NULL, FALSE);
   g_return_val_if_fail (string != NULL, FALSE);
@@ -1880,6 +1914,24 @@ g_regex_match_all_full (const GRegex      *regex,
   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
 
+#ifdef PCRE_NO_AUTO_POSSESS
+  /* For PCRE >= 8.34 we must set PCRE_NO_AUTO_POSSESS: auto-possessification
+   * is an optimization for normal regex matching, but results in omitting
+   * some shorter matches here, and an observable behaviour change.
+   *
+   * DFA matching is rather niche, and very rarely used according to
+   * codesearch.debian.net, so don't bother caching the recompiled RE. */
+  pcre_re = regex_compile (regex->pattern,
+                           regex->compile_opts | PCRE_NO_AUTO_POSSESS,
+                           NULL, NULL, error);
+
+  if (pcre_re == NULL)
+    return FALSE;
+#else
+  /* For PCRE < 8.34 the precompiled regex is fine. */
+  pcre_re = regex->pcre_re;
+#endif
+
   info = match_info_new (regex, string, string_len, start_position,
                          match_options, TRUE);
 
@@ -1887,7 +1939,7 @@ g_regex_match_all_full (const GRegex      *regex,
   while (!done)
     {
       done = TRUE;
-      info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
+      info->matches = pcre_dfa_exec (pcre_re, regex->extra,
                                      info->string, info->string_len,
                                      info->pos,
                                      regex->match_opts | match_options,
@@ -1917,6 +1969,10 @@ g_regex_match_all_full (const GRegex      *regex,
         }
     }
 
+#ifdef PCRE_NO_AUTO_POSSESS
+  pcre_free (pcre_re);
+#endif
+
   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
   info->pos = -1;
 
-- 
2.0.1