Package: grep / 2.20-4.1

0001-grep-P-invalid-utf8-non-matching-debian.patch

From 16fc7fa0e0f273fa81e382b4303617d707364902 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Tue, 9 Sep 2014 12:41:54 -0700
Subject: [PATCH 1/2] grep: -P now treats invalid UTF-8 input as non-matching

Problem reported by Santiago Vila in: http://bugs.gnu.org/18266
* NEWS: Mention this.
* src/pcresearch.c (Pexecute): Treat UTF-8 encoding errors
as non-matching data, instead of exiting 'grep'.
* tests/pcre-infloop: grep now exits with status 1, not 2.
* tests/pcre-invalid-utf8-input: grep now exits with status 0, not 2.
---
 NEWS                          |  3 ++
 src/pcresearch.c              | 70 +++++++++++++++++--------------------------
 tests/pcre-infloop            |  2 +-
 tests/pcre-invalid-utf8-input |  2 +-
 4 files changed, 33 insertions(+), 44 deletions(-)

--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,9 @@
 
 * Noteworthy changes in release 2.20 (2014-06-03) [stable]
 
+  grep -P no longer reports an error and exits when given invalid UTF-8 data.
+  Instead, it considers the data to be non-matching.
+
 ** Bug fixes
 
   grep --max-count=N FILE would no longer stop reading after the Nth match.
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -207,34 +207,43 @@
 #else
   /* This array must have at least two elements; everything after that
      is just for performance improvement in pcre_exec.  */
-  int sub[300];
+  enum { nsub = 300 };
+  int sub[nsub];
 
-  const char *line_buf, *line_end, *line_next;
+  char const *p = start_ptr ? start_ptr : buf;
+  int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+  char const *line_start = buf;
   int e = PCRE_ERROR_NOMATCH;
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+  char const *line_end;
 
   /* PCRE can't limit the matching to single lines, therefore we have to
      match each line in the buffer separately.  */
-  for (line_next = buf;
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
-       start_ofs -= line_next - line_buf)
+  for (; p < buf + size; p = line_start = line_end + 1)
     {
-      line_buf = line_next;
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
-      if (line_end == NULL)
-        line_next = line_end = buf + size;
-      else
-        line_next = line_end + 1;
+      line_end = memchr (p, eolbyte, buf + size - p);
 
-      if (start_ptr && start_ptr >= line_end)
-        continue;
-
-      if (INT_MAX < line_end - line_buf)
+      if (INT_MAX < line_end - p)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
-                     start_ofs < 0 ? 0 : start_ofs, 0,
-                     sub, sizeof sub / sizeof *sub);
+      /* Treat encoding-error bytes as data that cannot match.  */
+      for (;;)
+        {
+          int valid_bytes;
+          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
+          if (e != PCRE_ERROR_BADUTF8)
+            break;
+          valid_bytes = sub[0];
+          e = pcre_exec (cre, extra, p, valid_bytes, 0,
+                         options | PCRE_NO_UTF8_CHECK, sub, nsub);
+          if (e != PCRE_ERROR_NOMATCH)
+            break;
+          p += valid_bytes + 1;
+          options = PCRE_NOTBOL;
+        }
+
+      if (e != PCRE_ERROR_NOMATCH)
+        break;
+      options = 0;
     }
 
   if (e <= 0)
@@ -251,10 +260,6 @@
           error (EXIT_TROUBLE, 0,
                  _("exceeded PCRE's backtracking limit"));
 
-        case PCRE_ERROR_BADUTF8:
-          error (EXIT_TROUBLE, 0,
-                 _("invalid UTF-8 byte sequence in input"));
-
         default:
           /* For now, we lump all remaining PCRE failures into this basket.
              If anyone cares to provide sample grep usage that can trigger
@@ -268,25 +273,8 @@
     }
   else
     {
-      /* Narrow down to the line we've found.  */
-      char const *beg = line_buf + sub[0];
-      char const *end = line_buf + sub[1];
-      char const *buflim = buf + size;
-      char eol = eolbyte;
-      if (!start_ptr)
-        {
-          /* FIXME: The case when '\n' is not found indicates a bug:
-             Since grep is line oriented, the match should never contain
-             a newline, so there _must_ be a newline following.
-           */
-          if (!(end = memchr (end, eol, buflim - end)))
-            end = buflim;
-          else
-            end++;
-          while (buf < beg && beg[-1] != eol)
-            --beg;
-        }
-
+      char const *beg = start_ptr ? p + sub[0] : line_start;
+      char const *end = start_ptr ? p + sub[1] : line_end + 1;
       *match_size = end - beg;
       return beg - buf;
     }
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@
 fail=0
 
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
 
 Exit $fail
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -16,6 +16,6 @@
 printf 'j\202\nj\n' > in || framework_failure_
 
 LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+test $? -eq 0 || fail=1
 
 Exit $fail