File: bash53-008.diff

package info (click to toggle)
bash 5.3-2
  • links: PTS
  • area: main
  • in suites: sid
  • size: 44,432 kB
  • sloc: ansic: 134,747; sh: 8,866; yacc: 5,966; makefile: 4,697; perl: 4,105; asm: 48; awk: 23; sed: 16
file content (180 lines) | stat: -rw-r--r-- 4,886 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
			     BASH PATCH REPORT
			     =================

Bash-Release:	5.3
Patch-ID:	bash53-008

Bug-Reported-by:	Grisha Levit <grishalevit@gmail.com>
Bug-Reference-ID:	<20251022174207.10518-1-grishalevit@gmail.com>
Bug-Reference-URL:	https://lists.gnu.org/archive/html/bug-bash/2025-10/msg00145.html

Bug-Description:

When looking for backslash escapes in $'...' strings, bash tries to consume
entire multibyte characters, and in UTF-8 locales it treats too many bytes as
potentially beginning a multibyte character. Being more selective about when
to call mbrtowc() speeds up string processing and, in turn, scripts. This
patch also handles the unlikely situation of a locale encoding null wide
characters with non-null bytes.

--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -55,7 +55,7 @@ ansicstr (const char *string, size_t len
   const char *s;
   unsigned long v;
   size_t clen;
-  int mb_cur_max;
+  size_t mb_cur_max;
 #if defined (HANDLE_MULTIBYTE)
   wchar_t wc;
 #endif
@@ -63,7 +63,7 @@ ansicstr (const char *string, size_t len
   if (string == 0 || *string == '\0')
     return ((char *)0);
 
-  mb_cur_max = MB_CUR_MAX;
+  mb_cur_max = locale_mb_cur_max;
 #if defined (HANDLE_MULTIBYTE)
   temp = 4*len + 4;
   if (temp < 12)
@@ -79,10 +79,14 @@ ansicstr (const char *string, size_t len
 	{
 	  clen = 1;
 #if defined (HANDLE_MULTIBYTE)
-	  if ((locale_utf8locale && (c & 0x80)) ||
-	      (locale_utf8locale == 0 && mb_cur_max > 0 && is_basic (c) == 0))
+	  /* We read an entire multibyte character at a time if we are in a
+	     locale where a backslash can possibly appear as part of a
+	     multibyte character. UTF-8 encodings prohibit this. */
+	  if (locale_utf8locale == 0 && mb_cur_max > 1 && is_basic (c) == 0)
 	    {
 	      clen = mbrtowc (&wc, s - 1, mb_cur_max, 0);
+	      if (MB_NULLWCH (clen))
+		break;			/* it apparently can happen */
 	      if (MB_INVALIDCH (clen))
 		clen = 1;
 	    }
@@ -227,30 +231,24 @@ ansic_quote (const char *str, int flags,
 {
   char *r, *ret;
   const char  *s;
-  size_t l, rsize;
   unsigned char c;
+#if defined (HANDLE_MULTIBYTE)
   size_t clen;
   int b;
-#if defined (HANDLE_MULTIBYTE)
   wchar_t wc;
+  DECLARE_MBSTATE;
 #endif
 
   if (str == 0 || *str == 0)
     return ((char *)0);
 
-  l = strlen (str);
-  rsize = 4 * l + 4;
-  r = ret = (char *)xmalloc (rsize);
+  r = ret = (char *)xmalloc (4 * strlen (str) + 4);
 
   *r++ = '$';
   *r++ = '\'';
 
   for (s = str; c = *s; s++)
     {
-      b = 1;		/* 1 == add backslash; 0 == no backslash */
-      l = 1;
-      clen = 1;
-
       switch (c)
 	{
 	case ESC: c = 'E'; break;
@@ -266,39 +264,42 @@ ansic_quote (const char *str, int flags,
 	  break;
 	default:
 #if defined (HANDLE_MULTIBYTE)
-	  b = is_basic (c);
-	  /* XXX - clen comparison to 0 is dicey */
-	  if ((b == 0 && ((clen = mbrtowc (&wc, s, MB_CUR_MAX, 0)) < 0 || MB_INVALIDCH (clen) || iswprint (wc) == 0)) ||
-	      (b == 1 && ISPRINT (c) == 0))
-#else
-	  if (ISPRINT (c) == 0)
-#endif
+	  if ((locale_utf8locale && (c & 0x80)) ||
+	      (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
 	    {
-	      *r++ = '\\';
-	      *r++ = TOCHAR ((c >> 6) & 07);
-	      *r++ = TOCHAR ((c >> 3) & 07);
-	      *r++ = TOCHAR (c & 07);
-	      continue;
+	      clen = mbrtowc (&wc, s, locale_mb_cur_max, &state);
+	      if (MB_NULLWCH (clen))
+		goto quote_end;
+	      if (MB_INVALIDCH (clen))
+		INITIALIZE_MBSTATE;
+	      else if (iswprint (wc))
+		{
+		  for (b = 0; b < (int)clen; b++)
+		    *r++ = (unsigned char)s[b];
+		  s += clen - 1;	/* -1 because of the increment above */
+		  continue;
+		}
 	    }
-	  l = 0;
-	  break;
+	  else
+#endif
+	    if (ISPRINT (c))
+	      {
+		*r++ = c;
+		continue;
+	      }
+
+	  *r++ = '\\';
+	  *r++ = TOCHAR ((c >> 6) & 07);
+	  *r++ = TOCHAR ((c >> 3) & 07);
+	  *r++ = TOCHAR (c & 07);
+	  continue;
 	}
-      if (b == 0 && clen == 0)
-	break;
 
-      if (l)
-	*r++ = '\\';
-
-      if (clen == 1)
-	*r++ = c;
-      else
-	{
-	  for (b = 0; b < (int)clen; b++)
-	    *r++ = (unsigned char)s[b];
-	  s += clen - 1;	/* -1 because of the increment above */
-	}
+      *r++ = '\\';
+      *r++ = c;
     }
 
+quote_end:
   *r++ = '\'';
   *r = '\0';
   if (rlen)
@@ -348,7 +349,8 @@ ansic_shouldquote (const char *string)
   for (s = string; c = *s; s++)
     {
 #if defined (HANDLE_MULTIBYTE)
-      if (is_basic (c) == 0)
+      if ((locale_utf8locale && (c & 0x80)) ||
+	  (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
 	return (ansic_wshouldquote (s));
 #endif
       if (ISPRINT (c) == 0)
--- a/patchlevel.h
+++ b/patchlevel.h
@@ -25,6 +25,6 @@
    regexp `^#define[ 	]*PATCHLEVEL', since that's what support/mkversion.sh
    looks for to find the patch level (for the sccs version string). */
 
-#define PATCHLEVEL 7
+#define PATCHLEVEL 8
 
 #endif /* _PATCHLEVEL_H_ */