1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
|
BASH PATCH REPORT
=================
Bash-Release: 5.3
Patch-ID: bash53-008
Bug-Reported-by: Grisha Levit <grishalevit@gmail.com>
Bug-Reference-ID: <20251022174207.10518-1-grishalevit@gmail.com>
Bug-Reference-URL: https://lists.gnu.org/archive/html/bug-bash/2025-10/msg00145.html
Bug-Description:
When looking for backslash escapes in $'...' strings, Bash tries to consume
entire multibyte characters, and in UTF-8 locales it treats too many
characters as potentially beginning a multibyte character. Being more
selective about when to call mbrtowc() makes string processing faster and
can speed up scripts. This patch also handles the unlikely situation of a
locale that encodes null wide characters with non-null bytes.
--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -55,7 +55,7 @@ ansicstr (const char *string, size_t len
const char *s;
unsigned long v;
size_t clen;
- int mb_cur_max;
+ size_t mb_cur_max;
#if defined (HANDLE_MULTIBYTE)
wchar_t wc;
#endif
@@ -63,7 +63,7 @@ ansicstr (const char *string, size_t len
if (string == 0 || *string == '\0')
return ((char *)0);
- mb_cur_max = MB_CUR_MAX;
+ mb_cur_max = locale_mb_cur_max;
#if defined (HANDLE_MULTIBYTE)
temp = 4*len + 4;
if (temp < 12)
@@ -79,10 +79,14 @@ ansicstr (const char *string, size_t len
{
clen = 1;
#if defined (HANDLE_MULTIBYTE)
- if ((locale_utf8locale && (c & 0x80)) ||
- (locale_utf8locale == 0 && mb_cur_max > 0 && is_basic (c) == 0))
+ /* We read an entire multibyte character at a time if we are in a
+ locale where a backslash can possibly appear as part of a
+ multibyte character. UTF-8 encodings prohibit this. */
+ if (locale_utf8locale == 0 && mb_cur_max > 1 && is_basic (c) == 0)
{
clen = mbrtowc (&wc, s - 1, mb_cur_max, 0);
+ if (MB_NULLWCH (clen))
+ break; /* it apparently can happen */
if (MB_INVALIDCH (clen))
clen = 1;
}
@@ -227,30 +231,24 @@ ansic_quote (const char *str, int flags,
{
char *r, *ret;
const char *s;
- size_t l, rsize;
unsigned char c;
+#if defined (HANDLE_MULTIBYTE)
size_t clen;
int b;
-#if defined (HANDLE_MULTIBYTE)
wchar_t wc;
+ DECLARE_MBSTATE;
#endif
if (str == 0 || *str == 0)
return ((char *)0);
- l = strlen (str);
- rsize = 4 * l + 4;
- r = ret = (char *)xmalloc (rsize);
+ r = ret = (char *)xmalloc (4 * strlen (str) + 4);
*r++ = '$';
*r++ = '\'';
for (s = str; c = *s; s++)
{
- b = 1; /* 1 == add backslash; 0 == no backslash */
- l = 1;
- clen = 1;
-
switch (c)
{
case ESC: c = 'E'; break;
@@ -266,39 +264,42 @@ ansic_quote (const char *str, int flags,
break;
default:
#if defined (HANDLE_MULTIBYTE)
- b = is_basic (c);
- /* XXX - clen comparison to 0 is dicey */
- if ((b == 0 && ((clen = mbrtowc (&wc, s, MB_CUR_MAX, 0)) < 0 || MB_INVALIDCH (clen) || iswprint (wc) == 0)) ||
- (b == 1 && ISPRINT (c) == 0))
-#else
- if (ISPRINT (c) == 0)
-#endif
+ if ((locale_utf8locale && (c & 0x80)) ||
+ (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
{
- *r++ = '\\';
- *r++ = TOCHAR ((c >> 6) & 07);
- *r++ = TOCHAR ((c >> 3) & 07);
- *r++ = TOCHAR (c & 07);
- continue;
+ clen = mbrtowc (&wc, s, locale_mb_cur_max, &state);
+ if (MB_NULLWCH (clen))
+ goto quote_end;
+ if (MB_INVALIDCH (clen))
+ INITIALIZE_MBSTATE;
+ else if (iswprint (wc))
+ {
+ for (b = 0; b < (int)clen; b++)
+ *r++ = (unsigned char)s[b];
+ s += clen - 1; /* -1 because of the increment above */
+ continue;
+ }
}
- l = 0;
- break;
+ else
+#endif
+ if (ISPRINT (c))
+ {
+ *r++ = c;
+ continue;
+ }
+
+ *r++ = '\\';
+ *r++ = TOCHAR ((c >> 6) & 07);
+ *r++ = TOCHAR ((c >> 3) & 07);
+ *r++ = TOCHAR (c & 07);
+ continue;
}
- if (b == 0 && clen == 0)
- break;
- if (l)
- *r++ = '\\';
-
- if (clen == 1)
- *r++ = c;
- else
- {
- for (b = 0; b < (int)clen; b++)
- *r++ = (unsigned char)s[b];
- s += clen - 1; /* -1 because of the increment above */
- }
+ *r++ = '\\';
+ *r++ = c;
}
+quote_end:
*r++ = '\'';
*r = '\0';
if (rlen)
@@ -348,7 +349,8 @@ ansic_shouldquote (const char *string)
for (s = string; c = *s; s++)
{
#if defined (HANDLE_MULTIBYTE)
- if (is_basic (c) == 0)
+ if ((locale_utf8locale && (c & 0x80)) ||
+ (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
return (ansic_wshouldquote (s));
#endif
if (ISPRINT (c) == 0)
--- a/patchlevel.h
+++ b/patchlevel.h
@@ -25,6 +25,6 @@
regexp `^#define[ ]*PATCHLEVEL', since that's what support/mkversion.sh
looks for to find the patch level (for the sccs version string). */
-#define PATCHLEVEL 7
+#define PATCHLEVEL 8
#endif /* _PATCHLEVEL_H_ */
|