1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
|
--echo # Start of ctype_unescape.inc
SET sql_mode = '';
#
# Testing how string literals with backslash and quote-quote are unescaped.
# The tests assume that single quote (') is used as a delimiter.
#
#
# Make sure that the parser really works using the character set we need.
# We use binary strings to compose strings, to be able to test get malformed
# sequences, which are possible as a result of mysql_real_escape_string().
# The important thing for this test is to make the parser unescape using
# the client character set, rather than binary. Currently it works exactly
# that way by default, so the query below should return @@character_set_client
#
SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
let $CHARSET=`SELECT @@character_set_connection`;
CREATE TABLE allbytes (a VARBINARY(10));
#
# Create various byte sequences to test. Testing the full banch of
# possible combinations takes about 2 minutes. So this test provides
# variants to run with:
# - the full set of possible combinations
# - a reduced test of combinations for selected bytes only
#
# Create selected byte combinations
if ($ctype_unescape_combinations == 'selected')
{
--echo # Using selected bytes combinations
--source include/bytes.inc
#
# Populate "selected_bytes" with bytes that have a special meaning.
# We'll use "selected_bytes" to generate byte seqeunces,
# instead of the full possible byte combinations, to reduce test time.
#
CREATE TABLE selected_bytes (a VARBINARY(10));
# Bytes that have a special meaning in all character sets:
# 0x00 - mysql_real_escape_string() quotes this to '\0'
# 0x0D - mysql_real_escape_string() quotes this to '\r'
# 0x0A - mysql_real_escape_string() quotes this to '\n'
# 0x1A - mysql_real_escape_string() quotes this to '\Z'
# 0x08 - mysql_real_escape_string() does not quote this,
# but '\b' is unescaped to 0x08.
# 0x09 - mysql_real_escape_string() does not quote this,
# but '\t' is unescaped to 0x09.
# 0x30 - '0', as in '\0'
# 0x5A - 'Z', as in '\Z'
# 0x62 - 'b', as in '\b'
# 0x6E - 'n', as in '\n'
# 0x72 - 't', as in '\r'
# 0x74 - 't', as in '\t'
INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
# 0x22 - double quote
# 0x25 - percent sign, '\%' is preserved as is for LIKE.
# 0x27 - single quote
# 0x5C - backslash
# 0x5F - underscore, '\_' is preserved as is for LIKE.
INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
# Some bytes do not have any special meaning, for example basic Latin letters.
# Let's add, one should be enough for a good enough coverage.
INSERT INTO selected_bytes (a) VALUES ('a');
#
# This maps summarizes bytes that have a special
# meaning in various character sets:
#
# MBHEAD MBTAIL NONASCII-8BIT BAD
# ------ ------ -------------- ----------
# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF]
# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# gbk: [81..FE] [40..7E,80..FE] N/A [FF]
# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# swe7: N/A N/A [5B..5E,7B..7E] [80..FF]
#
INSERT INTO selected_bytes (a) VALUES
(0x3F), # 7bit
(0x40), # 7bit mbtail
(0x7E), # 7bit mbtail nonascii-8bit
(0x7F), # 7bit nonascii-8bit
(0x80), # mbtail bad-mb
(0x81), # mbhead mbtail
(0x9F), # mbhead mbtail bad-mb
(0xA0), # mbhead mbtail bad-mb
(0xA1), # mbhead mbtail nonascii-8bit
(0xE0), # mbhead mbtai
(0xEF), # mbhead mbtail
(0xF9), # mbhead mbtail
(0xFA), # mbhead mbtail bad-mb
(0xFC), # mbhead mbtail bad-mb
(0xFD), # mbhead mbtail bad-mb
(0xFE), # mbhead mbtial bad-mb
(0xFF); # bad-mb
#
# Now populate the test table
#
# Use all single bytes, this is cheap, there are only 256 values.
INSERT INTO allbytes (a) SELECT a FROM bytes;
# Add selected bytes combinations
INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
DROP TABLE selected_bytes;
# Delete all non-single byte sequences that do not have
# backslashes or quotes at all. There is nothing special with these strings.
DELETE FROM allbytes WHERE
OCTET_LENGTH(a)>1 AND
LOCATE(0x5C,a)=0 AND
a NOT LIKE '%\'%' AND
a NOT LIKE '%"%';
}
if ($ctype_unescape_combinations=='')
{
--echo # Using full byte combinations
--source include/bytes2.inc
INSERT INTO allbytes (a) SELECT a FROM bytes;
INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
}
DELIMITER //;
#
# A procedure that make an SQL query using 'val' as a string literal.
# The result of the query execution is written into the table 't1'.
# NULL in t1.b means that query failed due to syntax error,
# typically because of mis-interpreted closing quote delimiter.
#
CREATE PROCEDURE p1(val VARBINARY(10))
BEGIN
DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END//
#
# A procedure that iterates through all records in "allbytes".
# And runs p1() for every record.
#
CREATE PROCEDURE p2()
BEGIN
DECLARE val VARBINARY(10);
DECLARE done INT DEFAULT FALSE;
DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
OPEN stmt;
read_loop1: LOOP
FETCH stmt INTO val;
IF done THEN
LEAVE read_loop1;
END IF;
CALL p1(val);
END LOOP;
CLOSE stmt;
END//
# A function that converts the value from binary to $CHARSET
# and check if it has changed. CONVERT() fixes malformed strings.
# So if the string changes in CONVERT(), it means it was not wellformed.
--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
#
# A function that approximately reproduces how the SQL parser
# would unescape a binary string.
#
CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
# We need to do it in a way to avoid producing new escape sequences
# First, enclose all known escsape sequences to '{{xx}}'
# - Backslash not followed by a LIKE pattern characters _ and %
# - Double escapes
# This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
# So '\\1' in the last argument always means the match, no matter
# which alternative it came from.
SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
# Now unescape all enclosed standard escape sequences
SET a=REPLACE(a,'{{\\0}}', '\0');
SET a=REPLACE(a,'{{\\b}}', '\b');
SET a=REPLACE(a,'{{\\t}}', '\t');
SET a=REPLACE(a,'{{\\r}}', '\r');
SET a=REPLACE(a,'{{\\n}}', '\n');
SET a=REPLACE(a,'{{\\Z}}', '\Z');
SET a=REPLACE(a,'{{\\\'}}', '\'');
# Unescape double quotes
SET a=REPLACE(a,'{{\'\'}}', '\'');
# Unescape the rest: all other \x sequences mean just 'x'
SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
RETURN a;
END//
#
# A function that checks what happened during unescaping.
#
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# The following return values are possible:
# - SyntErr - b IS NULL, which means syntax error happened in p1().
# - Preserv - the value was not modified during unescaping.
# This is possible if 0x5C was treated as mbtail.
# Or only LIKE escape sequences were found: '\_' and '\%'.
# - Trivial - only 0x5C were removed.
# - Regular - the value was unescaped like a binary string.
# Some standard escape sequences were found.
# No special multi-byte handling happened.
# - Special - Something else happened. Should not happen.
#
CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN '[SyntErr]'
WHEN a=b THEN CASE
WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
ELSE '[Preserv][MB]' END
WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
WHEN UNESCAPE(a)=b THEN '[Regular]'
ELSE '[Special]' END;
END//
#
# Check what happened with wellformedness during unescaping
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# Returned values:
# [FIXED] - the value was malformed and become wellformed after unescaping
# [BROKE] - the value was wellformed and become malformed after unescaping
# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
# '' - both values are wellformed
#
CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN ''
WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
ELSE ''
END;
END//
#
# Check if the value could be generated by mysql_real_escape_string(),
# or can only come from a direct user input.
#
# @param a - the value before unescaping
#
# Returns:
# [USER] - if the value could not be generated by mysql_real_escape_string()
# '' - if the value was possibly generated by mysql_real_escape_string()
#
#
CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
RETURN CASE
WHEN (a1=0x5C) AND
(a3=0x5C) AND
(a2>0x7F) AND
(a4 NOT IN ('_','%','0','t','r','n','Z')) AND
iswellformed(a2a4) THEN '[USER]'
ELSE ''
END;
END//
DELIMITER ;//
CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
CALL p2();
# Avoid "Invalid XXX character string" warnings
# We mark malformed strings in the output anyway
--disable_warnings
# All records marked with '[BAD]' mean that the string was unescaped
# in a unexpected way, that means there is a bug in UNESCAPE() above.
#check after fix MDEV-29290
--disable_view_protocol
SELECT HEX(a),HEX(b),
CONCAT(unescape_type(a,b),
wellformedness(a,b),
mysql_real_escape_string_generated(a),
IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
FROM t1 ORDER BY LENGTH(a),a;
--enable_view_protocol
--enable_warnings
DROP TABLE t1;
DROP PROCEDURE p1;
DROP PROCEDURE p2;
DROP FUNCTION unescape;
DROP FUNCTION unescape_type;
DROP FUNCTION wellformedness;
DROP FUNCTION mysql_real_escape_string_generated;
DROP FUNCTION iswellformed;
DROP TABLE allbytes;
SET sql_mode = DEFAULT;
--echo # End of ctype_backslash.inc
|