1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
|
/*
+----------------------------------------------------------------------+
| Copyright (c) The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| https://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Authors: Niels Dossche <nielsdos@php.net> |
+----------------------------------------------------------------------+
*/
/* This file implements the "parse a MIME type" algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)
 * It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.
 * In particular, it exposes php_libxml_sniff_charset_from_string() and php_libxml_sniff_charset_from_stream()
 * to parse the charset from the Content-Type header.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "php.h"
#ifdef HAVE_LIBXML
#include "php_libxml.h"
/* Collection predicate: accepts every byte except the type/subtype separator '/'. */
static bool is_not_slash(char c)
{
	if (c == '/') {
		return false;
	}
	return true;
}
/* Collection predicate: accepts every byte except the parameter separator ';'. */
static bool is_not_semicolon(char c)
{
	return !(c == ';');
}
/* Collection predicate: accepts every byte except ';' and '=' (used to delimit a parameter name). */
static bool is_not_semicolon_or_equals(char c)
{
	switch (c) {
		case ';':
		case '=':
			return false;
		default:
			return true;
	}
}
/* Collection predicate: accepts every byte except '"' and '\\' (the bytes with special meaning inside a quoted string). */
static bool is_not_quote_or_backslash(char c)
{
	const bool is_quote = (c == '"');
	const bool is_backslash = (c == '\\');
	return !(is_quote || is_backslash);
}
/* https://fetch.spec.whatwg.org/#http-tab-or-space
 * True for U+0009 TAB and U+0020 SPACE only. */
static bool is_http_tab_or_space(char c)
{
	switch (c) {
		case 0x09: /* TAB */
		case 0x20: /* SPACE */
			return true;
		default:
			return false;
	}
}
/* https://fetch.spec.whatwg.org/#http-whitespace
 * True for U+000A LF, U+000D CR, and anything is_http_tab_or_space() accepts. */
static bool is_http_whitespace(char c)
{
	if (c == 0x0A || c == 0x0D) {
		return true;
	}
	return is_http_tab_or_space(c);
}
/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
 * Accepts TAB and every byte from 0x20 upwards except 0x7F (DEL).
 * Note: the parameter is unsigned on purpose so bytes >= 0x80 pass the lower-bound comparison. */
static bool is_http_quoted_string_token(unsigned char c)
{
	if (c == 0x09) {
		return true;
	}
	if (c < 0x20) {
		return false;
	}
	return c != 0x7F;
}
/* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points
 * Instead of building the collected sequence, this returns how many consecutive bytes starting at
 * `position` (and strictly before `end`) satisfy `condition`. The caller advances its own cursor. */
static zend_always_inline size_t collect_a_sequence_of_code_points(const char *position, const char *end, bool (*condition)(char))
{
	const char *cursor;
	for (cursor = position; cursor < end; cursor++) {
		if (!condition(*cursor)) {
			break;
		}
	}
	return cursor - position;
}
/* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true
 *
 * `position` must point at the opening '"' and `end` one past the last input byte.
 * Returns a newly allocated string with the backslash escapes resolved, and stores the position
 * at which collecting stopped (after the closing quote, or at `end`) in `*position_out`. */
static zend_string *collect_an_http_quoted_string_with_extract_value(const char *position, const char *end, const char **position_out)
{
	/* Steps 1-2: positionStart is only needed when extract-value is false, so it is not tracked.
	 * The value starts out empty; the unescaped result can't be longer than the remaining input,
	 * so the buffer is sized for that up front and filled through a write cursor. */
	zend_string *value = zend_string_alloc(end - position, false);
	char *write_ptr = ZSTR_VAL(value);

	/* Steps 3-4: assert the opening quote and skip it */
	ZEND_ASSERT(*position == '"');
	position++;

	/* Step 5: loop until the closing quote or the end of input */
	for (;;) {
		/* 5.1. Copy the run of bytes that are neither '"' nor '\\' */
		size_t run_length = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);
		memcpy(write_ptr, position, run_length);
		write_ptr += run_length;
		position += run_length;

		/* 5.2. Input exhausted without a closing quote */
		if (position >= end) {
			break;
		}

		/* 5.3 + 5.4. Fetch the delimiter and step over it */
		char delimiter = *position;
		position++;

		/* 5.6. A closing quote terminates the string */
		if (delimiter != '\\') {
			ZEND_ASSERT(delimiter == '"');
			break;
		}

		/* 5.5.1. A trailing backslash is kept literally */
		if (position >= end) {
			*write_ptr++ = '\\';
			break;
		}

		/* 5.5.2 + 5.5.3. Append the escaped byte and step over it */
		*write_ptr++ = *position;
		position++;
	}

	/* Terminate the string and record how much was written */
	*write_ptr = '\0';
	ZSTR_LEN(value) = write_ptr - ZSTR_VAL(value);

	*position_out = position;

	/* Step 6 (extract-value is always true): return value. Step 7 is therefore unreachable. */
	return value;
}
/* https://infra.spec.whatwg.org/#ascii-alphanumeric
 * True for '0'-'9', 'A'-'Z' and 'a'-'z'. */
static bool is_ascii_alpha_numeric(char c)
{
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	return c >= 'a' && c <= 'z';
}
/* https://mimesniff.spec.whatwg.org/#http-token-code-point
 * True for the punctuation bytes enumerated below and for ASCII alphanumerics. */
static bool is_http_token(char c)
{
	switch (c) {
		case 0x21: /* ! */
		case 0x23: /* # */
		case 0x24: /* $ */
		case 0x25: /* % */
		case 0x26: /* & */
		case 0x27: /* ' */
		case 0x2A: /* * */
		case 0x2B: /* + */
		case 0x2D: /* - */
		case 0x2E: /* . */
		case 0x5E: /* ^ */
		case 0x5F: /* _ */
		case 0x60: /* ` */
		case 0x7C: /* | */
		case 0x7E: /* ~ */
			return true;
		default:
			return is_ascii_alpha_numeric(c);
	}
}
/* Returns true when the `len` bytes at `start` are NOT a valid token: either there are no bytes
 * at all, or at least one byte is rejected by is_http_token(). Returns false otherwise. */
static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)
{
	if (len == 0) {
		return true;
	}

	for (size_t i = 0; i < len; i++) {
		if (!is_http_token(start[i])) {
			return true;
		}
	}

	return false;
}
/* Returns true when every one of the `len` bytes at `start` is accepted by
 * is_http_quoted_string_token(); an empty range yields true. */
static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (!is_http_quoted_string_token(start[i])) {
			return false;
		}
	}
	return true;
}
/* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
 * Note: We only care about the charset detection
 *
 * Parses the Content-Type value in the byte range [start, end), where `end` points one past the
 * last byte (the range is never read at or beyond `end`).
 *
 * Returns the value of the first "charset" parameter (name matched case-insensitively) whose value
 * solely contains HTTP quoted-string token code points. Returns NULL if the type or subtype is
 * missing/invalid or if no such parameter is found.
 * A non-NULL result is a string allocated by this function; the caller is responsible for releasing it. */
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end)
{
	/* 1. Remove leading & trailing HTTP whitespace */
	while (start < end && is_http_whitespace(*start)) {
		start++;
	}
	while (start < end && is_http_whitespace(*(end - 1))) {
		end--;
	}

	/* 2. Position variable: no-op because we move the start pointer instead */

	/* 3. Collect sequence of code points that are not '/' (for type) */
	size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);

	/* 4. Empty string or not solely http tokens */
	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {
		return NULL;
	}
	start += type_length;

	/* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */
	if (start >= end) {
		return NULL;
	}

	/* 6. Skip '/' */
	start++;

	/* 7. Collect sequence of code points that are not ';' (for subtype) */
	size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);

	/* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */

	/* 9. Empty string or not solely http tokens */
	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {
		return NULL;
	}
	start += subtype_length;

	/* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */

	/* 11. Loop with check: position not past end */
	while (start < end) {
		/* 11.1. Advance position (this skips the ';') */
		start++;

		/* 11.2. Collect sequence that *is* HTTP whitespace */
		size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);
		start += whitespace_length;

		/* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */
		size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);
		const char *parameter_name = start;
		start += parameter_name_length;

		/* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */

		/* 11.5. Position past input check */
		if (start < end) {
			if (*start == ';') {
				continue;
			}
			/* Skip the '=' */
			start++;
		} else {
			/* 11.6. */
			break;
		}

		/* 11.7. Let parameterValue be null */
		zend_string *parameter_value = NULL;

		/* 11.8. Quoted string check.
		 * The '=' may have been the last byte of the input, so make sure there is a byte to look at
		 * before dereferencing. */
		if (start < end && *start == '"') {
			/* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */
			parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);

			/* 11.8.2. Collect a sequence of code points that are not ';' */
			start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);
		} else {
			/* 11.9. Otherwise */

			/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */
			size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
			parameter_value = zend_string_init(start, parameter_value_length, false);
			/* Advance past the collected *value* so that the next iteration starts at the ';' (or at end) */
			start += parameter_value_length;

			/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
			while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {
				ZSTR_LEN(parameter_value)--;
			}
			ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value)] = '\0';

			/* 11.9.3. Continue if parameterValue is empty */
			if (ZSTR_LEN(parameter_value) == 0) {
				zend_string_release_ex(parameter_value, false);
				continue;
			}
		}

		/* 11.10. We diverge from the spec here: we're only interested in charset.
		 * Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */
		if (parameter_name_length == strlen("charset")
			&& strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */
			&& solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {
			return parameter_value;
		}

		/* Not the parameter we are looking for (or its value is invalid): discard it and keep scanning */
		zend_string_release_ex(parameter_value, false);
	}

	/* 12. Return mimetype, a no-op / spec divergence */
	return NULL;
}
/* Looks for a Content-Type header in the stream's wrapper data and sniffs its charset.
 *
 * Only acts when `s->wrapperdata` is an array; its string entries are visited from last to first.
 * Scanning stops with NULL at the first string entry that looks like an HTTP status line.
 * Returns whatever php_libxml_sniff_charset_from_string() yields for the first Content-Type header
 * encountered, or NULL if none is found. */
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s)
{
	if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
		zval *header;

		/* Scan backwards: The header array might contain the headers for multiple responses, if
		 * a redirect was followed.
		 */
		ZEND_HASH_REVERSE_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
			if (Z_TYPE_P(header) == IS_STRING) {
				/* If no colon is found in the header, or a space occurs before the first colon,
				 * we assume it's the HTTP status line and bail out.
				 * A header without any space (e.g. "Content-Type:text/html") is still a header, so the
				 * pointers are only compared when a space was actually found. */
				char *colon = memchr(Z_STRVAL_P(header), ':', Z_STRLEN_P(header));
				char *space = memchr(Z_STRVAL_P(header), ' ', Z_STRLEN_P(header));
				if (colon == NULL || (space != NULL && space < colon)) {
					return NULL;
				}

				if (zend_string_starts_with_literal_ci(Z_STR_P(header), "content-type:")) {
					/* Hand over everything after the header name; the parser strips surrounding whitespace itself */
					return php_libxml_sniff_charset_from_string(Z_STRVAL_P(header) + strlen("content-type:"), Z_STRVAL_P(header) + Z_STRLEN_P(header));
				}
			}
		} ZEND_HASH_FOREACH_END();
	}

	return NULL;
}
#endif /* HAVE_LIBXML */
|