1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
|
/**
* @file unicode.h
* @brief Definitions for all unicode specific functions.
*/
#ifndef ADA_UNICODE_H
#define ADA_UNICODE_H
#include "ada/common_defs.h"
#include "ada/ada_idna.h"
#include <string>
#include <string_view>
#include <optional>
/**
* Unicode operations. These functions are not part of our public API and may
* change at any time.
*
* @private
* @namespace ada::unicode
* @brief Includes the definitions for unicode operations
*/
namespace ada::unicode {
/**
* @private
* Converts a domain name to its ASCII form (see the WHATWG "domain to ASCII"
* link below).
*
* We receive a UTF-8 string representing a domain name.
* If the string is percent encoded, we apply percent decoding.
*
* Given a domain, we need to identify its labels.
* They are separated by label-separators:
*
* U+002E (.) FULL STOP
* U+FF0E FULLWIDTH FULL STOP
* U+3002 IDEOGRAPHIC FULL STOP
* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP
*
* They are all mapped to U+002E.
*
* We process each label into a string that should not exceed 63 octets.
* If the string is already punycode (starts with "xn--"), then we must
* scan it to look for disallowed code points.
* Otherwise, if the string is not pure ASCII, we need to transcode it
* to punycode by following RFC 3454 which requires us to
* - Map characters (see section 3),
* - Normalize (see section 4),
* - Reject forbidden characters,
* - Check for right-to-left characters and if so, check all requirements (see
* section 6),
* - Optionally reject based on unassigned code points (section 7).
*
* The Unicode standard provides a table of code points with a mapping, a list
* of forbidden code points and so forth. This table is subject to change and
* will vary based on the implementation. For Unicode 15, the table is at
* https://www.unicode.org/Public/idna/15.0.0/IdnaMappingTable.txt
* If you use ICU, they parse this table and map it to code using a Python
* script.
*
* The resulting strings should not exceed 255 octets according to RFC 1035
* section 2.3.4. ICU checks for label size and domain size, but these errors
* are ignored.
*
* @param out receives the converted domain.
*   NOTE(review): presumably left without a value when the conversion fails;
*   confirm against the definition.
* @param plain the UTF-8 domain name, possibly percent encoded.
* @param first_percent NOTE(review): presumably plain.find('%'), as documented
*   for percent_decode; confirm against the callers.
* @return NOTE(review): presumably true when the conversion succeeded; confirm
*   against the definition.
*
* @see https://url.spec.whatwg.org/#concept-domain-to-ascii
*
*/
bool to_ascii(std::optional<std::string>& out, std::string_view plain,
size_t first_percent);
/**
* @private
* Checks if the input has tab or newline characters.
*
* @attention The has_tabs_or_newline function is a bottleneck and it is simple
* enough that compilers like GCC can 'autovectorize it'.
*
* @param user_input the string to scan.
* @return true if user_input has a tab or newline character.
*   NOTE(review): presumably the same characters as is_ascii_tab_or_newline
*   (U+0009, U+000A, U+000D); confirm against the definition.
*/
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept;
/**
* @private
* Checks if the input is a forbidden host code point.
*
* @param c the character to test.
* @return true if c is a forbidden host code point, as defined by the URL
*   standard linked below.
* @see https://url.spec.whatwg.org/#forbidden-host-code-point
*/
ada_really_inline constexpr bool is_forbidden_host_code_point(char c) noexcept;
/**
* @private
* Checks if the input contains a forbidden domain code point.
*
* @param input pointer to the characters to scan.
* @param length NOTE(review): presumably the number of characters readable
*   from input; confirm against the definition.
* @return true if the input contains a forbidden domain code point, as defined
*   by the URL standard linked below.
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point
*/
ada_really_inline constexpr bool contains_forbidden_domain_code_point(
const char* input, size_t length) noexcept;
/**
* @private
* Scans the input for forbidden domain code points and upper case ASCII
* letters, and reports both findings as bit flags in a single byte.
*
* If the input contains a forbidden domain code point, the first bit is set
* to 1. If the input contains an upper case ASCII letter, then the second bit
* is set to 1.
* NOTE(review): "first" and "second" bit presumably mean the two least
* significant bits (values 1 and 2); confirm against the definition.
*
* @param input pointer to the characters to scan.
* @param length NOTE(review): presumably the number of characters readable
*   from input; confirm against the definition.
* @return the flags described above.
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point
*/
ada_really_inline constexpr uint8_t
contains_forbidden_domain_code_point_or_upper(const char* input,
size_t length) noexcept;
/**
* @private
* Checks if the input is a forbidden domain code point.
*
* @param c the character to test.
* @return true if c is a forbidden domain code point, as defined by the URL
*   standard linked below.
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point
*/
ada_really_inline constexpr bool is_forbidden_domain_code_point(
char c) noexcept;
/**
* @private
* Checks if the input is alphanumeric, '+', '-' or '.'
*
* @param c the character to test.
* @return true if c is alphanumeric, '+', '-' or '.'.
*/
ada_really_inline constexpr bool is_alnum_plus(char c) noexcept;
/**
* @private
* Checks if the input is an ASCII hex digit.
*
* @details An ASCII hex digit is an ASCII upper hex digit or ASCII lower hex
* digit. An ASCII upper hex digit is an ASCII digit or a code point in the
* range U+0041 (A) to U+0046 (F), inclusive. An ASCII lower hex digit is an
* ASCII digit or a code point in the range U+0061 (a) to U+0066 (f), inclusive.
*
* @param c the character to test.
* @return true if c is an ASCII hex digit.
*/
ada_really_inline constexpr bool is_ascii_hex_digit(char c) noexcept;
/**
* @private
* Checks if the input is an ASCII digit.
*
* @details An ASCII digit is a code point in the range U+0030 (0) to
* U+0039 (9), inclusive.
*
* @param c the character to test.
* @return true if c is an ASCII digit.
*/
ada_really_inline constexpr bool is_ascii_digit(char c) noexcept;
/**
* @private
* Checks if the input is an ASCII character.
*
* @details If a char is between U+0000 and U+007F inclusive, then it's an ASCII
* character.
*
* @param c the code point to test.
* @return true if c is an ASCII character.
*/
ada_really_inline constexpr bool is_ascii(char32_t c) noexcept;
/**
* @private
* Checks if the input is a C0 control or space character.
*
* @details A C0 control or space is a C0 control or U+0020 SPACE.
* A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION
* SEPARATOR ONE, inclusive.
*
* @param c the character to test.
* @return true if c is a C0 control or space.
*/
ada_really_inline constexpr bool is_c0_control_or_space(char c) noexcept;
/**
* @private
* Checks if the input is an ASCII tab or newline character.
*
* @details An ASCII tab or newline is U+0009 TAB, U+000A LF, or U+000D CR.
*
* @param c the character to test.
* @return true if c is an ASCII tab or newline.
*/
ada_really_inline constexpr bool is_ascii_tab_or_newline(char c) noexcept;
/**
* @private
* Checks if the input is a double-dot path segment.
*
* @details A double-dot path segment must be ".." or an ASCII case-insensitive
* match for ".%2e", "%2e.", or "%2e%2e".
*
* @param input the path segment to test.
* @return true if input is a double-dot path segment.
*/
ada_really_inline constexpr bool is_double_dot_path_segment(
std::string_view input) noexcept;
/**
* @private
* Checks if the input is a single-dot path segment.
*
* @details A single-dot path segment must be "." or an ASCII case-insensitive
* match for "%2e".
*
* @param input the path segment to test.
* @return true if input is a single-dot path segment.
*/
ada_really_inline constexpr bool is_single_dot_path_segment(
std::string_view input) noexcept;
/**
* @private
* Checks if the input is in the 0-9 or a-f character ranges.
*
* @details An IPv4 character might be in the 0-9 or a-f character ranges.
*
* @param c the character to test.
* @return true if c is in the 0-9 or a-f character ranges.
*   NOTE(review): the name implies that A-F is rejected; confirm against the
*   definition.
*/
ada_really_inline constexpr bool is_lowercase_hex(char c) noexcept;
/**
* @private
* Converts a hexadecimal digit to binary.
*
* @details The caller is responsible for ensuring that the parameter is a
* hexadecimal digit (0-9, A-F, a-f); the result for any other character is
* not documented here.
*
* @param c the hexadecimal digit to convert.
* @return NOTE(review): presumably the numeric value of the digit (0 to 15);
*   confirm against the definition.
*/
ada_really_inline unsigned constexpr convert_hex_to_binary(char c) noexcept;
/**
* @private
* Percent-decodes the input.
*
* @param input the string to decode.
* @param first_percent should be equal to input.find('%').
* @return the decoded string.
*
* @todo It might be faster if declared noexcept, but that could be unsafe.
* NOTE(review): the original note was left unfinished ("unsafe since.");
* presumably the concern is that building the returned std::string can throw.
* Confirm with the author.
* @author Node.js
* @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L245
* @see https://encoding.spec.whatwg.org/#utf-8-decode-without-bom
*/
std::string percent_decode(std::string_view input, size_t first_percent);
/**
* @private
* Returns a percent-encoded version of the input. A string is returned whether
* percent encoding was needed or not.
*
* @param input the string to encode.
* @param character_set NOTE(review): presumably a lookup table identifying
*   the characters that must be percent encoded; confirm its layout against
*   the definition.
* @return the percent-encoded string.
* @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L226
*/
std::string percent_encode(std::string_view input,
const uint8_t character_set[]);
/**
* @private
* Returns a percent-encoded string version of input, while starting the percent
* encoding at the provided index.
*
* @param input the string to encode.
* @param character_set NOTE(review): presumably a lookup table identifying
*   the characters that must be percent encoded; confirm its layout against
*   the definition.
* @param index the position in input at which percent encoding starts.
*   NOTE(review): presumably a value obtained from percent_encode_index;
*   confirm against the callers.
* @return the percent-encoded string.
* @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L226
*/
std::string percent_encode(std::string_view input,
const uint8_t character_set[], size_t index);
/**
* @private
* Percent-encodes the input into 'out', but only when encoding is needed.
*
* Returns true if percent encoding was needed, in which case, we store
* the percent-encoded content in 'out'. If the boolean 'append' is set to
* true, the content is appended to 'out'.
* If percent encoding is not needed, out is left unchanged.
*
* @tparam append when true, the encoded content is appended to out.
*   NOTE(review): when false, out is presumably overwritten; confirm against
*   the definition.
* @param input the string to encode.
* @param character_set NOTE(review): presumably a lookup table identifying
*   the characters that must be percent encoded; confirm its layout against
*   the definition.
* @param out destination for the percent-encoded content.
* @return true if percent encoding was needed, false otherwise.
* @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L226
*/
template <bool append>
bool percent_encode(std::string_view input, const uint8_t character_set[],
std::string& out);
/**
* @private
* Returns the index at which percent encoding should start, or (equivalently),
* the length of the prefix that does not require percent encoding.
*
* @param input the string to examine.
* @param character_set NOTE(review): presumably a lookup table identifying
*   the characters that must be percent encoded; confirm its layout against
*   the definition.
* @return the length of the prefix of input that does not require percent
*   encoding.
*   NOTE(review): presumably input.size() when nothing needs encoding;
*   confirm against the definition.
*/
ada_really_inline size_t percent_encode_index(std::string_view input,
const uint8_t character_set[]);
/**
* @private
* Lowers the string in-place, assuming that the content is ASCII.
*
* @param input pointer to the characters to lower; modified in place.
* @param length NOTE(review): presumably the number of characters accessible
*   through input; confirm against the definition.
* @return true if the content was ASCII.
*   NOTE(review): the state of the buffer when false is returned is not
*   documented here; confirm against the definition.
*/
constexpr bool to_lower_ascii(char* input, size_t length) noexcept;
} // namespace ada::unicode
#endif // ADA_UNICODE_H
|