# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2021 The Elixir Team
defmodule String.Tokenizer.Security do
  @moduledoc false
  # UTS39 security checks that operate on all tokens in a file,
  # like Confusables. If we add whole-file mixed-script-confusable-characters
  # checks we can add them to the list of lints here

  # Entry point: runs every whole-file lint (currently only the confusables
  # check) over `tokens` and formats each hit as `{{line, col}, message}`,
  # where `message` is a charlist suitable for the compiler diagnostics.
  def unicode_lint_warnings(tokens) do
    for warning <- confusables(tokens),
        do: format_warning(warning)
  end

  # Pulls the `{line, col}` position out of the token's metadata tuple and
  # pairs it with the human-readable reason (converted to a charlist).
  defp format_warning({token, reason}) do
    {_, {line, col, _}, _} = token
    {{line, col}, to_charlist(reason)}
  end

  ## Confusables

  # Folds over all tokens, threading a map of skeleton => token-info for every
  # identifier seen so far. Each token either extends the map (:ok) or, if its
  # skeleton collides with a differently-spelled earlier identifier, produces
  # a {token, reason} warning. Only the accumulated warnings are returned.
  defp confusables(tokens) do
    {_, warnings} =
      for token <- tokens, reduce: {%{}, []} do
        {skeletons, warnings} ->
          case check_token_for_confusability(token, skeletons) do
            {:ok, skeletons} -> {skeletons, warnings}
            {:warn, reason} -> {skeletons, [{token, reason} | warnings]}
          end
      end

    warnings
  end

  # Token kinds that carry user-written names and are therefore subject to
  # the confusability check.
  @identifiers [
    :identifier,
    :op_identifier,
    :kw_identifier,
    :paren_identifier,
    :bracket_identifier,
    :alias,
    :atom
  ]

  # First clause: identifier-like tokens with a non-empty charlist name.
  # `info` is the token metadata `{line, column, name}`; it is what gets
  # stored in the skeleton map, so a later collision can report the line and
  # name of the first occurrence.
  defp check_token_for_confusability(
         {kind, {_line, _column, [_ | _] = name} = info, _},
         skeletons
       )
       when kind in @identifiers do
    skeleton = confusable_skeleton(name)

    case skeletons[skeleton] do
      # Same spelling as the stored entry: not confusable, nothing to record.
      {_, _, ^name} ->
        {:ok, skeletons}

      # Same skeleton but different spelling: the two identifiers look alike
      # while being written with different characters — warn.
      {line, _, previous_name} when name != previous_name ->
        {:warn,
         "confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <>
           "but they are written using different characters" <> dir_compare(name, previous_name)}

      # First time we see this skeleton: remember this token's info.
      _ ->
        {:ok, Map.put(skeletons, skeleton, info)}
    end
  end

  # Non-identifier tokens pass through unchanged.
  defp check_token_for_confusability(_token, skeletons), do: {:ok, skeletons}

  # Compile-time section: parse the bundled confusables.txt data file into a
  # codepoint => prototype lookup, then unquote it into function clauses below.
  # Data line format:
  # AAAA ; BBBB CCCC DDDDD ;
  # ^ char ^ prototypical char or sequence of chars it can be confused with
  confusables_path = "confusables.txt"

  lines =
    Path.join(__DIR__, confusables_path)
    |> File.read!()
    |> String.split(["\r\n", "\n"], trim: true)

  # Captures the source codepoint (hex) and its prototype sequence (hex list).
  # Non-matching lines (comments, blanks) yield nil from Regex.run and are
  # filtered out by the `[confusable_str, prototype_str] <-` pattern below.
  regex = ~r/^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);/u
  matches = Enum.map(lines, &Regex.run(regex, &1, capture: :all_but_first))

  confusable_prototype_lookup =
    for [confusable_str, prototype_str] <- matches, reduce: %{} do
      acc ->
        confusable = String.to_integer(String.trim(confusable_str), 16)

        # Keep only the first mapping per codepoint, and never remap plain
        # ASCII letters/digits — they stand for themselves.
        if Map.has_key?(acc, confusable) or
             confusable in ?A..?Z or confusable in ?a..?z or confusable in ?0..?9 do
          acc
        else
          prototype =
            prototype_str
            |> String.split(" ", trim: true)
            |> Enum.map(&String.to_integer(&1, 16))

          Map.put(acc, confusable, prototype)
        end
    end

  # Generate one pattern-matched clause per confusable codepoint so the
  # lookup is a compile-time dispatch rather than a runtime map access.
  for {confusable, prototype} <- confusable_prototype_lookup do
    defp confusable_prototype(unquote(confusable)) do
      unquote(prototype)
    end
  end

  # Fallback: a codepoint with no table entry is its own prototype
  # (returned as a UTF-8 binary; callers consume chardata).
  defp confusable_prototype(other), do: <<other::utf8>>

  # Computes the UTS 39 skeleton of charlist `s` for confusability comparison.
  def confusable_skeleton(s) do
    # "- Convert X to NFD format, as described in [UAX15].
    #  - Concatenate the prototypes for each character in X according to
    #    the specified data, producing a string of exemplar characters.
    #  - Reapply NFD." (UTS 39 section 4, skeleton definition)
    :unicode.characters_to_nfd_list(s)
    |> bidi_skeleton()
    |> :unicode.characters_to_nfd_list()
  end

  # Unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that
  # aren't confusable LTR *are* confusable in most places human review
  # occurs (editors/browsers, thanks to bidi algo, UAX9).
  #
  # The solution is to detect spans with reversed visual direction,
  # and reverse those, so that the input we check for confusability
  # matches the perceived sequence instead of the byte sequence.
  #
  # (we need this regardless of script mixing, because direction-neutral
  # chars like _ or 0..9 can mix w/RTL chars).
  def bidi_skeleton(s) do
    # UTS39-28 4:
    #
    # Bidirectional confusability is costlier to check than
    # confusability, as [unicode bidi algo] must be applied.
    # [...] a fast path can be used: [...] if X has no characters
    # w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X)
    #
    # Fast path: strings shorter than 2 chars, or with no RTL characters,
    # skip the reordering entirely.
    if match?([_, _ | _], s) and any_rtl?(s) do
      unbidify(s) |> Enum.map(&confusable_prototype/1)
    else
      Enum.map(s, &confusable_prototype/1)
    end
  end

  # True if any codepoint in `s` has direction :rtl per String.Tokenizer.dir/1.
  defp any_rtl?(s), do: Enum.any?(s, &(:rtl == String.Tokenizer.dir(&1)))

  # Builds the trailing explanation appended to a confusability warning:
  # a per-character direction breakdown for whichever of the two names
  # contains RTL characters (empty interpolation otherwise).
  defp dir_compare(a, b) do
    """
    #{if any_rtl?(a), do: "\n\n" <> dir_breakdown(a)}
    #{if any_rtl?(b), do: dir_breakdown(b)}
    """
  end

  # One line per codepoint: its \uXXXX escape, the character itself, and its
  # direction class, so the user can see where the RTL characters are.
  defp dir_breakdown(s) do
    init = "'#{s}' includes right-to-left characters:\n"

    init <>
      for codepoint <- s, into: "" do
        hex = :io_lib.format(~c"~4.16.0B", [codepoint])
        " \\u#{hex} #{[codepoint]} #{String.Tokenizer.dir(codepoint)}\n"
      end
  end

  # make charlist match visual order by reversing spans of {rtl, neutral}
  # and attaching neutral characters and weak number types according to uax9
  #
  # UTS39-28 4: '[...] if the strings are known not to contain explicit
  # directional formatting characters[...], the algorithm can
  # be drastically simplified, [...], obviating the need for
  # the [...] stack of the [unicode bidi algo]'
  #
  # Accumulator is {pending_neutrals, current_direction, current_span, done}:
  # neutrals are buffered until the next strong/weak character decides which
  # span they join; RTL spans are emitted reversed, LTR spans in order.
  def unbidify(chars) when is_list(chars) do
    {neutrals, direction, last_part, acc} =
      Enum.reduce(chars, {[], :ltr, [], []}, fn head, {neutrals, part_dir, part, acc} ->
        # https://www.unicode.org/reports/tr9/#W2
        case String.Tokenizer.dir(head) do
          # Weak numbers join the current span, pulling buffered neutrals in.
          :weak_number ->
            {[], part_dir, [head] ++ neutrals ++ part, acc}

          # Neutrals are held back until a directional character claims them.
          :neutral ->
            {[head | neutrals], part_dir, part, acc}

          # Same direction as the current span: absorb char plus neutrals.
          ^part_dir ->
            {[], part_dir, [head | neutrals] ++ part, acc}

          # Direction flips rtl->ltr: flush the finished RTL span reversed
          # (visual order) onto acc; neutrals start the new LTR span.
          :ltr when part_dir == :rtl ->
            {[], :ltr, [head | neutrals], Enum.reverse(part, acc)}

          # Direction flips ltr->rtl: flush LTR span and its trailing
          # neutrals as-is onto acc; the new RTL span starts with `head`.
          :rtl when part_dir == :ltr ->
            {[], :rtl, [head], neutrals ++ part ++ acc}
        end
      end)

    # Flush the final span. Spans were prepended onto `acc`, so it is
    # reversed once here to restore overall order; a trailing LTR span is
    # itself reversed back to logical order, a trailing RTL span is kept
    # reversed (i.e. in visual order).
    case direction do
      :ltr -> Enum.reverse(acc, Enum.reverse(neutrals ++ last_part))
      :rtl -> Enum.reverse(acc, neutrals ++ last_part)
    end
  end
end