File: security.ex

package info (click to toggle)
elixir-lang 1.19.5.dfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,524 kB
  • sloc: erlang: 12,234; sh: 321; makefile: 288
file content (193 lines) | stat: -rw-r--r-- 6,229 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2021 The Elixir Team

defmodule String.Tokenizer.Security do
  @moduledoc false

  # UTS39 security checks that operate on all tokens in a file,
  # like Confusables. If we add whole-file mixed-script-confusable-characters
  # checks we can add them to the list of lints here
  #
  # Reference: Unicode Technical Standard #39, "Unicode Security Mechanisms",
  # section 4 (confusable detection via skeletons).

  # Entry point: run every whole-file lint (currently just the confusable
  # check) over the token stream and format each hit for the compiler.
  def unicode_lint_warnings(tokens) do
    for warning <- confusables(tokens),
        do: format_warning(warning)
  end

  # Convert a {token, reason} pair into the {{line, col}, charlist}
  # shape consumed by the caller; position is taken from the token's
  # location tuple.
  defp format_warning({token, reason}) do
    {_, {line, col, _}, _} = token
    {{line, col}, to_charlist(reason)}
  end

  ## Confusables

  # Fold over all tokens, keeping a map of skeleton => token info for the
  # first identifier seen with each skeleton. A later identifier whose
  # skeleton collides with an earlier, differently-spelled one produces a
  # warning. Warnings accumulate in reverse token order (prepended).
  defp confusables(tokens) do
    {_, warnings} =
      for token <- tokens, reduce: {%{}, []} do
        {skeletons, warnings} ->
          case check_token_for_confusability(token, skeletons) do
            {:ok, skeletons} -> {skeletons, warnings}
            {:warn, reason} -> {skeletons, [{token, reason} | warnings]}
          end
      end

    warnings
  end

  # Token kinds that carry a user-written, identifier-like name and are
  # therefore worth checking for confusability.
  @identifiers [
    :identifier,
    :op_identifier,
    :kw_identifier,
    :paren_identifier,
    :bracket_identifier,
    :alias,
    :atom
  ]

  # Identifier-like token with a non-empty charlist name: compute its
  # skeleton and compare against previously seen identifiers.
  defp check_token_for_confusability(
         {kind, {_line, _column, [_ | _] = name} = info, _},
         skeletons
       )
       when kind in @identifiers do
    skeleton = confusable_skeleton(name)

    case skeletons[skeleton] do
      # exact same spelling as the earlier occurrence: not confusable
      {_, _, ^name} ->
        {:ok, skeletons}

      # same skeleton, different spelling: the two names look alike
      # but use different characters -> warn, pointing at the earlier line
      {line, _, previous_name} when name != previous_name ->
        {:warn,
         "confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <>
           "but they are written using different characters" <> dir_compare(name, previous_name)}

      # first sighting of this skeleton: remember where it occurred
      _ ->
        {:ok, Map.put(skeletons, skeleton, info)}
    end
  end

  # Non-identifier tokens (operators, literals, ...) are skipped.
  defp check_token_for_confusability(_token, skeletons), do: {:ok, skeletons}

  # -- Compile-time generation of the confusable-prototype table --
  #
  # Each data line of confusables.txt has the form:
  # AAAA ;   BBBB CCCC DDDDD ;
  # ^ char   ^ prototypical char or sequence of chars it can be confused with
  confusables_path = "confusables.txt"

  # Read the UTS39 data file (shipped next to this source file) at
  # compile time; accept both LF and CRLF line endings.
  lines =
    Path.join(__DIR__, confusables_path)
    |> File.read!()
    |> String.split(["\r\n", "\n"], trim: true)

  # Capture the confusable codepoint(s) and their prototype sequence;
  # non-matching lines (comments, blanks) yield nil and are not iterated
  # below because the generator pattern [_, _] won't match them.
  regex = ~r/^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);/u
  matches = Enum.map(lines, &Regex.run(regex, &1, capture: :all_but_first))

  # Build codepoint => prototype-codepoint-list, keeping only the first
  # mapping per codepoint and skipping plain ASCII alphanumerics (they
  # can never be reported as confusable with themselves).
  confusable_prototype_lookup =
    for [confusable_str, prototype_str] <- matches, reduce: %{} do
      acc ->
        confusable = String.to_integer(String.trim(confusable_str), 16)

        if Map.has_key?(acc, confusable) or
             confusable in ?A..?Z or confusable in ?a..?z or confusable in ?0..?9 do
          acc
        else
          prototype =
            prototype_str
            |> String.split(" ", trim: true)
            |> Enum.map(&String.to_integer(&1, 16))

          Map.put(acc, confusable, prototype)
        end
    end

  # Unroll the lookup table into one function clause per confusable
  # codepoint, so lookups compile to a pattern-match dispatch.
  for {confusable, prototype} <- confusable_prototype_lookup do
    defp confusable_prototype(unquote(confusable)) do
      unquote(prototype)
    end
  end

  # Codepoints absent from the table are their own prototype.
  # NOTE(review): this clause returns a utf8 binary while the generated
  # clauses return lists of integers; both are valid chardata for the
  # :unicode calls downstream.
  defp confusable_prototype(other), do: <<other::utf8>>

  def confusable_skeleton(s) do
    # "- Convert X to NFD format, as described in [UAX15].
    #  - Concatenate the prototypes for each character in X according to
    #    the specified data, producing a string of exemplar characters.
    #  - Reapply NFD." (UTS 39 section 4, skeleton definition)
    :unicode.characters_to_nfd_list(s)
    |> bidi_skeleton()
    |> :unicode.characters_to_nfd_list()
  end

  # Unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that
  # aren't confusable LTR *are* confusable in most places human review
  # occurs (editors/browsers, thanks to bidi algo, UAX9).
  #
  # The solution is to detect spans with reversed visual direction,
  # and reverse those, so that the input we check for confusability
  # matches the perceived sequence instead of the byte sequence.
  #
  # (we need this regardless of script mixing, because direction-neutral
  # chars like _ or 0..9 can mix w/RTL chars).
  def bidi_skeleton(s) do
    # UTS39-28 4:
    #
    # Bidirectional confusability is costlier to check than
    # confusability, as [unicode bidi algo] must be applied.
    # [...] a fast path can be used: [...] if X has no characters
    # w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X)
    #
    # match?([_, _ | _], s): a single-codepoint name cannot have a
    # reordered span, so it also takes the fast path.
    if match?([_, _ | _], s) and any_rtl?(s) do
      unbidify(s) |> Enum.map(&confusable_prototype/1)
    else
      Enum.map(s, &confusable_prototype/1)
    end
  end

  # True if any codepoint in the charlist has bidi direction :rtl
  # according to String.Tokenizer.dir/1.
  defp any_rtl?(s), do: Enum.any?(s, &(:rtl == String.Tokenizer.dir(&1)))

  # Appends a per-character direction breakdown to the warning message
  # for whichever of the two names contains RTL characters.
  defp dir_compare(a, b) do
    """
    #{if any_rtl?(a), do: "\n\n" <> dir_breakdown(a)}
    #{if any_rtl?(b), do: dir_breakdown(b)}
    """
  end

  # One line per codepoint: its \uXXXX escape, the character itself,
  # and its bidi direction, for human inspection of the warning.
  defp dir_breakdown(s) do
    init = "'#{s}' includes right-to-left characters:\n"

    init <>
      for codepoint <- s, into: "" do
        hex = :io_lib.format(~c"~4.16.0B", [codepoint])
        "  \\u#{hex} #{[codepoint]} #{String.Tokenizer.dir(codepoint)}\n"
      end
  end

  # make charlist match visual order by reversing spans of {rtl, neutral}
  # and attaching neutral characters and weak number types according to uax9
  #
  #  UTS39-28 4: '[...] if the strings are known not to contain explicit
  #   directional formatting characters[...], the algorithm can
  #   be drastically simplified, [...], obviating the need for
  #   the [...] stack of the [unicode bidi algo]'
  #
  # Accumulator shape: {neutrals, part_dir, part, acc} where
  #   neutrals - buffered direction-neutral chars not yet attached to a run
  #   part_dir - direction (:ltr | :rtl) of the run currently being built
  #   part     - current run, built by prepending (i.e. stored reversed)
  #   acc      - already-flushed output, also built reversed
  def unbidify(chars) when is_list(chars) do
    {neutrals, direction, last_part, acc} =
      Enum.reduce(chars, {[], :ltr, [], []}, fn head, {neutrals, part_dir, part, acc} ->
        # https://www.unicode.org/reports/tr9/#W2
        case String.Tokenizer.dir(head) do
          # weak numbers join the current run, dragging any pending
          # neutrals into it as well (UAX9 rule W2 treatment)
          :weak_number ->
            {[], part_dir, [head] ++ neutrals ++ part, acc}

          # neutrals are buffered until the next strong character
          # decides which run they belong to
          :neutral ->
            {[head | neutrals], part_dir, part, acc}

          # same strong direction as the current run: absorb head and
          # the buffered neutrals into the run
          ^part_dir ->
            {[], part_dir, [head | neutrals] ++ part, acc}

          # direction flips rtl -> ltr: the finished rtl run is flushed
          # reversed (restoring visual order); buffered neutrals start
          # the new ltr run together with head
          :ltr when part_dir == :rtl ->
            {[], :ltr, [head | neutrals], Enum.reverse(part, acc)}

          # direction flips ltr -> rtl: the finished ltr run is flushed
          # as-is (it was built reversed, matching acc's reversed order);
          # pending neutrals stay attached to the ltr side
          :rtl when part_dir == :ltr ->
            {[], :rtl, [head], neutrals ++ part ++ acc}
        end
      end)

    # Flush the final run: an rtl tail is left reversed inside the
    # reversed acc (net effect: visual order), an ltr tail is restored.
    case direction do
      :ltr -> Enum.reverse(acc, Enum.reverse(neutrals ++ last_part))
      :rtl -> Enum.reverse(acc, neutrals ++ last_part)
    end
  end
end