File: html_parser.ex

package info (click to toggle)
elixir-earmark-parser 1.4.44-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,148 kB
  • sloc: makefile: 9
file content (95 lines) | stat: -rw-r--r-- 2,783 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
defmodule EarmarkParser.Helpers.HtmlParser do
  @moduledoc false

  import EarmarkParser.Helpers.StringHelpers, only: [behead: 2]
  import EarmarkParser.LineScanner, only: [void_tag?: 1]

  # Meta map placed in the fourth slot of every node tuple built by `_tag_append/2`.
  @verbatim %{verbatim: true}

  # Entry point
  # -----------

  # Turns the lines of an HTML block into a list of nodes.
  #
  # The first line has to begin (after optional whitespace) with an opening
  # tag; the remaining lines become the content of that tag.
  #
  # The first element of the returned list is whatever `_parse_rest/3` yields:
  # a `{tag, atts, lines, %{verbatim: true}}` tuple, or a two element list
  # `[tuple, suffix]` when text follows the closing tag on the last line.
  # For a self closing or void tag with trailing text on the tag line, that
  # text is returned as a second element, wrapped in its own list.
  def parse_html(lines)

  def parse_html([tag_line | rest]) do
    case _parse_tag(tag_line) do
      # Nothing after the opening tag: the content comes from `rest` only.
      {kind, tag, ""} when kind in [:ok, :ext] ->
        [_parse_rest(rest, tag, [])]

      # Text after a regular opening tag is the first content line.
      {:ok, tag, suffix} ->
        [_parse_rest(rest, tag, [suffix])]

      # Text after a self closing / void tag is kept outside of the node.
      {:ext, tag, suffix} ->
        [_parse_rest(rest, tag, []), [suffix]]
    end
  end

  # Parse One Tag
  # -------------

  # Are leading and trailing "-"s ok?
  # Matches the tag head (`<name` plus surrounding whitespace) and hands the
  # remainder of the line to the attribute parser. There is deliberately only
  # one clause: a line without a tag head raises.
  defp _parse_tag(string) do
    tag_head = ~r{\A \s* <([-\w]+) \s*}x

    case Regex.run(tag_head, string) do
      [opening, tag] ->
        string
        |> behead(opening)
        |> _parse_atts(tag, [])
    end
  end

  # Consumes attributes from the front of `string`, one per round, pushing
  # `{name, value}` pairs onto `atts` (newest first). Quoted values are tried
  # before unquoted ones; once neither form matches, the tag tail is parsed.
  defp _parse_atts(string, tag, atts) do
    quoted_attr = ~r{\A ([-\w]+) \s* = \s* (["']) (.*?) \2 \s*}x

    quoted_attr
    |> Regex.run(string)
    |> _quoted_att(string, tag, atts)
  end

  # A quoted attribute was found: record it and continue behind it.
  defp _quoted_att([consumed, name, _quote, value], string, tag, atts) do
    string
    |> behead(consumed)
    |> _parse_atts(tag, [{name, value} | atts])
  end

  # No quoted attribute at this position: try the unquoted / bare form.
  defp _quoted_att(_no_match, string, tag, atts) do
    unquoted_attr = ~r{\A ([-\w]+) (?: \s* = \s* ([^&\s>]*))? \s*}x

    unquoted_attr
    |> Regex.run(string)
    |> _unquoted_att(string, tag, atts)
  end

  # `name=value` without quotes.
  defp _unquoted_att([consumed, name, value], string, tag, atts) do
    string
    |> behead(consumed)
    |> _parse_atts(tag, [{name, value} | atts])
  end

  # A bare `name` without `=`: the name doubles as its own value.
  defp _unquoted_att([consumed, name], string, tag, atts) do
    string
    |> behead(consumed)
    |> _parse_atts(tag, [{name, name} | atts])
  end

  # No attribute left at all: what remains is the end of the tag.
  defp _unquoted_att(_no_match, string, tag, atts) do
    _parse_tag_tail(string, tag, atts)
  end

  # Parses the end of the opening tag (an optional `/` followed by `>`) and
  # whatever follows it on the same line.
  #
  # Returns `{:ext, {tag, atts}, suffix}` when the tag is self closing or
  # `void_tag?/1` holds for it, `{:ok, {tag, atts}, suffix}` otherwise, with
  # `atts` back in source order. There is only one clause: a remainder
  # without `>` raises.
  defp _parse_tag_tail(string, tag, atts) do
    tag_tail = ~r{\A .*? (/?)> \s* (.*) \z}x

    case Regex.run(tag_tail, string) do
      [_whole, slash, trailing] ->
        # Drop a closing tag on the same line together with all text after it.
        content = String.replace(trailing, ~r{\s*</#{tag}>.*}, "")
        kind = if slash != "" || void_tag?(tag), do: :ext, else: :ok

        {kind, {tag, Enum.reverse(atts)}, content}
    end
  end

  # Iterate over lines inside a tag
  # -------------------------------

  # Collects the lines following the tag line into the node for `tag_tpl`.
  # `lines` holds the content gathered so far in reverse order. Only the very
  # last line is checked for the closing tag: it is dropped when it consists
  # of the closing tag alone, split off as a trailing suffix when text follows
  # the closing tag, and kept as ordinary content otherwise.
  defp _parse_rest([], tag_tpl, lines), do: _tag_append(tag_tpl, lines)

  defp _parse_rest(rest, {tag, _atts} = tag_tpl, lines) do
    {inner_lines, [last_line]} = Enum.split(rest, -1)
    # Same result as pushing every inner line onto `lines` one by one.
    collected = Enum.reverse(inner_lines, lines)
    closing_tag = ~r{\A\s*</#{tag}>\s*(.*)}

    case Regex.run(closing_tag, last_line) do
      nil -> _tag_append(tag_tpl, [last_line | collected])
      [_, ""] -> _tag_append(tag_tpl, collected)
      [_, suffix] -> [_tag_append(tag_tpl, collected), suffix]
    end
  end

  # Extends the `{tag, atts}` tuple to `{tag, atts, lines, %{verbatim: true}}`,
  # putting the reversed accumulator `lines` back into source order.
  defp _tag_append(tag_tpl, lines) do
    content = Enum.reverse(lines)
    with_content = Tuple.insert_at(tag_tpl, 2, content)

    Tuple.insert_at(with_content, 3, @verbatim)
  end
end

# SPDX-License-Identifier: Apache-2.0