File: links.rb

package info (click to toggle)
ruby-html-proofer 3.19.4-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,040 kB
  • sloc: ruby: 3,203; sh: 9; makefile: 4; javascript: 1; php: 1
file content (182 lines) | stat: -rw-r--r-- 6,153 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# frozen_string_literal: true

class LinkCheck < ::HTMLProofer::Check
  include HTMLProofer::Utils

  # Tells whether the element currently under inspection carries nothing
  # that could serve as a link target.
  #
  # A <source> node is judged on its `src` alone. Any other node is treated
  # as missing an href only when `href`, `name` and `id` are all blank.
  def missing_href?
    if @node.name == 'source'
      blank?(@link.src)
    else
      # attributes are read lazily, in the same order, stopping at the first non-blank one
      %i[href name id].all? { |attribute| blank?(@link.public_send(attribute)) }
    end
  end

  # Tells whether the current element is a pure anchor target: it has no
  # `href` at all (nil, not merely empty) but does carry an `id` or a `name`.
  def placeholder?
    return false unless @link.href.nil?

    !(blank?(@link.id) && blank?(@link.name))
  end

  # Entry point of the check: walks every <a>, <link> and <source> node of
  # @html, records an issue for each problem found, and queues the remaining
  # links through add_to_external_urls / add_to_internal_urls.
  #
  # The order of the guards below matters (see the inline notes); each
  # `next` abandons the current node and moves on to the following one.
  #
  # Returns the value of `external_urls`, which is defined outside this file.
  def run
    @html.css('a, link, source').each do |node|
      @link = create_element(node)
      line = node.line
      content = node.to_s

      next if @link.ignore?

      # anchors that only define an id/name target are not links to verify
      next if placeholder?
      next if @link.allow_hash_href? && @link.href == '#'

      # is it even a valid URL?
      unless @link.valid?
        add_issue("#{@link.href} is an invalid URL", line: line, content: content)
        next
      end

      # scheme-specific checks (mailto, tel, http) may add issues but never skip the node
      check_schemes(@link, line, content)

      # is there even an href?
      if missing_href?
        next if @link.allow_missing_href?
        # HTML5 allows dropping the href: http://git.io/vBX0z
        # (skipped when the document has no internal subset, or one named 'html' with no external id)
        next if @html.internal_subset.nil? || (@html.internal_subset.name == 'html' && @html.internal_subset.external_id.nil?)

        add_issue('anchor has no href attribute', line: line, content: content)
        next
      end

      # intentionally here because we still want valid? & missing_href? to execute
      next if @link.non_http_remote?

      if !@link.href&.start_with?('#') && !@link.internal? && @link.remote?
        # SRI is only checked for <link> elements, and only when enabled on the element
        check_sri(line, content) if @link.check_sri? && node.name == 'link'
        # we need to skip these for now; although the domain may be valid,
        # curl/Typhoeus inaccurately returns 404s for some links. cc https://git.io/vyCFx
        next if @link.respond_to?(:rel) && @link.rel == 'dns-prefetch'

        unless @link.path?
          add_issue("#{@link.href} is an invalid URL", line: line, content: content)
          next
        end

        # <source> elements have no href, hence the fallback to src
        add_to_external_urls(@link.href || @link.src)
        next
      elsif @link.internal?
        add_to_internal_urls(@link.href, InternalLink.new(@link, @path, line, content))
        # a missing target is reported here only when the link has no fragment
        add_issue("internally linking to #{@link.href}, which does not exist", line: line, content: content) if !@link.exists? && !@link.hash
      end
    end

    external_urls
  end

  # Validates one internal link.
  #
  # Adds an issue and returns false when the link points at a directory
  # without a trailing slash. Otherwise returns true for links that have no
  # fragment, or the result of handle_hash for links that do.
  def check_internal_link(link, path, line, content)
    unless link.unslashed_directory?(link.absolute_path)
      # nothing wrong with the path itself; only a fragment, if any, is left to verify
      return link.hash ? handle_hash(link, path, line, content) : true
    end

    add_issue("internally linking to a directory #{link.absolute_path} without trailing slash", path: path, line: line, content: content)
    false
  end

  # Runs the scheme-specific validation for a link.
  #
  # 'mailto' and 'tel' are delegated to their handlers. 'http' yields an
  # issue only when the :enforce_https option is set. Every other scheme
  # (or none) is left alone and the method returns nil.
  def check_schemes(link, line, content)
    scheme = link.scheme

    if scheme == 'mailto'
      handle_mailto(link, line, content)
    elsif scheme == 'tel'
      handle_tel(link, line, content)
    elsif scheme == 'http' && @options[:enforce_https]
      add_issue("#{link.href} is not an HTTPS link", line: line, content: content)
    end
  end

  # Validates a mailto: link.
  #
  # An empty path is reported as "no email address" unless the link opts out
  # through ignore_empty_mailto?. A non-empty path without an '@' is reported
  # as an invalid address. Anything else passes silently.
  def handle_mailto(link, line, content)
    recipient = link.path

    if recipient.empty?
      return if link.ignore_empty_mailto?

      add_issue("#{link.href} contains no email address", line: line, content: content)
    else
      add_issue("#{link.href} contains an invalid email address", line: line, content: content) unless recipient.include?('@')
    end
  end

  # Validates a tel: link: an empty path is reported as a missing phone
  # number; any non-empty path is accepted without further inspection.
  def handle_tel(link, line, content)
    return unless link.path.empty?

    add_issue("#{link.href} contains no phone number", line: line, content: content)
  end

  # Verifies the fragment of a link.
  #
  # For an internal link whose fragment is absent from link.html, an issue
  # is added and the result of add_issue is returned. Otherwise, an external
  # link is handed to external_link_check. When neither applies the method
  # returns true.
  def handle_hash(link, path, line, content)
    dangling_fragment = link.internal? && !hash_exists?(link.html, link.hash)
    return add_issue("linking to internal hash ##{link.hash} that does not exist", path: path, line: line, content: content) if dangling_fragment
    return external_link_check(link, line, content) if link.external?

    true
  end

  # Checks the fragment of a link whose target is another document.
  #
  # When link.exists? is false an issue is added straight away. Otherwise the
  # document at link.absolute_path is parsed with create_nokogiri and the
  # fragment is looked up in it. Returns true when the fragment is found,
  # otherwise whatever add_issue returns.
  def external_link_check(link, line, content)
    unless link.exists?
      return add_issue("trying to find hash of #{link.href}, but #{link.absolute_path} does not exist", line: line, content: content)
    end

    target_html = create_nokogiri(link.absolute_path)
    return true if hash_exists?(target_html, link.hash)

    add_issue("linking to #{link.href}, but #{link.hash} does not exist", line: line, content: content)
  end

  # Tells whether a fragment resolves inside the given document.
  #
  # The fragment is tried both as written and after
  # Addressable::URI.unescape. The literal fragment 'top' always counts as
  # existing (https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid);
  # anything else must be found by find_fragments.
  def hash_exists?(html, href_hash)
    candidates = [href_hash, Addressable::URI.unescape(href_hash)]
    return true if candidates.include?('top')

    !find_fragments(html, candidates).empty?
  end

  # Looks up elements whose `id` or `name` attribute equals one of the given
  # fragment ids, comparing case-sensitively through the custom XPath
  # function supplied by XpathFunctions.
  #
  # html         - object responding to #xpath (the parsed document)
  # fragment_ids - Array of String fragment ids to search for
  #
  # Returns whatever html.xpath returns for the generated expressions.
  def find_fragments(html, fragment_ids)
    xpaths = fragment_ids.flat_map do |frag_id|
      # XPath 1.0 has no escape for quotes inside a string literal, so the id
      # is cut at every apostrophe and reassembled with concat(). The -1 limit
      # keeps trailing empty pieces; without it an id ending in "'" would
      # silently lose that apostrophe and match a different element.
      pieces = frag_id.split("'", -1)
      escaped_frag_id = "'#{pieces.join("', \"'\", '")}', ''"
      [
        "//*[case_sensitive_equals(@id, concat(#{escaped_frag_id}))]",
        "//*[case_sensitive_equals(@name, concat(#{escaped_frag_id}))]"
      ]
    end
    # the handler object must come last so html.xpath can resolve case_sensitive_equals
    xpaths << XpathFunctions.new

    html.xpath(*xpaths)
  end

  # Allow-list of `rel` values covered by the Subresource Integrity specification
  # https://w3c.github.io/webappsec-subresource-integrity/#link-element-for-stylesheets
  #
  # This must be an Array: with a plain String, include? does substring
  # matching (so rel="style" or rel="" would qualify) and raises TypeError
  # when rel is nil.
  SRI_REL_TYPES = %w[stylesheet].freeze

  # Reports missing Subresource Integrity / CORS attributes on the current
  # element (@link) when its `rel` is one of SRI_REL_TYPES.
  #
  # line    - line number passed through to add_issue
  # content - element markup passed through to add_issue
  #
  # Returns nil when nothing needs reporting, otherwise the result of add_issue.
  def check_sri(line, content)
    # elements without a rel attribute may not respond to #rel at all
    return unless @link.respond_to?(:rel) && SRI_REL_TYPES.include?(@link.rel)

    has_integrity = defined?(@link.integrity)
    has_cors = defined?(@link.crossorigin)
    return if has_integrity && has_cors

    # <link> elements carry their URL in href; src is kept as a fallback
    url = @link.href || @link.src

    if !has_integrity && !has_cors
      add_issue("SRI and CORS not provided in: #{url}", line: line, content: content)
    elsif !has_integrity
      add_issue("Integrity is missing in: #{url}", line: line, content: content)
    else
      add_issue("CORS not provided for external resource in: #{url}", line: line, content: content)
    end
  end

  # Handler object passed to html.xpath so that expressions can call
  # case_sensitive_equals(...).
  class XpathFunctions
    # Keeps the members of node_set whose string form is exactly equal
    # (case included) to the string form of str_to_match.
    def case_sensitive_equals(node_set, str_to_match)
      wanted = str_to_match.to_s
      node_set.find_all { |candidate| candidate.to_s == wanted }
    end
  end

  # Read-only record describing one internal link occurrence: the link
  # element, its href (captured at construction time), the path given by the
  # caller, and the line and markup where the link was found.
  class InternalLink
    attr_reader :link, :href, :path, :line, :content

    def initialize(link, path, line, content)
      @link, @path, @line, @content = link, path, line, content
      @href = link.href
    end
  end
end