File: internal.rb

package info (click to toggle)
ruby-html-proofer 5.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 22,524 kB
  • sloc: ruby: 4,389; sh: 8; makefile: 4; javascript: 1; php: 1
file content (149 lines) | stat: -rw-r--r-- 5,899 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# frozen_string_literal: true

module HTMLProofer
  class UrlValidator
    class Internal < UrlValidator
      attr_reader :internal_urls

      def initialize(runner, internal_urls)
        super(runner)

        @internal_urls = internal_urls
      end

      def validate
        urls_to_check = @cache.internal_enabled? ? @runner.load_internal_cache : @internal_urls
        urls_detected = pluralize(urls_to_check.count, "internal link", "internal links")
        @logger.log(:info, "Checking #{urls_detected}")

        run_internal_link_checker(urls_to_check)

        @failed_checks
      end

      def run_internal_link_checker(links)
        # collect urls and metadata for hashes to be checked in the same target file
        file_paths_hashes_to_check = {}
        to_add = []
        links.each_with_index do |(link, matched_files), i|
          matched_count_to_log = pluralize(matched_files.count, "reference", "references")
          @logger.log(:debug, "(#{i + 1} / #{links.count}) Internal link #{link}: Checking #{matched_count_to_log}")
          matched_files.each do |metadata|
            url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url], source: metadata[:source], filename: metadata[:filename])

            unless url.exists?
              @failed_checks << Failure.new(
                metadata[:filename],
                "Links > Internal",
                "internally linking to #{url}, which does not exist",
                line: metadata[:line],
                status: nil,
                content: metadata[:element]&.content,
                element: metadata[:element],
              )
              to_add << [url, metadata, false]
              next
            end

            hash_exists = hash_exists_for_url?(url)
            if hash_exists.nil?
              # the hash needs to be checked in the target file, we collect the url and metadata
              target_file_path = url.resolved_path
              unless file_paths_hashes_to_check.key?(target_file_path)
                file_paths_hashes_to_check[target_file_path] = {}
              end
              unless file_paths_hashes_to_check[target_file_path].key?(url.hash)
                file_paths_hashes_to_check[target_file_path][url.hash] = []
              end
              file_paths_hashes_to_check[target_file_path][url.hash] << [url, metadata]
              next
            end
            unless hash_exists
              @failed_checks << Failure.new(
                metadata[:filename],
                "Links > Internal",
                "internally linking to #{url}; the file exists, but the hash '#{url.hash}' does not",
                line: metadata[:line],
                status: nil,
                content: metadata[:element]&.content,
                element: metadata[:element],
              )
              to_add << [url, metadata, false]
              next
            end

            to_add << [url, metadata, true]
          end
        end

        # check hashes by target file
        @logger.log(:info, "Checking internal link hashes in #{pluralize(file_paths_hashes_to_check.count, "file", "files")}")
        file_paths_hashes_to_check.each_with_index do |(file_path, hashes_to_check), i|
          hash_count_to_log = pluralize(hashes_to_check.count, "hash", "hashes")
          @logger.log(:debug, "(#{i + 1} / #{file_paths_hashes_to_check.count}) Checking #{hash_count_to_log} in #{file_path}")
          html = create_nokogiri(file_path)
          hashes_to_check.each_pair do |href_hash, url_metadata|
            exists = hash_exists_in_html?(href_hash, html)
            url_metadata.each do |(url, metadata)|
              unless exists
                @failed_checks << Failure.new(
                  metadata[:filename],
                  "Links > Internal",
                  "internally linking to #{url}; the file exists, but the hash '#{href_hash}' does not",
                  line: metadata[:line],
                  status: nil,
                  content: metadata[:element]&.content,
                  element: metadata[:element],
                )
              end
              to_add << [url, metadata, exists]
            end
          end
        end

        # adding directly to the cache above results in an endless loop
        to_add.each do |(url, metadata, exists)|
          @cache.add_internal(url.to_s, metadata, exists)
        end

        @failed_checks
      end

      # verify the hash w/o just based on the URL, w/o looking at the target file
      # => returns nil if the has could not be verified
      private def hash_exists_for_url?(url)
        href_hash = url.hash
        return true if blank?(href_hash)
        return true unless @runner.options[:check_internal_hash]

        # prevents searching files we didn't ask about
        return false unless url.known_extension?
        return false unless url.has_hash?

        decoded_href_hash = Addressable::URI.unescape(href_hash)
        fragment_ids = [href_hash, decoded_href_hash]
        # https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid
        true if fragment_ids.include?("top")
      end

      private def hash_exists_in_html?(href_hash, html)
        decoded_href_hash = Addressable::URI.unescape(href_hash)
        fragment_ids = [href_hash, decoded_href_hash]
        !find_fragments(fragment_ids, html).empty?
      end

      private def find_fragments(fragment_ids, html)
        xpaths = fragment_ids.uniq.flat_map do |frag_id|
          escaped_frag_id = "'#{frag_id.split("'").join("', \"'\", '")}', ''"
          [
            "//*[case_sensitive_equals(@id, concat(#{escaped_frag_id}))]",
            "//*[case_sensitive_equals(@name, concat(#{escaped_frag_id}))]",
          ]
        end
        xpaths << XpathFunctions.new

        html.xpath(*xpaths)
      end
    end
  end
end