File: external.rb

package info
ruby-html-proofer 5.2.0-2
  • area: main
  • in suites: forky, sid
  • size: 22,524 kB
  • sloc: ruby: 4,389; sh: 8; makefile: 4; javascript: 1; php: 1
file content: 229 lines, 8,320 bytes

# frozen_string_literal: true

require "typhoeus"
require "open-uri"
# require "uri"
require "pdf-reader"

module HTMLProofer
  class UrlValidator
    class External < UrlValidator
      include HTMLProofer::Utils

      attr_reader :external_urls
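      # Callbacks assigned here run against each Typhoeus::Request before it is
      # queued, so callers can tweak requests (extra headers, auth, and so on).
      # A rough usage sketch; the variable name and header value are
      # illustrative only:
      #
      #   validator.before_request = [
      #     ->(request) { request.options[:headers]["Cookie"] = "tasty" },
      #   ]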
      attr_writer :before_request

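      # Holds on to the map of external URLs to their metadata and sets up a
      # Typhoeus::Hydra queue using the runner's :hydra options.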
      def initialize(runner, external_urls)
        super(runner)

        @external_urls = external_urls
        @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
        @before_request = []

        @paths_with_queries = {}
      end

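      # Checks every external URL (or only the cache misses when the external
      # cache is enabled) and returns the accumulated failures.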
      def validate
        urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
        urls_detected = pluralize(urls_to_check.count, "external link", "external links")
        @logger.log(:info, "Checking #{urls_detected}")

        run_external_link_checker(urls_to_check)

        @failed_checks
      end

      # Proofer runs faster if we pull out all the external URLs and run the checks
      # at the end. Otherwise, we'd be blocking the rest of `process_files` on the
      # network for every file.
      #
      # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
      #
      # Finally, we first make a HEAD request rather than GETting the full contents.
      # If the HEAD fails, we fall back to GET, as some servers are not configured
      # for HEAD. If we've been asked to check hashes, we must use GET, since HEAD
      # returns no body to inspect.
      def run_external_link_checker(external_urls)
        # Route Typhoeus/Ethon logging through our own logger
        Ethon.logger = @logger

        external_urls.each_pair do |external_url, metadata|
          url = Attribute::Url.new(@runner, external_url, base_url: nil)

          unless url.valid?
            add_failure(metadata, "#{url} is an invalid URL", 0)
            next
          end

          next unless new_url_query_values?(url)

          method = if @runner.options[:check_external_hash] && url.hash?
            :get
          else
            :head
          end

          queue_request(method, url, metadata)
        end

        @hydra.run
      end

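      # Builds a Typhoeus::Request using the runner's :typhoeus options, gives
      # each registered before_request callback a chance to modify it, then
      # queues it on the Hydra with a completion handler.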
      def queue_request(method, url, filenames)
        opts = @runner.options[:typhoeus].merge(method: method)
        request = Typhoeus::Request.new(url.url, opts)
        @before_request.each do |callback|
          callback.call(request)
        end
        request.on_complete { |response| response_handler(response, url, filenames) }
        @hydra.queue(request)
      end

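      # Inspects a completed response: 2xx responses are cached as successes
      # (after an optional hash check), timeouts and code-0 connection failures
      # get dedicated handling, failed HEAD requests are retried as GET, and
      # anything else is recorded as a failure.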
      def response_handler(response, url, filenames)
        method = response.request.options[:method]
        href = response.request.base_url.to_s
        response_code = response.code
        response.body.delete!("\x00")

        @logger.log(:debug, "Received a #{response_code} for #{href}")

        return if @runner.options[:ignore_status_codes].include?(response_code)

        if response_code.between?(200, 299)
          @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(
            href,
            url,
            response,
            filenames,
          )
        elsif response.timed_out?
          handle_timeout(href, filenames, response_code)
        elsif response_code.zero?
          handle_connection_failure(href, filenames, response_code, response.status_message)
        elsif method == :head # some servers don't support HEAD
          queue_request(:get, url, filenames)
        else
          return if @runner.options[:only_4xx] && !response_code.between?(400, 499)

          # Received a non-successful http response.
          status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
          msg = "External link #{href} failed#{status_message}"
          add_failure(filenames, msg, response_code)
          @cache.add_external(href, filenames, response_code, msg, false)
        end
      end

      # Even though the response was a success, we may have been asked to check
      # if the hash on the URL exists on the page
      def check_hash_in_2xx_response(href, url, response, filenames)
        return false if @runner.options[:only_4xx]
        return false unless @runner.options[:check_external_hash]
        return false unless url.hash?

        hash = url.hash
        headers = response.options.fetch(:headers, {})
        content_type = headers.find { |k, _| k.casecmp("content-type").zero? }

        # attempt to verify PDF hash ref; see #787 for more details
        # FIXME: this is re-reading the PDF response
        if content_type && content_type[1].include?("pdf")
          io = URI.parse(url.to_s).open
          reader = PDF::Reader.new(io)

          pages = reader.pages
          if hash =~ /\Apage=(\d+)\z/
            page = Regexp.last_match[1].to_i

            unless pages[page - 1]
              msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
              add_failure(filenames, msg, response.code)
              @cache.add_external(href, filenames, response.code, msg, false)
            end

            return true
          end
        end

        body_doc = create_nokogiri(response.body)

        unencoded_hash = Addressable::URI.unescape(hash)
        xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
        # user-content is a special addition by GitHub.
        if url.host =~ /github\.com/i
          xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
          # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
          # will be identified as a linkable portion
          xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
        end

        return unless body_doc.xpath(xpath.join("|")).empty?

        msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
        add_failure(filenames, msg, response.code)
        @cache.add_external(href, filenames, response.code, msg, false)
        true
      end

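      # Records a timed-out request in the cache and, unless only 4xx errors
      # are being reported, adds it to the failed checks.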
      def handle_timeout(href, filenames, response_code)
        msg = "External link #{href} failed: got a time out (response code #{response_code})"
        @cache.add_external(href, filenames, 0, msg, false)
        return if @runner.options[:only_4xx]

        add_failure(filenames, msg, response_code)
      end

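      # Handles responses with a response code of zero, which typically means
      # libcurl never got an HTTP response at all (DNS failure, refused
      # connection, TLS error, and so on).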
      def handle_connection_failure(href, metadata, response_code, status_message)
        msgs = [<<~MSG,
          External link #{href} failed with something very wrong.
          It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
          Sometimes, making too many requests at once also breaks things.
        MSG
        ]

        msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)

        msg = msgs.join("\n").chomp

        @cache.add_external(href, metadata, 0, msg, false)
        return if @runner.options[:only_4xx]

        add_failure(metadata, msg, response_code)
      end

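      # Records a Failure for each file that referenced the URL; if no metadata
      # is available (e.g. when checking a bare array of links), a single
      # filename-less Failure is recorded instead.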
      def add_failure(metadata, description, status = nil)
        if blank?(metadata) # possible if we're checking an array of links
          @failed_checks << Failure.new("", "Links > External", description, status: status)
        else
          metadata.each do |m|
            @failed_checks << Failure.new(
              m[:filename],
              "Links > External",
              description,
              line: m[:line],
              status: status,
              content: m[:element]&.content,
              element: m[:element],
            )
          end
        end
      end

      # Remember the query-parameter combinations we've already seen for each
      # domain/path, and skip URLs that only repeat a known combination.
      private def new_url_query_values?(url)
        return true if (query_values = url.query_values).nil?

        queries = query_values.keys.join("-")
        domain_path = url.domain_path
        if @paths_with_queries[domain_path].nil?
          @paths_with_queries[domain_path] = [queries]
          true
        elsif !@paths_with_queries[domain_path].include?(queries)
          @paths_with_queries[domain_path] << queries
          true
        else
          false
        end
      end
    end
  end
end