File: url.rb

package info (click to toggle)
ruby-html-proofer 5.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 22,524 kB
  • sloc: ruby: 4,389; sh: 8; makefile: 4; javascript: 1; php: 1
file content (269 lines) | stat: -rw-r--r-- 6,562 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# frozen_string_literal: true

module HTMLProofer
  class Attribute
    # A URL taken from an attribute value, with helpers to classify it
    # (remote / internal / hash-only ...) and to resolve it against the local
    # filesystem.
    #
    # Relies on names provided elsewhere: +@runner+ and +@raw_attribute+ are
    # presumably assigned by the superclass initializer, and +blank?+ is
    # presumably a shared utility meaning "nil or empty" — verify against
    # HTMLProofer::Attribute.
    class Url < HTMLProofer::Attribute
      attr_reader :url, :size, :source, :filename

      REMOTE_SCHEMES = ["http", "https"].freeze

      # @param runner [Object] object exposing +options+ and +resolved_paths+
      # @param link_attribute [String, nil] raw attribute value (forwarded to the superclass)
      # @param base_url [String, nil] when not blank, the URL is joined onto it
      # @param source [String, nil] stored as-is; used as a fallback root in #full_path
      # @param filename [String, nil] file the link was found in
      # @param extract_size [Boolean] when true, the value is split on whitespace
      #   into a URL and a trailing size token (exposed via #size)
      def initialize(runner, link_attribute, base_url: nil, source: nil, filename: nil, extract_size: false)
        super

        @source = source
        @filename = filename

        if @raw_attribute.nil?
          @url = nil
        else
          # drop zero-width spaces, then surrounding whitespace
          @url = @raw_attribute.delete("\u200b").strip
          @url, @size = @url.split(/\s+/) if extract_size
          # Splitting an empty string yields no tokens and leaves @url nil.
          # Normalize to "" *before* joining so nil is never handed to
          # Addressable::URI.join.
          @url = "" if @url.nil?
          @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)

          swap_urls!
          clean_url!
        end
      end

      # @return [Boolean] true when the URL starts with "//"
      def protocol_relative?
        url.start_with?("//")
      end

      # @return [String, nil] the processed URL
      def to_s
        @url
      end

      # Whether the link targets a file type listed in options[:extensions].
      # Hash-only links and paths ending in "/" are always considered known.
      def known_extension?
        return true if hash_link?
        return true if path.end_with?("/")

        ext = File.extname(path)

        # no extension means we use the assumed one
        return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)

        @runner.options[:extensions].include?(ext)
      end

      def unknown_extension?
        !known_extension?
      end

      # @return [Boolean] true for javascript: URLs and for raw attribute values
      #   matched by an entry of options[:ignore_urls]
      def ignore?
        return true if /^javascript:/.match?(@url)

        ignores_pattern?(@runner.options[:ignore_urls])
      end

      # @return [Boolean] true when the URL could be parsed
      def valid?
        !parts.nil?
      end

      # @return [Boolean] true when the parsed URL has both a host and a path;
      #   false when the URL could not be parsed at all
      def path?
        parsed = parts
        return false if parsed.nil?

        !parsed.host.nil? && !parsed.path.nil?
      end

      # Parsed form of the URL, memoized on success.
      #
      # @return [Addressable::URI, nil] nil when parsing raises
      def parts
        @parts ||= Addressable::URI.parse(@url)
      rescue URI::Error, Addressable::URI::InvalidURIError
        @parts = nil
      end

      # @return [String, nil] unencoded path component, nil when the URL is unparseable
      def path
        Addressable::URI.unencode(parts.path) unless parts.nil?
      end

      # The URL fragment (the text after "#").
      #
      # NOTE(review): this overrides Object#hash; confirm instances are never
      # used as Hash keys or Set members.
      def hash
        parts&.fragment
      end

      # Does the URL have a hash?
      def hash?
        !blank?(hash)
      end

      def scheme
        parts&.scheme
      end

      # @return [Boolean] true when the scheme is one of REMOTE_SCHEMES
      def remote?
        REMOTE_SCHEMES.include?(scheme)
      end

      def http?
        scheme == "http"
      end

      def https?
        scheme == "https"
      end

      # @return [Boolean] true when a scheme is present but is not http(s)
      def non_http_remote?
        !scheme.nil? && !remote?
      end

      def host
        parts&.host
      end

      # @return [String] host immediately followed by path; a missing host or
      #   path contributes an empty string
      def domain_path
        "#{host}#{path}"
      end

      def query_values
        parts&.query_values
      end

      # checks if a file exists relative to the current pwd
      def exists?
        return true if base64?

        !resolved_path.nil?
      end

      # Maps the link to a path on disk, caching the answer (including nil) in
      # @runner.resolved_paths keyed by the absolute path.
      #
      # @return [String, nil] the resolved file or directory, nil when nothing matches
      def resolved_path
        path_to_resolve = absolute_path

        return @runner.resolved_paths[path_to_resolve] if @runner.resolved_paths.key?(path_to_resolve)

        # extensionless URLs
        path_with_extension = "#{path_to_resolve}#{@runner.options[:assume_extension]}"
        resolved = if @runner.options[:assume_extension] && File.file?(path_with_extension)
          path_with_extension # existence checked implicitly by File.file?
        # implicit index support
        elsif File.directory?(path_to_resolve) && !unslashed_directory?(path_to_resolve)
          @runner.options[:directory_index_files]
            .map { |dif| File.join(path_to_resolve, dif) }
            .find { |path_with_index| File.file?(path_with_index) }
        # explicit file or directory
        elsif File.exist?(path_to_resolve)
          path_to_resolve
        end
        @runner.resolved_paths[path_to_resolve] = resolved

        resolved
      end

      # @return [Boolean] true when the raw attribute starts a line with "data:image"
      def base64?
        /^data:image/.match?(@raw_attribute)
      end

      # @return [String] #full_path (or the containing file when the link has
      #   no path) expanded against the current working directory
      def absolute_path
        target = full_path || @filename

        File.expand_path(target, Dir.pwd)
      end

      # @return [String, nil] the link's path joined onto its base directory;
      #   nil when the link has no path
      def full_path
        return if path.nil? || path.empty?

        base = if absolute_path?(path) # path relative to root
          # either overwrite with root_dir; or, if source is directory, use that; or, just get the source file's dirname
          @runner.options[:root_dir] || (File.directory?(@source) ? @source : File.dirname(@source))
        else
          # path relative to the file where the link is defined
          File.dirname(@filename)
        end

        File.join(base, path)
      end

      # @return [Boolean] true when +file+ is a directory referenced without a
      #   trailing separator while redirects are not being followed
      def unslashed_directory?(file)
        return false unless File.directory?(file)

        !file.end_with?(File::SEPARATOR) && !follow_location?
      end

      # @return [Object, nil] the options[:typhoeus][:followlocation] setting, if any
      def follow_location?
        @runner.options[:typhoeus] && @runner.options[:typhoeus][:followlocation]
      end

      def absolute_path?(path)
        path.start_with?("/")
      end

      # path is external to the file
      def external?
        !internal?
      end

      def internal?
        relative_link? || internal_absolute_link? || hash_link?
      end

      def internal_absolute_link?
        url.start_with?("/")
      end

      # @return [Boolean] false for http(s) URLs; otherwise true when the URL is
      #   a hash/param link, starts with ".", or has a line starting with a
      #   non-whitespace character
      def relative_link?
        return false if remote?

        hash_link? || param_link? || url.start_with?(".") || /^\S/.match?(url)
      end

      # @return [Boolean] true when the URL is only a fragment ("#...") or only
      #   a query ("?..."), i.e. it targets the page it appears on
      def link_points_to_same_page?
        hash_link? || param_link?
      end

      def hash_link?
        url.start_with?("#")
      end

      def has_hash?
        url.include?("#")
      end

      def param_link?
        url.start_with?("?")
      end

      # @return [String] the URL with its first "#<fragment>" occurrence removed
      def without_hash
        # A String pattern is matched literally, so fragments containing regexp
        # metacharacters ("(", ".", "+", ...) are handled safely.
        @url.to_s.sub("##{hash}", "")
      end

      # catch any obvious issues
      private def clean_url!
        parsed_url = Addressable::URI.parse(@url)
        # only URLs that carry a scheme are normalized
        url = if parsed_url.scheme.nil?
          parsed_url
        else
          parsed_url.normalize
        end.to_s

        # normalize strips this off, which causes issues with cache
        @url = if @url.end_with?("/") && !url.end_with?("/")
          "#{url}/"
        elsif !@url.end_with?("/") && url.end_with?("/")
          url.chop
        else
          url
        end
      rescue Addressable::URI::InvalidURIError # rubocop:disable Lint/SuppressedException -- error will be reported at check time
      end

      # Applies every options[:swap_urls] pattern => replacement pair to the URL.
      private def swap_urls!
        return @url if blank?(replacements = @runner.options[:swap_urls])

        replacements.each do |link, replace|
          @url = @url.gsub(link, replace)
        end
      end

      # @param links_to_ignore [Array<String, Regexp>, Object] anything other
      #   than an Array never matches
      # @return [Boolean] true when the raw attribute equals one of the Strings
      #   or matches one of the Regexps
      private def ignores_pattern?(links_to_ignore)
        return false unless links_to_ignore.is_a?(Array)

        links_to_ignore.any? do |link_to_ignore|
          case link_to_ignore
          when String then link_to_ignore == @raw_attribute
          when Regexp then link_to_ignore.match?(@raw_attribute)
          end
        end
      end
    end
  end
end