File: cache.rb

# frozen_string_literal: true

require_relative 'utils'
require 'addressable/uri'
require 'date'
require 'fileutils'
require 'json'
require 'time'
require 'uri'

module HTMLProofer
  class Cache
    include HTMLProofer::Utils

    DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
    DEFAULT_CACHE_FILE_NAME = 'cache.log'

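    # Matches absolute URIs; used to tell external links from internal ones.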
    URI_REGEXP = URI::DEFAULT_PARSER.make_regexp

    attr_reader :exists, :cache_log, :storage_dir, :cache_file

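    # With no options, caching is disabled (use_cache? returns false).
    # Otherwise the existing cache file is loaded and the timeframe string is
    # parsed into the cutoff time used by within_timeframe?.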
    def initialize(logger, options)
      @logger = logger
      @cache_log = {}

      @cache_datetime = DateTime.now
      @cache_time = @cache_datetime.to_time

      if options.nil? || options.empty?
        define_singleton_method('use_cache?') { false }
      else
        define_singleton_method('use_cache?') { true }
        setup_cache!(options)
        @parsed_timeframe = parsed_timeframe(options[:timeframe])
      end
    end

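    # True if the given timestamp falls between the timeframe cutoff and the
    # moment this cache object was created.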
    def within_timeframe?(time)
      return false if time.nil?

      (@parsed_timeframe..@cache_time).cover?(Time.parse(time))
    end

    def urls
      @cache_log['urls'] || []
    end

    def size
      @cache_log.length
    end

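    # Converts a timeframe string such as '1M', '2w', '3d' or '4h' into the
    # Time marking the start of that window.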
    def parsed_timeframe(timeframe)
      amount, unit = timeframe.match(/(\d+)(\D)/).captures
      amount = amount.to_i
      case unit
      when 'M'
        time_ago(amount, :months)
      when 'w'
        time_ago(amount, :weeks)
      when 'd'
        time_ago(amount, :days)
      when 'h'
        time_ago(amount, :hours)
      else
        raise ArgumentError, "#{unit} is not a valid timeframe unit!"
      end
    end

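    # Records a check result for a URL, keyed by its unescaped form.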
    def add(url, filenames, status, msg = '')
      return unless use_cache?

      data = {
        time: @cache_time,
        filenames: filenames,
        status: status,
        message: msg
      }

      @cache_log[clean_url(url)] = data
    end

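    # Diffs the URLs found in this run against the cache: returns the newly
    # seen URLs and evicts cached URLs of the given type that disappeared.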
    def detect_url_changes(found, type)
      found_urls = found.keys.map { |url| clean_url(url) }

      # if there were no urls, bail
      return {} if found_urls.empty?

      existing_urls = @cache_log.keys.map { |url| clean_url(url) }

      # prepare to add new URLs detected
      additions = found.reject do |url, _|
        url = clean_url(url)
        if existing_urls.include?(url)
          true
        else
          @logger.log :debug, "Adding #{url} to cache check"
          false
        end
      end

      new_link_count = additions.length
      new_link_text = pluralize(new_link_count, 'link', 'links')
      @logger.log :info, "Adding #{new_link_text} to the cache..."

      # remove from cache URLs that no longer exist
      deletions = 0
      @cache_log.delete_if do |url, _|
        url = clean_url(url)

        if found_urls.include?(url)
          false
        elsif url_matches_type?(url, type)
          @logger.log :debug, "Removing #{url} from cache check"
          deletions += 1
          true
        end
      end

      del_link_text = pluralize(deletions, 'link', 'links')
      @logger.log :info, "Removing #{del_link_text} from the cache..."

      additions
    end

    # TODO: Garbage performance--both the external and internal
    # caches need access to this file. Write a proper versioned
    # schema in the future
    def write
      File.write(cache_file, @cache_log.to_json)
    end

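    # NOTE: @load is never assigned anywhere in this class, so this
    # currently always returns true.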
    def load?
      @load.nil?
    end

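    # Builds the set of URLs to (re)check: new URLs, plus cached entries of
    # this type that have expired or previously failed.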
    def retrieve_urls(urls, type)
      urls_to_check = detect_url_changes(urls, type)

      @cache_log.each_pair do |url, cache|
        next if within_timeframe?(cache['time']) && cache['message'].empty? # skip cached successes still within the timeframe

        if url_matches_type?(url, type)
          urls_to_check[url] = cache['filenames'] # recheck expired links
        end
      end
      urls_to_check
    end

    # FIXME: it seems that Typhoeus actually acts on escaped URLs,
    # but there's no way to get at that information, and the cache
    # stores unescaped URLs. Because of this, some links, such as
    # github.com/search/issues?q=is:open+is:issue+fig, are not matched
    # as github.com/search/issues?q=is%3Aopen+is%3Aissue+fig.
    def unescape_url(url)
      Addressable::URI.unescape(url)
    end

    def clean_url(url)
      unescape_url(url)
    end

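    # Resolves the storage directory and cache file, creating the directory
    # if needed, and loads any previously written JSON cache.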
    def setup_cache!(options)
      @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR

      FileUtils.mkdir_p(storage_dir) # no-op when the directory already exists

      cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME

      @cache_file = File.join(storage_dir, cache_file_name)

      return unless File.exist?(@cache_file)

      contents = File.read(@cache_file)
      @cache_log = contents.empty? ? {} : JSON.parse(contents)
    end

    private

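    # Returns the Time `measurement` units before this cache was created.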
    def time_ago(measurement, unit)
      case unit
      when :months
        @cache_datetime >> -measurement # DateTime#>> shifts by whole months
      when :weeks
        @cache_datetime - (measurement * 7)
      when :days
        @cache_datetime - measurement
      when :hours
        @cache_datetime - Rational(measurement, 24) # exact fraction of a day
      end.to_time
    end

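    # Internal URLs are those that do not look like absolute URIs.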
    def url_matches_type?(url, type)
      return true if type == :internal && url !~ URI_REGEXP
      return true if type == :external && url =~ URI_REGEXP

      false
    end
  end
end
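
# A minimal usage sketch, added for illustration; it is not part of the gem.
# It assumes only what this file shows: a logger responding to
# #log(level, message) (the gem supplies its own logger, but any duck-typed
# stand-in works) and the options hash handled by #initialize above.
if $PROGRAM_NAME == __FILE__
  # Hypothetical stand-in logger, just enough for the @logger.log calls.
  logger = Object.new
  def logger.log(level, message)
    puts "[#{level}] #{message}"
  end

  cache = HTMLProofer::Cache.new(logger, timeframe: '2w')
  cache.add('https://example.com/', ['index.html'], 200)
  cache.write # persists JSON to tmp/.htmlproofer/cache.log by default
end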