File: webrobots.rb

require 'webrobots/version'
require 'webrobots/robotstxt'
require 'uri'
require 'net/https'
require 'thread'
if defined?(Nokogiri)
  require 'webrobots/nokogiri'
else
  autoload :Nokogiri, 'webrobots/nokogiri'
end

class WebRobots
  # Creates a WebRobots object for a robot named +user_agent+, with
  # optional +options+.
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt.  It must return
  #   the response body if successful, return an empty string if the
  #   resource is not found, and return nil or raise any error on
  #   failure.  Redirects should be handled within this proc.
  #
  # * :crawl_delay => determines how to react to Crawl-delay
  #   directives.  If +:sleep+ is given, WebRobots sleeps as demanded
  #   when allowed?(url)/disallowed?(url) is called.  This is the
  #   default behavior.  If +:ignore+ is given, WebRobots does
  #   nothing.  If a custom method, proc, or anything that responds to
  #   .call(delay, last_checked_at) is given, it is called instead.
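  #
  # A minimal construction sketch (the bot name and +fetcher+ below are
  # illustrative, not part of this library; redirect handling is omitted
  # for brevity):
  #
  #   fetcher = lambda { |uri|
  #     res = Net::HTTP.get_response(uri)
  #     case res
  #     when Net::HTTPSuccess  then res.body
  #     when Net::HTTPNotFound then ''
  #     else nil
  #     end
  #   }
  #   robots = WebRobots.new('MyBot/1.0',
  #                          :http_get => fetcher,
  #                          :crawl_delay => :sleep)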
  def initialize(user_agent, options = nil)
    @user_agent = user_agent

    options ||= {}
    @http_get = options[:http_get] || method(:http_get)
    crawl_delay_handler =
      case value = options[:crawl_delay] || :sleep
      when :ignore
        nil
      when :sleep
        method(:crawl_delay_handler)
      else
        if value.respond_to?(:call)
          value
        else
          raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}"
        end
      end

    @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler)
    @parser_mutex = Mutex.new

    @robotstxt = create_cache()
  end

  # :nodoc:
  def create_cache
    Hash.new	# Must respond to [], []=, delete and clear.
  end
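
  # The cache object may be replaced by overriding create_cache in a
  # subclass, as long as the replacement responds to [], []=, delete and
  # clear.  A hypothetical sketch:
  #
  #   class PerProcessRobots < WebRobots
  #     def create_cache
  #       # any object with the Hash-like interface above will do
  #       ThreadSafeCache.new   # hypothetical cache class
  #     end
  #   end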

  # Flushes robots.txt cache.
  def flush_cache
    @robotstxt.clear
  end

  # Returns the robot name initially given.
  attr_reader :user_agent

  # Tests if the robot is allowed to access a resource at +url+.  If a
  # malformed URI string is given, URI::InvalidURIError is raised.  If
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
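  #
  # For example (hypothetical URL; the result depends on the site's
  # robots.txt):
  #
  #   robots = WebRobots.new('MyBot/1.0')
  #   robots.allowed?('https://www.example.com/secret/page.html')  # => false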
  def allowed?(url)
    site, request_uri = split_uri(url)
    return true if request_uri == '/robots.txt'
    robots_txt = get_robots_txt(site)
    robots_txt.allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
  def disallowed?(url)
    !allowed?(url)
  end

  # Returns the number of seconds that the configured agent should wait
  # between successive requests to the site identified by +url+ according
  # to the site's robots.txt +Crawl-delay+ directive.
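  #
  # For example, assuming +robots+ is a WebRobots instance and the
  # matching record contains "Crawl-delay: 10":
  #
  #   robots.crawl_delay('https://www.example.com/')  # => 10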
  def crawl_delay(url)
    robots_txt_for(url).crawl_delay()
  end

  # Returns extended option values for a resource at +url+ in a hash
  # with each field name lower-cased.  See allowed?() for a list of
  # errors that may be raised.
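  #
  # For example, assuming the matching record contains a non-standard
  # line such as "Option1: foo" (hypothetical field):
  #
  #   robots.options('https://www.example.com/')  # => { "option1" => "foo" }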
  def options(url)
    robots_txt_for(url).options
  end

  # Equivalent to options(url)[token.downcase].
  def option(url, token)
    options(url)[token.downcase]
  end

  # Returns an array of Sitemap URLs.  See allowed?() for a list of
  # errors that may be raised.
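  #
  # For example, with one "Sitemap:" line in the site's robots.txt:
  #
  #   robots.sitemaps('https://www.example.com/')
  #   # => ["https://www.example.com/sitemap.xml"]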
  def sitemaps(url)
    robots_txt_for(url).sitemaps
  end

  # Returns an error object if there is an error in fetching or
  # parsing robots.txt of the site +url+.
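  #
  # A typical check (assuming +robots+ is a WebRobots instance):
  #
  #   if (e = robots.error('https://www.example.com/'))
  #     warn "robots.txt problem: #{e.inspect}"
  #   end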
  def error(url)
    robots_txt_for(url).error
  end

  # Raises the error if there was an error in fetching or parsing
  # robots.txt of the site +url+.
  def error!(url)
    robots_txt_for(url).error!
  end

  # Removes robots.txt cache for the site +url+.
  def reset(url)
    site, = split_uri(url)
    @robotstxt.delete(site)
  end

  private

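  # Splits +url+ into a site URI (host lowercased, path fixed to '/'),
  # used as the cache key, and the request-URI used for rule matching.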
  def split_uri(url)
    site =
      if url.is_a?(URI)
        url.dup
      else
        begin
          URI.parse(url)
        rescue => e
          raise ArgumentError, e.message
        end
      end

    site.scheme && site.host or
      raise ArgumentError, "non-absolute URI: #{url}"

    site.is_a?(URI::HTTP) or
      raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"

    request_uri = site.request_uri
    if (host = site.host).match(/[[:upper:]]/)
      site.host = host.downcase
    end
    site.path = '/'
    return site, request_uri
  end

  def robots_txt_for(url)
    site, = split_uri(url)
    get_robots_txt(site)
  end

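  # Returns the cached RobotsTxt for +site+, fetching and parsing it on
  # first access.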
  def get_robots_txt(site)
    @robotstxt[site] ||= fetch_robots_txt(site)
  end

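  # Fetches and parses robots.txt for +site+.  If fetching fails, an
  # "unfetchable" RobotsTxt carrying the error is returned instead of
  # raising.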
  def fetch_robots_txt(site)
    begin
      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
    rescue => e
      return RobotsTxt.unfetchable(site, e, @user_agent)
    end
    @parser_mutex.synchronize {
      @parser.parse!(body, site)
    }
  end

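  # Default fetcher: follows up to 5 redirects, returns the response body
  # on success, an empty string on 4xx responses or too many redirects,
  # and raises on any other failure.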
  def http_get(uri)
    response = nil
    referer = nil
    5.times {
      http = Net::HTTP.new(uri.host, uri.port)
      if http.use_ssl = uri.is_a?(URI::HTTPS)
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = OpenSSL::X509::Store.new.tap { |store|
          store.set_default_paths
        }
      end
      header = { 'User-Agent' => @user_agent }
      header['Referer'] = referer if referer
      # Note: Net::HTTP#get destructively modifies the header hash in
      # Ruby 1.9.2, so the header is rebuilt on each iteration.
      response = http.get(uri.request_uri, header)
      case response
      when Net::HTTPSuccess
        return response.body
      when Net::HTTPRedirection
        referer = uri.to_s
        uri = URI(response['location'])
      when Net::HTTPClientError
        return ''
      end
    }
    case response
    when Net::HTTPRedirection
      # Treat too many redirections as not found
      ''
    else
      raise "#{response.code} #{response.message}"
    end
  end

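  # Default Crawl-delay handler: sleeps for whatever portion of +delay+
  # seconds has not already elapsed since +last_checked_at+.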
  def crawl_delay_handler(delay, last_checked_at)
    if last_checked_at
      delay -= Time.now - last_checked_at
      sleep delay if delay > 0
    end
  end
end