require 'webrobots/version'
require 'webrobots/robotstxt'
require 'uri'
require 'net/https'
require 'thread'

if defined?(Nokogiri)
  require 'webrobots/nokogiri'
else
  autoload :Nokogiri, 'webrobots/nokogiri'
end

class WebRobots
  # Creates a WebRobots object for a robot named +user_agent+, with
  # optional +options+.
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt. It must return
  #   the response body if successful, return an empty string if the
  #   resource is not found, and return nil or raise any error on
  #   failure. Redirects should be handled within this proc (see the
  #   sketch below).
  #
  # * :crawl_delay => determines how to react to Crawl-delay
  #   directives. If +:sleep+ is given, WebRobots sleeps as demanded
  #   when allowed?(url)/disallowed?(url) is called. This is the
  #   default behavior. If +:ignore+ is given, WebRobots does
  #   nothing. If a custom method, proc, or anything that responds to
  #   .call(delay, last_checked_at) is given, it is called.
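  #
  # A rough usage sketch (the bot name and the simple :http_get lambda
  # are illustrative, not part of this library; note that this lambda
  # does not follow redirects itself):
  #
  #   robots = WebRobots.new('MyBot/1.0',
  #     :crawl_delay => :sleep,              # default: honor Crawl-delay
  #     :http_get => lambda { |uri|
  #       response = Net::HTTP.get_response(uri)
  #       case response
  #       when Net::HTTPSuccess  then response.body
  #       when Net::HTTPNotFound then ''     # no robots.txt; allow all
  #       else nil                           # anything else is a failure
  #       end
  #     })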
  def initialize(user_agent, options = nil)
    @user_agent = user_agent
    options ||= {}
    @http_get = options[:http_get] || method(:http_get)
    crawl_delay_handler =
      case value = options[:crawl_delay] || :sleep
      when :ignore
        nil
      when :sleep
        method(:crawl_delay_handler)
      else
        if value.respond_to?(:call)
          value
        else
          raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}"
        end
      end
    @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler)
    @parser_mutex = Mutex.new
    @robotstxt = create_cache()
  end

  def create_cache # :nodoc:
    Hash.new # Must respond to [], []=, delete and clear.
  end

  # Flushes robots.txt cache.
  def flush_cache
    @robotstxt.clear
  end

  # Returns the robot name initially given.
  attr_reader :user_agent

  # Tests if the robot is allowed to access a resource at +url+. If a
  # malformed URI string is given, URI::InvalidURIError is raised. If
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
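  #
  # For example (bot name and URL are illustrative):
  #
  #   robots = WebRobots.new('MyBot/1.0')
  #   if robots.allowed?('http://www.example.com/some/page.html')
  #     # fetch the page
  #   end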
  def allowed?(url)
    site, request_uri = split_uri(url)
    return true if request_uri == '/robots.txt'
    robots_txt = get_robots_txt(site)
    robots_txt.allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
  def disallowed?(url)
    !allowed?(url)
  end

  # Returns the number of seconds that the configured agent should wait
  # between successive requests to the site identified by +url+
  # according to the site's robots.txt +Crawl-delay+ directive.
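  #
  # For example, to honor the delay manually (this only makes sense
  # when WebRobots was created with :crawl_delay => :ignore, since the
  # default :sleep handler already waits for you):
  #
  #   delay = robots.crawl_delay('http://www.example.com/')
  #   sleep delay if delay && delay > 0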
  def crawl_delay(url)
    robots_txt_for(url).crawl_delay()
  end

  # Returns extended option values for a resource at +url+ in a hash
  # with each field name lower-cased. See allowed?() for a list of
  # errors that may be raised.
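  #
  # For example, a non-standard field such as "Request-rate" (purely
  # illustrative; any extension field in the matching record is exposed
  # the same way) can be read as:
  #
  #   robots.options('http://www.example.com/')['request-rate']
  #   # or equivalently:
  #   robots.option('http://www.example.com/', 'Request-rate')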
  def options(url)
    robots_txt_for(url).options
  end

  # Equivalent to options(url)[token.downcase].
  def option(url, token)
    options(url)[token.downcase]
  end

  # Returns an array of Sitemap URLs. See allowed?() for a list of
  # errors that may be raised.
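  #
  # For example (the URL is illustrative):
  #
  #   robots.sitemaps('http://www.example.com/').each { |sitemap_url|
  #     # enqueue sitemap_url for fetching
  #   }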
  def sitemaps(url)
    robots_txt_for(url).sitemaps
  end

  # Returns an error object if there is an error in fetching or
  # parsing robots.txt of the site +url+.
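  #
  # For example, to report a problem without raising (illustrative):
  #
  #   error = robots.error('http://www.example.com/')
  #   warn "robots.txt of www.example.com is unusable: #{error}" if error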
  def error(url)
    robots_txt_for(url).error
  end

  # Raises the error if there was an error in fetching or parsing
  # robots.txt of the site +url+.
  def error!(url)
    robots_txt_for(url).error!
  end

  # Removes robots.txt cache for the site +url+.
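  #
  # Calling reset (or flush_cache) forces the next check to re-fetch
  # robots.txt, e.g. in a long-running crawler (illustrative):
  #
  #   robots.reset('http://www.example.com/')
  #   robots.allowed?('http://www.example.com/')  # re-fetches robots.txt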
  def reset(url)
    site, = split_uri(url)
    @robotstxt.delete(site)
  end

  private

  # Splits +url+ into a normalized site URI (scheme://host:port/) and
  # the request-URI part used for robots.txt matching.
  def split_uri(url)
    site =
      if url.is_a?(URI)
        url.dup
      else
        begin
          URI.parse(url)
        rescue => e
          raise ArgumentError, e.message
        end
      end
    site.scheme && site.host or
      raise ArgumentError, "non-absolute URI: #{url}"
    site.is_a?(URI::HTTP) or
      raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"
    request_uri = site.request_uri
    if (host = site.host).match(/[[:upper:]]/)
      site.host = host.downcase
    end
    site.path = '/'
    return site, request_uri
  end

  # Returns the parsed robots.txt for the site of +url+, fetching and
  # caching it if necessary.
  def robots_txt_for(url)
    site, = split_uri(url)
    get_robots_txt(site)
  end

  def get_robots_txt(site)
    @robotstxt[site] ||= fetch_robots_txt(site)
  end

  # Fetches and parses robots.txt via @http_get; any failure yields an
  # "unfetchable" placeholder object.
  def fetch_robots_txt(site)
    begin
      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
    rescue => e
      return RobotsTxt.unfetchable(site, e, @user_agent)
    end
    @parser_mutex.synchronize {
      @parser.parse!(body, site)
    }
  end

  # Default robots.txt fetcher: makes up to five HTTP(S) requests while
  # following redirects, treats 4xx responses as an empty robots.txt,
  # and raises on other failures.
  def http_get(uri)
    response = nil
    referer = nil
    5.times {
      http = Net::HTTP.new(uri.host, uri.port)
      if http.use_ssl = uri.is_a?(URI::HTTPS)
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = OpenSSL::X509::Store.new.tap { |store|
          store.set_default_paths
        }
      end
      header = { 'User-Agent' => @user_agent }
      header['Referer'] = referer if referer
      # header is destroyed by this in ruby 1.9.2!
      response = http.get(uri.request_uri, header)
      case response
      when Net::HTTPSuccess
        return response.body
      when Net::HTTPRedirection
        referer = uri.to_s
        uri = URI(response['location'])
      when Net::HTTPClientError
        return ''
      end
    }
    case response
    when Net::HTTPRedirection
      # Treat too many redirections as not found
      ''
    else
      raise "#{response.code} #{response.message}"
    end
  end

  # Default Crawl-delay handler: sleeps off whatever part of +delay+
  # has not already elapsed since +last_checked_at+.
  def crawl_delay_handler(delay, last_checked_at)
    if last_checked_at
      delay -= Time.now - last_checked_at
      sleep delay if delay > 0
    end
  end
end