File: webrobots.rb

require 'webrobots/version'
require 'webrobots/robotstxt'
require 'uri'
require 'net/https'
require 'thread'
if defined?(Nokogiri)
  require 'webrobots/nokogiri'
else
  autoload :Nokogiri, 'webrobots/nokogiri'
end

class WebRobots
  # Creates a WebRobots object for a robot named +user_agent+, with
  # optional +options+.
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt.  It must return
  #   the response body if successful, return an empty string if the
  #   resource is not found, and return nil or raise any error on
  #   failure.  Redirects should be handled within this proc.
  #
  # * :crawl_delay => determines how to react to Crawl-delay
  #   directives.  If +:sleep+ is given, WebRobots sleeps as demanded
  #   when allowed?(url)/disallowed?(url) is called.  This is the
  #   default behavior.  If +:ignore+ is given, WebRobots does
  #   nothing.  If a custom method, proc, or anything that responds to
  #   .call(delay, last_checked_at) is given, it is called instead.
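  #
  # A minimal construction sketch (the bot name and +fetcher+ below are
  # illustrative, not part of this library; redirect handling is omitted
  # for brevity):
  #
  #   fetcher = lambda { |uri|
  #     res = Net::HTTP.get_response(uri)
  #     case res
  #     when Net::HTTPSuccess  then res.body
  #     when Net::HTTPNotFound then ''
  #     else nil
  #     end
  #   }
  #   robots = WebRobots.new('MyBot/1.0',
  #                          :http_get => fetcher,
  #                          :crawl_delay => :sleep)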
  def initialize(user_agent, options = nil)
    @user_agent = user_agent

    options ||= {}
    @http_get = options[:http_get] || method(:http_get)
    crawl_delay_handler =
      case value = options[:crawl_delay] || :sleep
      when :ignore
        nil
      when :sleep
        method(:crawl_delay_handler)
      else
        if value.respond_to?(:call)
          value
        else
          raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}"
        end
      end

    @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler)
    @parser_mutex = Mutex.new

    @robotstxt = create_cache()
  end

  # :nodoc:
  def create_cache
    Hash.new	# Must respond to [], []=, delete and clear.
  end
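
  # The cache object may be replaced by overriding create_cache in a
  # subclass, as long as the replacement responds to [], []=, delete and
  # clear.  A hypothetical sketch:
  #
  #   class PerProcessRobots < WebRobots
  #     def create_cache
  #       # any object with the Hash-like interface above will do
  #       ThreadSafeCache.new   # hypothetical cache class
  #     end
  #   end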

  # Flushes robots.txt cache.
  def flush_cache
    @robotstxt.clear
  end

  # Returns the robot name initially given.
  attr_reader :user_agent

  # Tests if the robot is allowed to access a resource at +url+.  If a
  # malformed URI string is given, URI::InvalidURIError is raised.  If
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
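  #
  # For example (hypothetical URL; the result depends on the site's
  # robots.txt):
  #
  #   robots = WebRobots.new('MyBot/1.0')
  #   robots.allowed?('https://www.example.com/secret/page.html')  # => false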
  def allowed?(url)
    site, request_uri = split_uri(url)
    return true if request_uri == '/robots.txt'
    robots_txt = get_robots_txt(site)
    robots_txt.allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
  def disallowed?(url)
    !allowed?(url)
  end

  # Returns the number of seconds that the configured agent should wait
  # between successive requests to the site identified by +url+ according
  # to the site's robots.txt +Crawl-delay+ directive.
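  #
  # For example, assuming +robots+ is a WebRobots instance and the
  # matching record contains "Crawl-delay: 10":
  #
  #   robots.crawl_delay('https://www.example.com/')  # => 10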
  def crawl_delay(url)
    robots_txt_for(url).crawl_delay()
  end

  # Returns extended option values for a resource at +url+ in a hash
  # with each field name lower-cased.  See allowed?() for a list of
  # errors that may be raised.
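  #
  # For example, assuming the matching record contains a non-standard
  # line such as "Option1: foo" (hypothetical field):
  #
  #   robots.options('https://www.example.com/')  # => { "option1" => "foo" }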
  def options(url)
    robots_txt_for(url).options
  end

  # Equivalent to options(url)[token.downcase].
  def option(url, token)
    options(url)[token.downcase]
  end

  # Returns an array of Sitemap URLs.  See allowed?() for a list of
  # errors that may be raised.
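  #
  # For example, with one "Sitemap:" line in the site's robots.txt:
  #
  #   robots.sitemaps('https://www.example.com/')
  #   # => ["https://www.example.com/sitemap.xml"]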
  def sitemaps(url)
    robots_txt_for(url).sitemaps
  end

  # Returns an error object if there is an error in fetching or
  # parsing robots.txt of the site +url+.
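  #
  # A typical check (assuming +robots+ is a WebRobots instance):
  #
  #   if (e = robots.error('https://www.example.com/'))
  #     warn "robots.txt problem: #{e.inspect}"
  #   end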
  def error(url)
    robots_txt_for(url).error
  end

  # Raises the error if there was an error in fetching or parsing
  # robots.txt of the site +url+.
  def error!(url)
    robots_txt_for(url).error!
  end

  # Removes robots.txt cache for the site +url+.
  def reset(url)
    site, = split_uri(url)
    @robotstxt.delete(site)
  end

  private

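  # Splits +url+ into a site URI (host lowercased, path fixed to '/'),
  # used as the cache key, and the request-URI used for rule matching.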
  def split_uri(url)
    site =
      if url.is_a?(URI)
        url.dup
      else
        begin
          URI.parse(url)
        rescue => e
          raise ArgumentError, e.message
        end
      end

    site.scheme && site.host or
      raise ArgumentError, "non-absolute URI: #{url}"

    site.is_a?(URI::HTTP) or
      raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"

    request_uri = site.request_uri
    if (host = site.host).match(/[[:upper:]]/)
      site.host = host.downcase
    end
    site.path = '/'
    return site, request_uri
  end

  def robots_txt_for(url)
    site, = split_uri(url)
    get_robots_txt(site)
  end

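  # Returns the cached RobotsTxt for +site+, fetching and parsing it on
  # first access.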
  def get_robots_txt(site)
    @robotstxt[site] ||= fetch_robots_txt(site)
  end

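  # Fetches and parses robots.txt for +site+.  If fetching fails, an
  # "unfetchable" RobotsTxt carrying the error is returned instead of
  # raising.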
  def fetch_robots_txt(site)
    begin
      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
    rescue => e
      return RobotsTxt.unfetchable(site, e, @user_agent)
    end
    @parser_mutex.synchronize {
      @parser.parse!(body, site)
    }
  end

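  # Default fetcher: follows up to 5 redirects, returns the response body
  # on success, an empty string on 4xx responses or too many redirects,
  # and raises on any other failure.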
  def http_get(uri)
    response = nil
    referer = nil
    5.times {
      http = Net::HTTP.new(uri.host, uri.port)
      if http.use_ssl = uri.is_a?(URI::HTTPS)
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = OpenSSL::X509::Store.new.tap { |store|
          store.set_default_paths
        }
      end
      header = { 'User-Agent' => @user_agent }
      header['Referer'] = referer if referer
      # Note: Net::HTTP#get destructively modifies the header hash in
      # Ruby 1.9.2, so the header is rebuilt on each iteration.
      response = http.get(uri.request_uri, header)
      case response
      when Net::HTTPSuccess
        return response.body
      when Net::HTTPRedirection
        referer = uri.to_s
        uri = URI(response['location'])
      when Net::HTTPClientError
        return ''
      end
    }
    case response
    when Net::HTTPRedirection
      # Treat too many redirections as not found
      ''
    else
      raise "#{response.code} #{response.message}"
    end
  end

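  # Default Crawl-delay handler: sleeps for whatever portion of +delay+
  # seconds has not already elapsed since +last_checked_at+.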
  def crawl_delay_handler(delay, last_checked_at)
    if last_checked_at
      delay -= Time.now - last_checked_at
      sleep delay if delay > 0
    end
  end
end