# -*- coding: utf-8 -*-
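"""Tests for scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware."""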
from __future__ import absolute_import
import re
from twisted.internet import reactor, error
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
from twisted.python import failure
from twisted.trial import unittest
from scrapy.downloadermiddlewares.robotstxt import (RobotsTxtMiddleware,
                                                    logger as mw_module_logger)
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.settings import Settings
from tests import mock


class RobotsTxtMiddlewareTest(unittest.TestCase):
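    """The middleware should fetch each site's robots.txt and filter
    outgoing requests against its rules."""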

    def setUp(self):
        self.crawler = mock.MagicMock()
        self.crawler.settings = Settings()
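        # engine.download is stubbed in each test to control what the
        # robots.txt fetch returns.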
        self.crawler.engine.download = mock.MagicMock()

    def tearDown(self):
        del self.crawler

    def test_robotstxt_settings(self):
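        """Without ROBOTSTXT_OBEY set, the middleware must raise NotConfigured."""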
        self.crawler.settings.set('USER_AGENT', 'CustomAgent')
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, self.crawler)

    def _get_successful_crawler(self):
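        """Crawler whose robots.txt response is a valid rules file."""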
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        # Dedent the fixture: strip the leading whitespace from every line.
        ROBOTS = re.sub(br'^\s+', b'', u'''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/

        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:

        User-Agent: UnicödeBöt
        Disallow: /some/random/page.html
        '''.encode('utf-8'), flags=re.M)
        response = TextResponse('http://site.local/robots.txt', body=ROBOTS)
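        # Fire the callback asynchronously, as a real download would.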
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt(self):
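        """Disallowed paths are ignored, both percent-encoded and as raw
        non-ASCII; everything else passes through."""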
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertIgnored(Request('http://site.local/static/'), middleware),
            self.assertIgnored(Request('http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:'), middleware),
            self.assertIgnored(Request(u'http://site.local/wiki/Käyttäjä:'), middleware)
        ], fireOnOneErrback=True)

    def test_robotstxt_ready_parser(self):
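        """Once robots.txt has been fetched and parsed, later requests for
        the same site should be checked against the cached parser."""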
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertNotIgnored(Request('http://site.local/allowed'), middleware))
        return d

    def test_robotstxt_meta(self):
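        """Requests with dont_obey_robotstxt in meta bypass the middleware."""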
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)
        ], fireOnOneErrback=True)

    def _get_garbage_crawler(self):
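        """Crawler whose robots.txt response body is binary garbage (a GIF header)."""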
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        response = Response('http://site.local/robots.txt', body=b'GIF89a\xd3\x00\xfe\x00\xa2')
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt_garbage(self):
        # A garbage (non-text) response should be discarded and treated as 'allow all'.
        middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
        deferred = DeferredList([
            self.assertNotIgnored(Request('http://site.local'), middleware),
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)
        return deferred

    def _get_emptybody_crawler(self):
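        """Crawler whose robots.txt response has an empty body."""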
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        response = Response('http://site.local/robots.txt')
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt_empty_response(self):
        # An empty robots.txt body should be treated as 'allow all'.
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)

    def test_robotstxt_error(self):
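        """A failed robots.txt download is logged but does not block the request."""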
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        # Wrap _logerror so we can assert it was called without changing its behaviour.
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        deferred = middleware.process_request(Request('http://site.local'), None)
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        return deferred

    def test_robotstxt_immediate_error(self):
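        """A robots.txt download that fails synchronously (errback fires
        before the deferred is returned) must still let the request through."""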
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def immediate_failure(request, spider):
            deferred = Deferred()
            deferred.errback(failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = immediate_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), middleware)

    def test_ignore_robotstxt_request(self):
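        """IgnoreRequest raised while downloading robots.txt is expected
        and must not be logged as an error."""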
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        def ignore_request(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred
        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        # Patch the module logger and restore it afterwards, so the mock
        # does not leak into other tests.
        patcher = mock.patch.object(mw_module_logger, 'error')
        patcher.start()
        self.addCleanup(patcher.stop)

        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
        return d

    def assertNotIgnored(self, request, middleware):
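        """Assert that process_request lets ``request`` through (returns None)."""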
        spider = None  # not actually used
        dfd = maybeDeferred(middleware.process_request, request, spider)
        dfd.addCallback(self.assertIsNone)
        return dfd

    def assertIgnored(self, request, middleware):
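        """Assert that process_request raises IgnoreRequest for ``request``."""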
        spider = None  # not actually used
        return self.assertFailure(maybeDeferred(middleware.process_request, request, spider),
                                  IgnoreRequest)
