1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
|
import pytest
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class TestAjaxCrawlMiddleware:
    """Tests for ``AjaxCrawlMiddleware.process_response``.

    ``ScrapyDeprecationWarning`` is filtered out for every test in the class
    (presumably because the middleware itself is deprecated -- confirm
    against the Scrapy release notes).
    """

    def setup_method(self):
        """Create a spider and an enabled middleware instance for each test."""
        crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
        self.spider = crawler._create_spider("foo")
        self.mw = AjaxCrawlMiddleware.from_crawler(crawler)

    def _ajaxcrawlable_body(self):
        """Return an HTML body carrying the ``<meta name="fragment">`` tag."""
        return b'<html><head><meta name="fragment" content="!"/></head><body></body></html>'

    def _req_resp(self, url, req_kwargs=None, resp_kwargs=None):
        """Build a ``Request`` and an ``HtmlResponse`` bound to it for *url*.

        ``req_kwargs`` and ``resp_kwargs`` are optional dicts of extra keyword
        arguments forwarded to the respective constructors.
        """
        req = Request(url, **(req_kwargs or {}))
        resp = HtmlResponse(url, request=req, **(resp_kwargs or {}))
        return req, resp

    def test_non_get(self):
        """A response to a non-GET (here HEAD) request is returned untouched."""
        req, resp = self._req_resp("http://example.com/", {"method": "HEAD"})
        resp2 = self.mw.process_response(req, resp, self.spider)
        # Identity check, consistent with the other pass-through tests below.
        assert resp is resp2

    def test_binary_response(self):
        """A plain (non-HTML) ``Response`` with a binary body passes through."""
        req = Request("http://example.com/")
        resp = Response("http://example.com/", body=b"foobar\x00\x01\x02", request=req)
        resp2 = self.mw.process_response(req, resp, self.spider)
        assert resp is resp2

    def test_ajaxcrawl(self):
        """An AJAX-crawlable page yields a request for the escaped-fragment URL.

        The request meta of the original request must be carried over.
        """
        req, resp = self._req_resp(
            "http://example.com/",
            {"meta": {"foo": "bar"}},
            {"body": self._ajaxcrawlable_body()},
        )
        req2 = self.mw.process_response(req, resp, self.spider)
        assert req2.url == "http://example.com/?_escaped_fragment_="
        assert req2.meta["foo"] == "bar"

    def test_ajaxcrawl_loop(self):
        """The response to the escaped-fragment request is not re-requested.

        Feeding the same crawlable body back for the rewritten request must
        return that response itself rather than another request.
        """
        req, resp = self._req_resp(
            "http://example.com/", {}, {"body": self._ajaxcrawlable_body()}
        )
        req2 = self.mw.process_response(req, resp, self.spider)
        resp2 = HtmlResponse(req2.url, body=resp.body, request=req2)
        resp3 = self.mw.process_response(req2, resp2, self.spider)
        assert isinstance(resp3, HtmlResponse), (resp3.__class__, resp3)
        assert resp3.request.url == "http://example.com/?_escaped_fragment_="
        assert resp3 is resp2

    def test_noncrawlable_body(self):
        """An HTML page without the fragment meta tag is returned untouched."""
        req, resp = self._req_resp(
            "http://example.com/", {}, {"body": b"<html></html>"}
        )
        resp2 = self.mw.process_response(req, resp, self.spider)
        assert resp is resp2
|