"""Tests for the helpers exposed by ``scrapy.utils.response``."""

# NOTE(review): the HTML markup inside the bytes fixtures in this module was
# reconstructed from a copy whose tags had been stripped. Each fixture reduces
# to the surviving text and satisfies the surviving assertions, but confirm
# byte-level details against version control.

from pathlib import Path
from time import process_time
from urllib.parse import urlparse

import pytest

from scrapy.http import HtmlResponse, Response
from scrapy.utils.python import to_bytes
from scrapy.utils.response import (
    _remove_html_comments,
    get_base_url,
    get_meta_refresh,
    open_in_browser,
    response_status_message,
)


def test_open_in_browser():
    """Check that the opener callback receives a file with a ``<base>`` tag."""
    url = "http:///www.example.com/some/page.html"
    body = (
        b"<html> <head> <title>test page</title> </head> <body>test body</body> </html>"
    )

    def browser_open(burl: str) -> bool:
        # The callback may receive either a ``file://`` URL or a bare path;
        # fall back to stripping the scheme when the parsed path is unusable.
        path = urlparse(burl).path
        if not path or not Path(path).exists():
            path = burl.replace("file://", "")
        bbody = Path(path).read_bytes()
        assert b'<base href="' + to_bytes(url) + b'">' in bbody
        return True

    response = HtmlResponse(url, body=body)
    assert open_in_browser(response, _openfunc=browser_open), "Browser not called"

    # A plain ``Response`` combined with an unknown keyword must raise.
    resp = Response(url, body=body)
    with pytest.raises(TypeError):
        open_in_browser(resp, debug=True)  # pylint: disable=unexpected-keyword-arg


def test_get_meta_refresh():
    """Check which ``<meta http-equiv="refresh">`` tags are honoured."""
    # Plain refresh tag in the head.
    r1 = HtmlResponse(
        "http://www.example.com",
        body=b"""
    <html>
    <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
    <body>blahablsdfsal&amp;</body>
    </html>""",
    )
    # Refresh tag wrapped in (mixed-case) <noscript>.
    r2 = HtmlResponse(
        "http://www.example.com",
        body=b"""
    <html>
    <head><title>Dummy</title><noScript>
    <meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
    </noSCRIPT>
    <body>blahablsdfsal&amp;</body>
    </html>""",
    )
    # Refresh tags only inside <noscript> and inside a script string.
    r3 = HtmlResponse(
        "http://www.example.com",
        body=b"""
<noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage"></noscript>
<script type="text/javascript">
if(!checkCookies()){
    document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
}
</script>
    """,
    )
    # Relative refresh target that must be resolved against <base href>.
    r4 = HtmlResponse(
        "http://www.example.com",
        body=b"""
    <html>
    <head><title>Dummy</title><base href="http://www.another-domain.com/base/path/" /><meta http-equiv="refresh" content="5;url=target.html" /></head>
    <body>blahablsdfsal&amp;</body>
    </html>""",
    )

    assert get_meta_refresh(r1) == (5.0, "http://example.org/newpage")
    assert get_meta_refresh(r2) == (None, None)
    assert get_meta_refresh(r3) == (None, None)
    assert get_meta_refresh(r4) == (
        5.0,
        "http://www.another-domain.com/base/path/target.html",
    )


def test_get_base_url():
    """Check base URL detection with and without a ``<base>`` element."""
    resp = HtmlResponse(
        "http://www.example.com",
        body=b"""
    <html>
    <head><base href="http://www.example.com/img/" target="_blank"></head>
    <body>blahablsdfsal&amp;</body>
    </html>""",
    )
    assert get_base_url(resp) == "http://www.example.com/img/"

    # Without a <base> element the response URL itself is expected.
    resp2 = HtmlResponse(
        "http://www.example.com",
        body=b"""
    <html><body>blahablsdfsal&amp;</body></html>""",
    )
    assert get_base_url(resp2) == "http://www.example.com"


def test_response_status_message():
    """Check the text rendered for known and unknown status codes."""
    assert response_status_message(200) == "200 OK"
    assert response_status_message(404) == "404 Not Found"
    assert response_status_message(573) == "573 Unknown Status"


def test_inject_base_url():
    """Check that exactly one ``<base>`` tag is injected per document."""
    url = "http://www.example.com"

    def check_base_url(burl):
        # Same path recovery as in test_open_in_browser's callback.
        path = urlparse(burl).path
        if not path or not Path(path).exists():
            path = burl.replace("file://", "")
        bbody = Path(path).read_bytes()
        assert bbody.count(b'<base href="' + to_bytes(url) + b'">') == 1
        return True

    r1 = HtmlResponse(
        url,
        body=b"""
    <html>
        <head><title>Dummy</title></head>
        <body><p>Hello world.</p></body>
    </html>""",
    )
    r2 = HtmlResponse(
        url,
        body=b"""
    <html>
        <head id="foo"><title>Dummy</title></head>
        <body>Hello world.</body>
    </html>""",
    )
    r3 = HtmlResponse(
        url,
        body=b"""
    <html>
        <head><title>Dummy</title></head>
        <body>
            <header>Hello header</header>
            <p>Hello world.</p>
        </body>
    </html>""",
    )
    r4 = HtmlResponse(
        url,
        body=b"""
    <html>
        <!-- <head>Dummy comment</head> -->
        <head><title>Dummy</title></head>
        <body><p>Hello world.</p></body>
    </html>""",
    )
    r5 = HtmlResponse(
        url,
        body=b"""
    <html>
        <!--[if IE]>
        <head><title>IE head</title></head>
        <![endif]-->
        <!--[if !IE]>-->
        <head><title>Standard head</title></head>
        <!--<![endif]-->
        <body><p>Hello world.</p></body>
    </html>""",
    )

    assert open_in_browser(r1, _openfunc=check_base_url), "Inject base url"
    assert open_in_browser(r2, _openfunc=check_base_url), (
        "Inject base url with argumented head"
    )
    assert open_in_browser(r3, _openfunc=check_base_url), (
        "Inject unique base url with misleading tag"
    )
    assert open_in_browser(r4, _openfunc=check_base_url), (
        "Inject unique base url with misleading comment"
    )
    assert open_in_browser(r5, _openfunc=check_base_url), (
        "Inject unique base url with conditional comment"
    )


def test_open_in_browser_redos_comment():
    """Guard against slow comment handling on a hostile body."""
    MAX_CPU_TIME = 0.02

    # Exploit input from
    # https://makenowjust-labs.github.io/recheck/playground/
    # for /<!--.*?-->/ (old pattern to remove comments).
    body = b"-><!--\x00" * 25_000 + b"->\n<!---->"

    response = HtmlResponse("https://example.com", body=body)

    # CPU time rather than wall-clock time, via time.process_time().
    start_time = process_time()

    open_in_browser(response, lambda url: True)

    end_time = process_time()
    assert end_time - start_time < MAX_CPU_TIME


def test_open_in_browser_redos_head():
    """Guard against slow ``<head>`` detection on a hostile body."""
    MAX_CPU_TIME = 0.02

    # Exploit input from
    # https://makenowjust-labs.github.io/recheck/playground/
    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
    body = b"<head\t" * 8_000

    response = HtmlResponse("https://example.com", body=body)

    start_time = process_time()

    open_in_browser(response, lambda url: True)

    end_time = process_time()
    assert end_time - start_time < MAX_CPU_TIME


@pytest.mark.parametrize(
    ("input_body", "output_body"),
    [
        (b"a<!--", b"a"),
        (b"a<!---->b", b"ab"),
        (b"a<!--b-->c", b"ac"),
        (b"a<!--b-->c<!--", b"ac"),
        (b"a<!--b-->c<!--d", b"ac"),
        (b"a<!--b-->c<!---->d", b"acd"),
        (b"a<!--b--><!--c-->d", b"ad"),
    ],
)
def test_remove_html_comments(input_body, output_body):
    """Check comment removal, including unterminated trailing comments."""
    assert _remove_html_comments(input_body) == output_body