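"""Tests for HTTPS CONNECT tunneling through a proxy that requires
authentication, using mitmproxy (mitmdump) as the proxy process."""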
import json
import os
import re
import sys
from pathlib import Path
from subprocess import PIPE, Popen
from urllib.parse import urlsplit, urlunsplit

import pytest
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.http import Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import SimpleSpider, SingleRequestSpider


class MitmProxy:
    """Runs mitmdump in a subprocess, acting as an HTTP/HTTPS proxy that
    requires authentication."""

    auth_user = "scrapy"
    auth_pass = "scrapy"

    def start(self):
        # Kept flush-left: the string is executed via "python -c", which
        # rejects indented top-level code.
        script = """
import sys
from mitmproxy.tools.main import mitmdump
sys.argv[0] = "mitmdump"
sys.exit(mitmdump())
"""
        cert_path = Path(__file__).parent.resolve() / "keys"
        self.proc = Popen(
            [
                sys.executable,
                "-u",
                "-c",
                script,
                "--listen-host",
                "127.0.0.1",
                # Port 0 makes the OS pick a free port; the actual address is
                # parsed from mitmdump's startup message below.
                "--listen-port",
                "0",
                # Credentials that clients must present to use the proxy.
                "--proxyauth",
                f"{self.auth_user}:{self.auth_pass}",
                # Use the CA material bundled with the tests instead of the
                # default ~/.mitmproxy directory.
                "--set",
                f"confdir={cert_path}",
                # Do not verify upstream server certificates.
                "--ssl-insecure",
            ],
            stdout=PIPE,
        )
        # mitmdump announces its listen address on the first line of output.
        line = self.proc.stdout.readline().decode("utf-8")
        host_port = re.search(r"listening at (?:http://)?([^:]+:\d+)", line).group(1)
        return f"http://{self.auth_user}:{self.auth_pass}@{host_port}"

    def stop(self):
        self.proc.kill()
        self.proc.communicate()
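
# The URL returned by MitmProxy.start() embeds the proxy credentials; the
# tests below install it through the http_proxy/https_proxy environment
# variables, which Scrapy's HttpProxyMiddleware reads.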

def _wrong_credentials(proxy_url):
    # Swap the known-good userinfo in the URL's netloc for credentials the
    # proxy will reject.
    bad_auth_proxy = list(urlsplit(proxy_url))
    bad_auth_proxy[1] = bad_auth_proxy[1].replace("scrapy:scrapy@", "wrong:wronger@")
    return urlunsplit(bad_auth_proxy)
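
# For example (hypothetical port), _wrong_credentials() turns
# "http://scrapy:scrapy@127.0.0.1:54321" into "http://wrong:wronger@127.0.0.1:54321".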

class TestProxyConnect(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    def setUp(self):
        try:
            import mitmproxy  # noqa: F401
        except ImportError:
            pytest.skip("mitmproxy is not installed")
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ["https_proxy"] = proxy_url
        os.environ["http_proxy"] = proxy_url

    def tearDown(self):
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ["https_proxy"] = _wrong_credentials(os.environ["https_proxy"])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy responds with a 407, but it never reaches the client,
        # which only sees a TunnelError.
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as log:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, log)
        # The /echo endpoint returns the request it received as JSON, so the
        # response shows exactly which headers reached the remote server.
        echo = json.loads(crawler.spider.meta["responses"][0].text)
        assert "Proxy-Authorization" not in echo["headers"]

    def _assert_got_response_code(self, code, log):
        print(log)
        assert str(log).count(f"Crawled ({code})") == 1

    def _assert_got_tunnel_error(self, log):
        print(log)
        assert "TunnelError" in str(log)