1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
import unittest
from scrapy.spider import BaseSpider
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url
__doctests__ = ['scrapy.utils.url']
class UrlUtilsTest(unittest.TestCase):
def test_url_is_from_any_domain(self):
url = 'http://www.wheele-bin-art.co.uk/get/product/123'
self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))
url = 'http://wheele-bin-art.co.uk/get/product/123'
self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))
url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
self.assertFalse(url_is_from_any_domain(url, ['testdomain.com']))
self.assertFalse(url_is_from_any_domain(url+'.testdomain.com', ['testdomain.com']))
def test_url_is_from_spider(self):
spider = BaseSpider(name='example.com')
self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', spider))
self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', spider))
def test_url_is_from_spider_class_attributes(self):
class MySpider(BaseSpider):
name = 'example.com'
self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', MySpider))
self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', MySpider))
self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
def test_url_is_from_spider_with_allowed_domains(self):
spider = BaseSpider(name='example.com', allowed_domains=['example.org', 'example.net'])
self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://example.com/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))
def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
class MySpider(BaseSpider):
name = 'example.com'
allowed_domains = ['example.org', 'example.net']
self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', MySpider))
self.assertTrue(url_is_from_spider('http://example.com/some/page.html', MySpider))
self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', MySpider))
self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))
def test_canonicalize_url(self):
# simplest case
self.assertEqual(canonicalize_url("http://www.example.com"),
"http://www.example.com")
# always return a str
assert isinstance(canonicalize_url(u"http://www.example.com"), str)
# typical usage
self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
"http://www.example.com/do?a=1&b=2&c=3")
self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
"http://www.example.com/do?a=3&b=2&c=1")
self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
"http://www.example.com/do?a=1")
# sorting by argument values
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
"http://www.example.com/do?a=50&b=2&b=5&c=3")
# using keep_blank_values
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
"http://www.example.com/do?a=2")
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
"http://www.example.com/do?a=2&b=")
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
"http://www.example.com/do?a=2")
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
"http://www.example.com/do?a=2&b=&c=")
self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
'http://www.example.com/do?1750%2C4=')
# spaces
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
"http://www.example.com/do?a=1&q=a+space")
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
"http://www.example.com/do?a=1&q=a+space")
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
"http://www.example.com/do?a=1&q=a+space")
# normalize percent-encoding case (in paths)
self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
"http://www.example.com/a%A3do"),
# normalize percent-encoding case (in query arguments)
self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
"http://www.example.com/do?k=b%A3")
# non-ASCII percent-encoding in paths
self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
"http://www.example.com/a%20do?a=1"),
self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
"http://www.example.com/a%20%20do?a=1"),
self.assertEqual(canonicalize_url("http://www.example.com/a do\xc2\xa3.html?a=1"),
"http://www.example.com/a%20do%C2%A3.html?a=1")
# non-ASCII percent-encoding in query arguments
self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=\xa3500&a=5&z=3"),
u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
self.assertEqual(canonicalize_url("http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
self.assertEqual(canonicalize_url("http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
"http://www.example.com/do?a=1&price%28%C2%A3%29=500")
# urls containing auth and ports
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
u"http://user:pass@www.example.com:81/do?now=1")
# remove fragments
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
u"http://user:pass@www.example.com/do?a=1")
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
u"http://user:pass@www.example.com/do?a=1#frag")
# dont convert safe characters to percent encoding representation
self.assertEqual(canonicalize_url(
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
# urllib.quote uses a mapping cache of encoded characters. when parsing
# an already percent-encoded url, it will fail if that url was not
# percent-encoded as utf-8, that's why canonicalize_url must always
# convert the urls to string. the following test asserts that
# functionality.
self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
'http://www.example.com/caf%E9-con-leche.htm')
# domains are case insensitive
self.assertEqual(canonicalize_url("http://www.EXAMPLE.com"),
"http://www.example.com")
if __name__ == "__main__":
unittest.main()
|