Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <p><a href="/about.html">About us</a></p> <img src="/logo.png" alt="Company logo (not a link)" /> <p><a href="../othercat.html">Other category</a></p> <p><a href="/" /></p> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html) lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/somepage/item/12.html', text='Item 12'), Link(url='http://example.org/about.html', text='About us'), Link(url='http://example.org/othercat.html', text='Other category'), Link(url='http://example.org/', text='')]) def test_base_url(self): html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" /> <body><p><a href="item/12.html">Item 12</a></p> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html) lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href self.assertEqual(lx.extract_links(response), [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]) def test_extraction_encoding(self): body = get_testdata('link_extractor', 'linkextractor_noenc.html') response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']}) response_noenc = HtmlResponse(url='http://example.com/noenc', body=body) body = get_testdata('link_extractor', 'linkextractor_latin1.html') response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body) lx = BaseSgmlLinkExtractor() self.assertEqual(lx.extract_links(response_utf8), [ Link(url='http://example.com/sample_%C3%B1.html', text=''), Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ]) self.assertEqual(lx.extract_links(response_noenc), [ Link(url='http://example.com/sample_%C3%B1.html', text=''), Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ]) self.assertEqual(lx.extract_links(response_latin1), [ Link(url='http://example.com/sample_%F1.html', text=''), Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')) ]) def test_matches(self): url1 = 'http://lotsofstuff.com/stuff1/index' url2 = 'http://evenmorestuff.com/uglystuff/index' lx = BaseSgmlLinkExtractor() self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), True) class SgmlLinkExtractorTestCase(unittest.TestCase): def setUp(self): body = get_testdata('link_extractor', 'sgml_linkextractor.html') self.response = HtmlResponse(url='http://example.com/index', body=body) def test_urls_type(self): '''Test that the resulting urls are regular strings and not a unicode objects''' lx = SgmlLinkExtractor() self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response))) def test_extraction(self): '''Test the extractor's behaviour among different situations''' lx = SgmlLinkExtractor() self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text'), Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(allow=('sample', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text') ]) lx = SgmlLinkExtractor(allow=('sample', ), unique=False) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text'), Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ]) lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) lx = SgmlLinkExtractor(allow_domains=('google.com', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample2.jpg', text=u'') ]) def test_extraction_using_single_values(self): '''Test the extractor's behaviour among different situations''' lx = SgmlLinkExtractor(allow='sample') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text') ]) lx = SgmlLinkExtractor(allow='sample', deny='3') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) lx = SgmlLinkExtractor(allow_domains='google.com') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(deny_domains='example.com') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) def test_matches(self): url1 = 'http://lotsofstuff.com/stuff1/index' url2 = 'http://evenmorestuff.com/uglystuff/index' lx = SgmlLinkExtractor(allow=(r'stuff1', )) self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), False) lx = SgmlLinkExtractor(deny=(r'uglystuff', )) self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), False) lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', )) self.assertEqual(lx.matches(url1), False) self.assertEqual(lx.matches(url2), True) lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', )) self.assertEqual(lx.matches(url1), False) self.assertEqual(lx.matches(url2), True) lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ), allow_domains=('blah1.com', ), deny_domains=('blah2.com', )) self.assertEqual(lx.matches('http://blah1.com/blah1'), True) self.assertEqual(lx.matches('http://blah1.com/blah2'), False) self.assertEqual(lx.matches('http://blah2.com/blah1'), False) self.assertEqual(lx.matches('http://blah2.com/blah2'), False) def test_restrict_xpaths(self): lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) def test_restrict_xpaths_encoding(self): """Test restrict_xpaths with encodings""" html = """<html><head><title>Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <div class='links'> <p><a href="/about.html">About us\xa3</a></p> </div> <div> <p><a href="/nofollow.html">This shouldn't be followed</a></p> </div> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252') lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']") self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/about.html', text=u'About us\xa3')]) def test_process_value(self): """Test restrict_xpaths with encodings""" html = """ <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a> <a href="/about.html">About us</a> """ response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252') def process_value(value): m = re.search("javascript:goToPage\('(.*?)'", value) if m: return m.group(1) lx = SgmlLinkExtractor(process_value=process_value) self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/other/page.html', text='Link text')]) class HTMLImageLinkExtractorTestCase(unittest.TestCase): def setUp(self): body = get_testdata('link_extractor', 'image_linkextractor.html') self.response = HtmlResponse(url='http://example.com/index', body=body) def tearDown(self): del self.response def test_urls_type(self): '''Test that the resulting urls are regular strings and not a unicode objects''' lx = HTMLImageLinkExtractor() links = lx.extract_links(self.response) self.assertTrue(all(isinstance(link.url, str) for link in links)) def test_extraction(self): '''Test the extractor's behaviour among different situations''' lx = HTMLImageLinkExtractor(locations=('//img', )) links_1 = lx.extract_links(self.response) self.assertEqual(links_1, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4') ]) lx = HTMLImageLinkExtractor(locations=('//img', ), unique=False) links_2 = lx.extract_links(self.response) self.assertEqual(links_2, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4'), Link(url='http://example.com/sample4.jpg', text=u'sample 4 repetition') ]) lx = HTMLImageLinkExtractor(locations=('//div[@id="wrapper"]', )) links_3 = lx.extract_links(self.response) self.assertEqual(links_3, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4') ]) lx = HTMLImageLinkExtractor(locations=('//a', )) links_4 = lx.extract_links(self.response) self.assertEqual(links_4, [ Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3') ]) if __name__ == "__main_

import re import unittest from scrapy.http import HtmlResponse from scrapy.link import Link from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor from scrapy.tests import get_testdata class LinkExtractorTestCase(unittest.TestCase): def test_basic(self): html = """Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <p><a href="/about.html">About us</a></p> <img src="/logo.png" alt="Company logo (not a link)" /> <p><a href="../othercat.html">Other category</a></p> <p><a href="/" /></p> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html) lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/somepage/item/12.html', text='Item 12'), Link(url='http://example.org/about.html', text='About us'), Link(url='http://example.org/othercat.html', text='Other category'), Link(url='http://example.org/', text='')]) def test_base_url(self): html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" /> <body><p><a href="item/12.html">Item 12</a></p> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html) lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href self.assertEqual(lx.extract_links(response), [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]) def test_extraction_encoding(self): body = get_testdata('link_extractor', 'linkextractor_noenc.html') response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']}) response_noenc = HtmlResponse(url='http://example.com/noenc', body=body) body = get_testdata('link_extractor', 'linkextractor_latin1.html') response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body) lx = BaseSgmlLinkExtractor() self.assertEqual(lx.extract_links(response_utf8), [ Link(url='http://example.com/sample_%C3%B1.html', text=''), Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ]) self.assertEqual(lx.extract_links(response_noenc), [ Link(url='http://example.com/sample_%C3%B1.html', text=''), Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ]) self.assertEqual(lx.extract_links(response_latin1), [ Link(url='http://example.com/sample_%F1.html', text=''), Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')) ]) def test_matches(self): url1 = 'http://lotsofstuff.com/stuff1/index' url2 = 'http://evenmorestuff.com/uglystuff/index' lx = BaseSgmlLinkExtractor() self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), True) class SgmlLinkExtractorTestCase(unittest.TestCase): def setUp(self): body = get_testdata('link_extractor', 'sgml_linkextractor.html') self.response = HtmlResponse(url='http://example.com/index', body=body) def test_urls_type(self): '''Test that the resulting urls are regular strings and not a unicode objects''' lx = SgmlLinkExtractor() self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response))) def test_extraction(self): '''Test the extractor's behaviour among different situations''' lx = SgmlLinkExtractor() self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text'), Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(allow=('sample', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text') ]) lx = SgmlLinkExtractor(allow=('sample', ), unique=False) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text'), Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ]) lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) lx = SgmlLinkExtractor(allow_domains=('google.com', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample2.jpg', text=u'') ]) def test_extraction_using_single_values(self): '''Test the extractor's behaviour among different situations''' lx = SgmlLinkExtractor(allow='sample') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3 text') ]) lx = SgmlLinkExtractor(allow='sample', deny='3') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) lx = SgmlLinkExtractor(allow_domains='google.com') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) lx = SgmlLinkExtractor(deny_domains='example.com') self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://www.google.com/something', text=u'') ]) def test_matches(self): url1 = 'http://lotsofstuff.com/stuff1/index' url2 = 'http://evenmorestuff.com/uglystuff/index' lx = SgmlLinkExtractor(allow=(r'stuff1', )) self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), False) lx = SgmlLinkExtractor(deny=(r'uglystuff', )) self.assertEqual(lx.matches(url1), True) self.assertEqual(lx.matches(url2), False) lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', )) self.assertEqual(lx.matches(url1), False) self.assertEqual(lx.matches(url2), True) lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', )) self.assertEqual(lx.matches(url1), False) self.assertEqual(lx.matches(url2), True) lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ), allow_domains=('blah1.com', ), deny_domains=('blah2.com', )) self.assertEqual(lx.matches('http://blah1.com/blah1'), True) self.assertEqual(lx.matches('http://blah1.com/blah2'), False) self.assertEqual(lx.matches('http://blah2.com/blah1'), False) self.assertEqual(lx.matches('http://blah2.com/blah2'), False) def test_restrict_xpaths(self): lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2') ]) def test_restrict_xpaths_encoding(self): """Test restrict_xpaths with encodings""" html = """<html><head><title>Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <div class='links'> <p><a href="/about.html">About us\xa3</a></p> </div> <div> <p><a href="/nofollow.html">This shouldn't be followed</a></p> </div> </body></html>""" response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252') lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']") self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/about.html', text=u'About us\xa3')]) def test_process_value(self): """Test restrict_xpaths with encodings""" html = """ <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a> <a href="/about.html">About us</a> """ response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252') def process_value(value): m = re.search("javascript:goToPage\('(.*?)'", value) if m: return m.group(1) lx = SgmlLinkExtractor(process_value=process_value) self.assertEqual(lx.extract_links(response), [Link(url='http://example.org/other/page.html', text='Link text')]) class HTMLImageLinkExtractorTestCase(unittest.TestCase): def setUp(self): body = get_testdata('link_extractor', 'image_linkextractor.html') self.response = HtmlResponse(url='http://example.com/index', body=body) def tearDown(self): del self.response def test_urls_type(self): '''Test that the resulting urls are regular strings and not a unicode objects''' lx = HTMLImageLinkExtractor() links = lx.extract_links(self.response) self.assertTrue(all(isinstance(link.url, str) for link in links)) def test_extraction(self): '''Test the extractor's behaviour among different situations''' lx = HTMLImageLinkExtractor(locations=('//img', )) links_1 = lx.extract_links(self.response) self.assertEqual(links_1, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4') ]) lx = HTMLImageLinkExtractor(locations=('//img', ), unique=False) links_2 = lx.extract_links(self.response) self.assertEqual(links_2, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4'), Link(url='http://example.com/sample4.jpg', text=u'sample 4 repetition') ]) lx = HTMLImageLinkExtractor(locations=('//div[@id="wrapper"]', )) links_3 = lx.extract_links(self.response) self.assertEqual(links_3, [ Link(url='http://example.com/sample1.jpg', text=u'sample 1'), Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample4.jpg', text=u'sample 4') ]) lx = HTMLImageLinkExtractor(locations=('//a', )) links_4 = lx.extract_links(self.response) self.assertEqual(links_4, [ Link(url='http://example.com/sample2.jpg', text=u'sample 2'), Link(url='http://example.com/sample3.html', text=u'sample 3') ]) if __name__ == "__main__": unittest.main()