"""
Selectors tests, common for all backends
"""
import re
import weakref

from twisted.trial import unittest

from scrapy.http import TextResponse, HtmlResponse, XmlResponse
from scrapy.selector import XmlXPathSelector, HtmlXPathSelector, \
    XPathSelector
from scrapy.utils.test import libxml2debug


class XPathSelectorTestCase(unittest.TestCase):
    """Backend-independent tests for the XPath selector classes.

    The selector classes under test are exposed as class attributes so
    that a subclass can rerun the same tests against other classes.

    NOTE(review): in the copy of this file that was reviewed, the HTML/XML
    markup inside the string fixtures was missing (only the text content
    was left).  The fixtures below were reconstructed from the XPath
    queries and the expected values asserted in each test -- confirm them
    against the upstream test-suite before relying on exact markup.
    """

    # Selector classes exercised by the tests below.
    xs_cls = XPathSelector
    hxs_cls = HtmlXPathSelector
    xxs_cls = XmlXPathSelector

    @libxml2debug
    def test_selector_simple(self):
        """Simple selector tests"""
        # NOTE(review): fixture markup reconstructed from the assertions.
        body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>"
        response = TextResponse(url="http://example.com", body=body)
        xpath = self.hxs_cls(response)

        xl = xpath.select('//input')
        self.assertEqual(2, len(xl))
        for x in xl:
            assert isinstance(x, self.hxs_cls)

        # Extracting the list as a whole must match extracting each item.
        self.assertEqual(xpath.select('//input').extract(),
                         [x.extract() for x in xpath.select('//input')])

        self.assertEqual(
            [x.extract() for x in xpath.select("//input[@name='a']/@name")],
            [u'a'])
        self.assertEqual(
            [x.extract() for x in xpath.select(
                "number(concat(//input[@name='a']/@value, "
                "//input[@name='b']/@value))")],
            [u'12.0'])

        self.assertEqual(xpath.select("concat('xpath', 'rules')").extract(),
                         [u'xpathrules'])
        self.assertEqual(
            [x.extract() for x in xpath.select(
                "concat(//input[@name='a']/@value, "
                "//input[@name='b']/@value)")],
            [u'12'])

    def test_selector_unicode_query(self):
        """A unicode XPath query matches a non-ASCII attribute value."""
        # NOTE(review): fixture markup reconstructed from the assertion.
        body = u"<p><input name='\xa9' value='1'/></p>"
        response = TextResponse(url="http://example.com", body=body,
                                encoding='utf8')
        xpath = self.hxs_cls(response)
        self.assertEqual(
            xpath.select(u'//input[@name="\xa9"]/@value').extract(),
            [u'1'])

    @libxml2debug
    def test_selector_same_type(self):
        """Test XPathSelector returning the same type in x() method"""
        # NOTE(review): fixture markup reconstructed from the "//p" query.
        text = '<p>test</p>'
        assert isinstance(self.xxs_cls(text=text).select("//p")[0],
                          self.xxs_cls)
        assert isinstance(self.hxs_cls(text=text).select("//p")[0],
                          self.hxs_cls)

    @libxml2debug
    def test_selector_boolean_result(self):
        """Boolean XPath expressions extract to a backend-specific string."""
        # NOTE(review): fixture markup reconstructed from the queries.
        body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>"
        response = TextResponse(url="http://example.com", body=body)
        xs = self.hxs_cls(response)
        true = xs.select("//input[@name='a']/@name='a'").extract()[0]
        false = xs.select("//input[@name='a']/@name='n'").extract()[0]

        # the actual result depends on the backend used
        assert true in [u'1', u'True'], true
        assert false in [u'0', u'False'], false

    @libxml2debug
    def test_selector_xml_html(self):
        """Test that XML and HTML XPathSelector's behave differently"""

        # some text which is parsed differently by XML and HTML flavors
        # NOTE(review): the input and both expected serialisations were
        # reconstructed; the exact output depends on the parser backend --
        # verify against upstream.
        text = '<div><img src="a.jpg"><p>Hello</div>'

        self.assertEqual(
            self.xxs_cls(text=text).select("//div").extract(),
            [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

        self.assertEqual(
            self.hxs_cls(text=text).select("//div").extract(),
            [u'<div><img src="a.jpg"><p>Hello</p></div>'])

    @libxml2debug
    def test_selector_nested(self):
        """Nested selector tests"""
        # NOTE(review): fixture markup reconstructed from the assertions.
        body = """<body>
                    <div class='one'>
                      <ul>
                        <li>one</li><li>two</li>
                      </ul>
                    </div>
                    <div class='two'>
                      <ul>
                        <li>four</li><li>five</li><li>six</li>
                      </ul>
                    </div>
                  </body>"""

        response = HtmlResponse(url="http://example.com", body=body)
        x = self.hxs_cls(response)

        divtwo = x.select('//div[@class="two"]')
        # An absolute path ("//li") searches the whole document even when
        # it is applied to a nested selector.
        self.assertEqual(
            [li.strip() for li in divtwo.select("//li").extract()],
            ["<li>one</li>", "<li>two</li>", "<li>four</li>",
             "<li>five</li>", "<li>six</li>"])
        # Relative paths are evaluated against the nested selector only.
        self.assertEqual(
            [li.strip() for li in divtwo.select("./ul/li").extract()],
            ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
        self.assertEqual(
            [li.strip() for li in divtwo.select(".//li").extract()],
            ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
        self.assertEqual(divtwo.select("./li").extract(), [])

    @libxml2debug
    def test_dont_strip(self):
        """Whitespace inside extracted text nodes is preserved."""
        # NOTE(review): fixture markup reconstructed; any markup that
        # splits the text into the two asserted nodes would do.
        hxs = self.hxs_cls(text='<div>fff: <a href="#">zzz</a></div>')
        self.assertEqual(hxs.select("//text()").extract(),
                         [u'fff: ', u'zzz'])

    @libxml2debug
    def test_selector_namespaces_simple(self):
        """A registered namespace prefix can be used in XPath queries."""
        # NOTE(review): fixture markup reconstructed from the assertion.
        body = """
        <test xmlns:somens='http://scrapy.org'>
           <somens:a id='foo'>take this</somens:a>
           <a id='bar'>found</a>
        </test>
        """

        response = XmlResponse(url="http://example.com", body=body)
        x = self.xxs_cls(response)

        x.register_namespace("somens", "http://scrapy.org")
        self.assertEqual(x.select("//somens:a/text()").extract(),
                         [u'take this'])

    @libxml2debug
    def test_selector_namespaces_multiple(self):
        """Several registered namespaces can be mixed in one query."""
        # NOTE(review): fixture markup reconstructed from the assertions.
        body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
            xmlns:b="http://somens.com"
            xmlns:p="http://www.scrapy.org/product" >
    <b:Operation>hello</b:Operation>
    <TestTag b:att="value"><Other>value</Other></TestTag>
    <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
        """
        response = XmlResponse(url="http://example.com", body=body)
        x = self.xxs_cls(response)

        x.register_namespace(
            "xmlns",
            "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
        x.register_namespace("p", "http://www.scrapy.org/product")
        x.register_namespace("b", "http://somens.com")
        self.assertEqual(len(x.select("//xmlns:TestTag")), 1)
        self.assertEqual(x.select("//b:Operation/text()").extract()[0],
                         'hello')
        self.assertEqual(x.select("//xmlns:TestTag/@b:att").extract()[0],
                         'value')
        self.assertEqual(
            x.select("//p:SecondTestTag/xmlns:price/text()").extract()[0],
            '90')
        self.assertEqual(
            x.select("//p:SecondTestTag")
             .select("./xmlns:price/text()")[0].extract(),
            '90')
        self.assertEqual(
            x.select("//p:SecondTestTag/xmlns:material/text()").extract()[0],
            'iron')

    @libxml2debug
    def test_selector_re(self):
        """re() accepts both compiled patterns and pattern strings."""
        # NOTE(review): fixture markup reconstructed from the assertions.
        body = """<div>Name: Mary
                    <ul>
                      <li>Name: John</li>
                      <li>Age: 10</li>
                      <li>Name: Paul</li>
                      <li>Age: 20</li>
                    </ul>
                    Age: 20
                  </div>

               """
        response = HtmlResponse(url="http://example.com", body=body)
        x = self.hxs_cls(response)

        # Raw strings keep the regex escapes (\w, \d) intact.
        name_re = re.compile(r"Name: (\w+)")
        self.assertEqual(x.select("//ul/li").re(name_re),
                         ["John", "Paul"])
        self.assertEqual(x.select("//ul/li").re(r"Age: (\d+)"),
                         ["10", "20"])

    @libxml2debug
    def test_selector_over_text(self):
        """Selectors can be built from text instead of a response."""
        # NOTE(review): input markup and expected serialisations were
        # reconstructed; the exact output depends on the parser backend --
        # verify against upstream.
        hxs = self.hxs_cls(text='<root>lala</root>')
        self.assertEqual(hxs.extract(),
                         u'<html><body><root>lala</root></body></html>')

        xxs = self.xxs_cls(text='<root>lala</root>')
        self.assertEqual(xxs.extract(),
                         u'<root>lala</root>')

        xxs = self.xxs_cls(text='<root>lala</root>')
        self.assertEqual(xxs.select('.').extract(),
                         [u'<root>lala</root>'])

    @libxml2debug
    def test_selector_invalid_xpath(self):
        """An invalid XPath raises ValueError mentioning the expression."""
        # NOTE(review): the body was empty in the reviewed copy; presumably
        # a minimal document was stripped -- confirm against upstream.
        response = XmlResponse(url="http://example.com", body="<html></html>")
        x = self.hxs_cls(response)
        xpath = "//test[@foo='bar]"  # deliberately unbalanced quote
        try:
            x.select(xpath)
        except ValueError as e:
            assert xpath in str(e), \
                "Exception message does not contain invalid xpath"
        except Exception:
            raise AssertionError("A invalid XPath does not raise ValueError")
        else:
            raise AssertionError("A invalid XPath does not raise an exception")

    @libxml2debug
    def test_http_header_encoding_precedence(self):
        """The HTTP header charset wins over the <meta> declaration."""
        # u'\xa3'     = pound symbol in unicode
        # u'\xc2\xa3' = pound symbol in utf-8
        # u'\xa3'     = pound symbol in latin-1 (iso-8859-1)

        # NOTE(review): markup reconstructed from the comments above and
        # the "//span[@id='blank']" query below.
        meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
        head = u'<head>' + meta + u'</head>'
        body_content = u'<span id="blank">\xa3</span>'
        body = u'<body>' + body_content + u'</body>'
        html = u'<html>' + head + body + u'</html>'
        encoding = 'utf-8'
        html_utf8 = html.encode(encoding)

        headers = {'Content-Type': ['text/html; charset=utf-8']}
        response = HtmlResponse(url="http://example.com", headers=headers,
                                body=html_utf8)
        x = self.hxs_cls(response)
        self.assertEqual(x.select("//span[@id='blank']/text()").extract(),
                         [u'\xa3'])

    @libxml2debug
    def test_empty_bodies(self):
        """Building selectors over an empty response body must not fail."""
        r1 = TextResponse('http://www.example.com', body='')
        self.hxs_cls(r1)  # shouldn't raise error
        self.xxs_cls(r1)  # shouldn't raise error

    @libxml2debug
    def test_weakref_slots(self):
        """Check that classes are using slots and are weak-referenceable"""
        for cls in [self.xs_cls, self.hxs_cls, self.xxs_cls]:
            x = cls()
            weakref.ref(x)
            assert not hasattr(x, '__dict__'), \
                "%s does not use __slots__" % x.__class__.__name__