ó
ç'ÊUc           @   s+  d  d l  Z  d  d l Z d  d l m Z d  d l m Z m Z d  d l m Z d  d l	 m
 Z
 d  d l m Z m Z d  d l m Z d  d l m Z d	 e j f d
 „  ƒ  YZ d e j f d „  ƒ  YZ d e f d „  ƒ  YZ d e j f d „  ƒ  YZ d e j f d „  ƒ  YZ e d k r'e j ƒ  n  d S(   iÿÿÿÿN(   t   RegexLinkExtractor(   t   HtmlResponset   XmlResponse(   t   Link(   t   HtmlParserLinkExtractor(   t   SgmlLinkExtractort   BaseSgmlLinkExtractor(   t   LxmlLinkExtractor(   t   get_testdatat   LinkExtractorTestCasec           B   s>   e  Z d  „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   c         C   s˜   d } t  d d | ƒ} t ƒ  } |  j | j | ƒ t d d d d ƒ t d d d d	 ƒ t d d
 d d ƒ t d d d d ƒ t d d d d ƒ g ƒ d  S(   Nsh  <html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/" /></p>
        </body></html>s&   http://example.org/somepage/index.htmlt   bodyt   urls(   http://example.org/somepage/item/12.htmlt   texts   Item 12s   http://example.org/about.htmls   About uss    http://example.org/othercat.htmls   Other categorys   http://example.org/s   >>t    (   R   R   t   assertEqualt   extract_linksR   (   t   selft   htmlt   responset   lx(    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt
   test_basic   s    	c         C   sÖ   d } t  d d | ƒ} t ƒ  } |  j | j | ƒ t d d d d ƒ g ƒ d } t  d	 d | ƒ} |  j | j | ƒ t d d
 d d ƒ g ƒ d } t  d	 d | ƒ} |  j | j | ƒ t d d d d ƒ g ƒ d  S(   NsŸ   <html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>s&   http://example.org/somepage/index.htmlR
   R   s(   http://otherdomain.com/base/item/12.htmlR   s   Item 12s{   <html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>s'   https://example.org/somepage/index.htmls    https://example.org/item/12.htmls—   <html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>s/   https://noschemedomain.com/path/to/item/12.html(   R   R   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_base_url    s    	c         C   sV   d } t  d d | d d ƒ} t ƒ  } |  j | j | ƒ t d d d d	 ƒ g ƒ d  S(
   Ns>   <body><p><a href="item/12.html">Wrong: í</a></p></body></html>s   http://www.example.comR
   t   encodings   utf-8R   s#   http://www.example.com/item/12.htmlR   u
   Wrong: ï¿½(   R   R   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_link_text_wrong_encoding8   s
    	c      	   C   sL  t  d d ƒ } t d d d | d i d g d 6ƒ } t d d	 d | ƒ } t  d d
 ƒ } t d d d | ƒ } t ƒ  } |  j | j | ƒ t d d d d ƒ t d d d d j d ƒ ƒ g ƒ |  j | j | ƒ t d d d d ƒ t d d d d j d ƒ ƒ g ƒ |  j | j | ƒ t d d d d ƒ t d d d d j d ƒ ƒ g ƒ d  S(   Nt   link_extractors   linkextractor_noenc.htmlR   s   http://example.com/utf8R
   t   headerss   text/html; charset=utf-8s   Content-Types   http://example.com/noencs   linkextractor_latin1.htmls   http://example.com/latin1s%   http://example.com/sample_%C3%B1.htmlR   R   s(   http://example.com/sample_%E2%82%AC.htmls   sample â‚¬ texts   utf-8s"   http://example.com/sample_%F1.htmls"   http://example.com/sample_%E1.htmls   sample á textt   latin1(   R   R   R   R   R   R   t   decode(   R   R
   t   response_utf8t   response_noenct   response_latin1R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_extraction_encoding@   s    %	""c         C   sK   d } d } t  ƒ  } |  j | j | ƒ t ƒ |  j | j | ƒ t ƒ d  S(   Ns#   http://lotsofstuff.com/stuff1/indexs(   http://evenmorestuff.com/uglystuff/index(   R   R   t   matchest   True(   R   t   url1t   url2R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_matchesW   s
    	c      
   C   s   d } t  d d | ƒ} t ƒ  } |  j g  | j | ƒ D] } | ^ q7 t d d d d d t ƒ t d d	 d d
 d t ƒ g ƒ d  S(   Ns…   
        <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
        <a href="about.html">About us</a>
        s   http://example.org/page.htmlR
   R   s)   http://example.org/page.html?action=printR   u   Printer-friendly paget   nofollows   http://example.org/about.htmlu   About us(   R   R   R   R   R   R!   t   False(   R   R   R   R   t   link(    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_link_nofollow_   s    	%(   t   __name__t
   __module__R   R   R   R   R$   R(   (    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR	      s   					t   SgmlLinkExtractorTestCasec           B   sÔ   e  Z e Z d  „  Z d „  Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 d „  Z d „  Z d	 „  Z d
 „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   c         C   s+   t  d d ƒ } t d d d | ƒ |  _ d  S(   NR   s   sgml_linkextractor.htmlR   s   http://example.com/indexR
   (   R   R   R   (   R   R
   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   setUpo   s    c         C   s9   |  j  ƒ  } |  j t d „  | j |  j ƒ Dƒ ƒ ƒ d S(   sJ   Test that the resulting urls are regular strings and not a unicode objectsc         s   s!   |  ] } t  | j t ƒ Vq d  S(   N(   t
   isinstanceR   t   str(   t   .0R'   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pys	   <genexpr>v   s    N(   t   extractor_clst
   assertTruet   allR   R   (   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_urls_types   s    c         C   sÔ  |  j  ƒ  } |  j g  | j |  j ƒ D] } | ^ q% t d d d d ƒ t d d d d ƒ t d d d d ƒ t d d	 d d ƒ t d d
 d d ƒ g ƒ |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ qÀ t d d d d ƒ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d d t ƒ } |  j g  | j |  j ƒ D] } | ^ q=t d d d d ƒ t d d d d ƒ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ qÆt d d d d ƒ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d d d ƒ } |  j g  | j |  j ƒ D] } | ^ qCt d d d d ƒ t d d d d ƒ g ƒ |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ q¨t d d	 d d ƒ g ƒ d S(   s9   Test the extractor's behaviour among different situationsR   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2s   http://example.com/sample3.htmlu   sample 3 texts   http://www.google.com/somethings    http://example.com/innertag.htmlu	   inner tagt   allowt   samplet   uniqueu   sample 3 repetitiont   denyt   3t   allow_domainss
   google.comN(   s   sample(   s   sample(   s   sample(   s   sample(   R8   (   s
   google.com(   R0   R   R   R   R   R&   (   R   R   R'   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_extractionx   s<    ((((((c      	   C   sŒ  |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ q+ t d d d d ƒ t d d d d ƒ t d d	 d d
 ƒ g ƒ |  j  d d d d ƒ } |  j g  | j |  j ƒ D] } | ^ q¨ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ qt d d d d ƒ g ƒ |  j  d d ƒ } |  j g  | j |  j ƒ D] } | ^ q`t d d d d ƒ g ƒ d S(   s9   Test the extractor's behaviour among different situationsR4   R5   R   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2s   http://example.com/sample3.htmlu   sample 3 textR7   R8   R9   s
   google.coms   http://www.google.com/somethingt   deny_domainss   example.comN(   R0   R   R   R   R   (   R   R   R'   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt#   test_extraction_using_single_values¥   s    ((((c         C   s   d } t  d d | ƒ} |  j ƒ  } |  j | j | ƒ t d d d d ƒ t d d d d	 ƒ t d d
 d d d t ƒ t d d d d ƒ g ƒ d S(   s<   Test the extractor's behaviour for links with rel="nofollow"sÙ  <html><head><title>Page title<title>
        <body>
        <div class='links'>
        <p><a href="/about.html">About us</a></p>
        </div>
        <div>
        <p><a href="/follow.html">Follow this link</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
        </div>
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
        </body></html>s&   http://example.org/somepage/index.htmlR
   R   s   http://example.org/about.htmlR   u   About uss   http://example.org/follow.htmlu   Follow this links    http://example.org/nofollow.htmlu   Dont follow this oneR%   s!   http://example.org/nofollow2.htmlu   Choose to follow or notN(   R   R0   R   R   R   R!   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_nofollow¿   s    c      	   C   s¨  d } d } |  j  d d ƒ } |  j | j | ƒ t ƒ |  j | j | ƒ t ƒ |  j  d d ƒ } |  j | j | ƒ t ƒ |  j | j | ƒ t ƒ |  j  d d ƒ } |  j | j | ƒ t ƒ |  j | j | ƒ t ƒ |  j  d	 d ƒ } |  j | j | ƒ t ƒ |  j | j | ƒ t ƒ |  j  d d d d d d d	 d ƒ } |  j | j d ƒ t ƒ |  j | j d ƒ t ƒ |  j | j d ƒ t ƒ |  j | j d ƒ t ƒ d  S(   Ns#   http://lotsofstuff.com/stuff1/indexs(   http://evenmorestuff.com/uglystuff/indexR4   t   stuff1R7   t	   uglystuffR9   s   evenmorestuff.comR;   s   lotsofstuff.comt   blah1t   blah2s	   blah1.coms	   blah2.coms   http://blah1.com/blah1s   http://blah1.com/blah2s   http://blah2.com/blah1s   http://blah2.com/blah2(   R>   (   R?   (   s   evenmorestuff.com(   s   lotsofstuff.com(   R@   (   RA   (   s	   blah1.com(   s	   blah2.com(   R0   R   R    R!   R&   (   R   R"   R#   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR$   Û   s*    	c         C   si   |  j  d d	 ƒ } |  j g  | j |  j ƒ D] } | ^ q+ t d d d d ƒ t d d d d ƒ g ƒ d  S(
   Nt   restrict_xpathss   //div[@id="subwrapper"]R   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2(   s   //div[@id="subwrapper"](   R0   R   R   R   R   (   R   R   R'   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_restrict_xpaths÷   s    (c         C   s_   d } t  d d | d d ƒ} |  j d d ƒ } |  j | j | ƒ t d d	 d
 d ƒ g ƒ d S(   s#   Test restrict_xpaths with encodingss5  <html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <div class='links'>
        <p><a href="/about.html">About us£</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html">This shouldn't be followed</a></p>
        </div>
        </body></html>s&   http://example.org/somepage/index.htmlR
   R   s   windows-1252RB   s   //div[@class='links']R   s   http://example.org/about.htmlR   u
   About usÂ£N(   R   R0   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_restrict_xpaths_encodingþ   s
    
c         C   s\   d } t  d d | d d ƒ} t d d ƒ j | ƒ } |  j | t d d	 d
 d ƒ g ƒ d  S(   NsJ   <html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>s&   http://example.org/somepage/index.htmlR
   R   s
   iso8859-15RB   s   //pR   s,   http://example.org/%E2%99%A5/you?c=%E2%82%ACR   u   text(   R   R   R   R   R   (   R   R   R   t   links(    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt'   test_restrict_xpaths_with_html_entities  s
    	c         C   sk   d } t  d d | d d ƒ} |  j d d ƒ } |  j | j | ƒ t d d	 d
 d d d d t ƒ g ƒ d S(   s=   html entities cause SGMLParser to call handle_data hook twices>   <html><body><div><a href="/foo">&gt;¾©&lt;¶«</a></body></html>s   http://example.orgR
   R   t   gb18030RB   s   //divR   s   http://example.org/fooR   u   >äº¬<ä¸œt   fragmentR   R%   N(   R   R0   R   R   R   R&   (   R   R
   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt*   test_restrict_xpaths_concat_in_handle_data  s    c         C   sD   |  j  d d ƒ } |  j | j |  j ƒ t d d d d ƒ g ƒ d  S(   Nt   restrict_csss   #subwrapper aR   s   http://example.com/sample2.htmlR   u   sample 2(   s   #subwrapper a(   R0   R   R   R   R   (   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_restrict_css  s    c      	   C   s   |  j  d d d d ƒ } |  j g  | j |  j ƒ D] } | ^ q1 t d d d d ƒ t d d	 d d
 ƒ t d d d d ƒ g ƒ d  S(   NRB   s   //div[@id="subwrapper"]RJ   s   #subwrapper + aR   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2s   http://example.com/sample3.htmlu   sample 3 text(   s   //div[@id="subwrapper"](   s   #subwrapper + a(   R0   R   R   R   R   (   R   R   R'   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt.   test_restrict_css_and_restrict_xpaths_together%  s    	(c         C   sŒ   d } t  d d | d d ƒ} |  j ƒ  } | j | ƒ | j | ƒ | j | ƒ |  j | j | ƒ t d d d d	 d
 d d t ƒ g ƒ d  S(   NsM   <html><body>¾©<map><area href="http://example.org/foo" /></map></body></html>s   http://example.orgR
   R   s   utf-8R   s   http://example.org/fooR   u    RH   R   R%   (   R   R0   R   R   R   R&   (   R   R
   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt"   test_area_tag_with_unicode_present.  s    c         C   se   d } t  d d | d d ƒ} |  j ƒ  } |  j | j | ƒ t d d d d	 d
 d d t ƒ g ƒ d  S(   Ns9   <html><body><div><a href="?page=2">BinB</a></body></html>s   http://known.fm/AC%2FDC/R
   R   t   utf8R   s   http://known.fm/AC%2FDC/?page=2R   u   BinBRH   R   R%   (   R   R0   R   R   R   R&   (   R   R
   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_encoded_url9  s
    c         C   sk   d } t  d d | d d ƒ} |  j d d ƒ } |  j | j | ƒ t d d	 d
 d d d d t ƒ g ƒ d  S(   Ns9   <html><body><div><a href="?page=2">BinB</a></body></html>s   http://known.fm/AC%2FDC/R
   R   RN   RB   s   //divR   s   http://known.fm/AC%2FDC/?page=2R   u   BinBRH   R   R%   (   R   R0   R   R   R   R&   (   R   R
   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt$   test_encoded_url_in_restricted_xpathA  s
    c         C   s   d } t  d d | ƒ} |  j ƒ  } |  j | j | ƒ t d d d d ƒ g ƒ t d d	 ƒ } |  j | j | ƒ t d d d d ƒ g ƒ d  S(
   Ns4   <a href="page.html">asd</a> and <a href="photo.jpg">s   http://example.org/R
   R   s   http://example.org/page.htmlR   u   asdt   deny_extensionst   jpg(   R   R0   R   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_deny_extensionsI  s    c         C   sh   d } t  d d | d d ƒ} d „  } |  j d | ƒ } |  j | j | ƒ t d d	 d
 d ƒ g ƒ d S(   s#   Test restrict_xpaths with encodingss¶   
        <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
        <a href="/about.html">About us</a>
        s&   http://example.org/somepage/index.htmlR
   R   s   windows-1252c         S   s)   t  j d |  ƒ } | r% | j d ƒ Sd  S(   Ns   javascript:goToPage\('(.*?)'i   (   t   ret   searcht   group(   t   valuet   m(    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   process_value^  s    RY   R   s"   http://example.org/other/page.htmlR   s	   Link textN(   R   R0   R   R   R   (   R   R   R   RY   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_process_valueV  s    	c         C   sY   d } t  d d | ƒ} |  j d d ƒ } |  j | j | ƒ t d d d d	 ƒ g ƒ d  S(
   NsŸ   <html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>s&   http://example.org/somepage/index.htmlR
   RB   s   //pR   s(   http://otherdomain.com/base/item/12.htmlR   s   Item 12(   R   R0   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt"   test_base_url_with_restrict_xpathsg  s
    c         C   s²  |  j  d d ƒ } |  j | j |  j ƒ t d d d d ƒ t d d d d ƒ t d d	 d d
 ƒ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d d d d d ƒ } |  j | j |  j ƒ t d d d d ƒ t d d d d ƒ t d d d d ƒ t d d	 d d
 ƒ t d d d d ƒ t d d d d ƒ g ƒ |  j  d d  ƒ } |  j | j |  j ƒ g  ƒ d } t d d | ƒ} t d d ƒ } |  j | j | ƒ t d d d d ƒ g ƒ d  S(   Nt   attrst   hrefR   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2s   http://example.com/sample3.htmlu   sample 3 texts   http://www.google.com/somethings    http://example.com/innertag.htmlu	   inner tagt   srct   tagst   at   areat   imgRQ   s   http://example.com/sample2.jpgsU   <html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>s   http://example.com/index.htmlR
   (   s   hrefs   src(   R`   s   areas   img(    (   R0   R   R   R   R   t   NoneR   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt
   test_attrsp  s,    c         C   se  d } t  d d | ƒ} |  j d d  ƒ } |  j | j | ƒ g  ƒ |  j ƒ  } |  j | j | ƒ t d d d d ƒ t d d	 d d
 ƒ g ƒ |  j d d ƒ } |  j | j | ƒ t d d d d ƒ g ƒ |  j d d ƒ } |  j | j | ƒ t d d	 d d
 ƒ g ƒ |  j d d d d d d ƒ } |  j | j | ƒ t d d	 d d
 ƒ t d d d d ƒ g ƒ d  S(   Nsi   <html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>s   http://example.com/index.htmlR
   R_   R   s   http://example.com/sample1.htmlR   u    s   http://example.com/sample2.htmlu   sample 2Ra   R`   Rb   R\   R]   R^   RQ   s   http://example.com/sample2.jpg(   R`   s   img(   s   hrefs   src(    (   R   R0   Rc   R   R   R   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt	   test_tagsŽ  s$    c         C   sö   d } t  d d | ƒ} |  j d d d d ƒ } |  j | j | ƒ t d d	 d
 d d d d t ƒ t d d d
 d d d d t ƒ g ƒ |  j d d d d ƒ } |  j | j | ƒ t d d	 d
 d d d d t ƒ t d d d
 d d d d t ƒ g ƒ d  S(   NsÇ   
        <html><body>
        <div id="item1" data-url="get?id=1"><a href="#">Item 1</a></div>
        <div id="item2" data-url="get?id=2"><a href="#">Item 2</a></div>
        </body></html>
        s   http://example.com/index.htmlR
   R_   t   divR\   s   data-urlR   s   http://example.com/get?id=1R   u   Item 1RH   R   R%   s   http://example.com/get?id=2u   Item 2(   s   div(   s   data-url(   R   R0   R   R   R   R&   (   R   R   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   test_tags_attrs«  s    %c         C   sh  d } t  d d | ƒ} |  j ƒ  } |  j | j | ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ g ƒ t d d | ƒ} |  j ƒ  } |  j | j | ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ t d d d d d d	 d
 t ƒ g ƒ d  S(   Ns‚  
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <title>XHTML document title</title>
</head>
<body>
    <div class='links'>
    <p><a href="/about.html">About us</a></p>
    </div>
    <div>
    <p><a href="/follow.html">Follow this link</a></p>
    </div>
    <div>
    <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
    </div>
    <div>
    <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
    </div>
</body>
</html>
        s   http://example.com/index.xhtmlR
   R   s   http://example.com/about.htmlR   u   About usRH   R   R%   s   http://example.com/follow.htmlu   Follow this links    http://example.com/nofollow.htmlu   Dont follow this ones!   http://example.com/nofollow2.htmlu   Choose to follow or not(   R   R0   R   R   R   R&   R!   R   (   R   t   xhtmlR   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt
   test_xhtmlÀ  s    %(   R)   R*   R   R0   R,   R3   R:   R<   R=   R$   RC   RD   RF   RI   RK   RL   RM   RO   RP   RS   RZ   R[   Rd   Re   Rg   Ri   (    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR+   l   s.   			-																					t   LxmlLinkExtractorTestCasec           B   s   e  Z e Z RS(    (   R)   R*   R   R0   (    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyRj   ï  s   t   HtmlParserLinkExtractorTestCasec           B   s   e  Z d  „  Z d „  Z RS(   c         C   s+   t  d d ƒ } t d d d | ƒ |  _ d  S(   NR   s   sgml_linkextractor.htmlR   s   http://example.com/indexR
   (   R   R   R   (   R   R
   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR,   õ  s    c         C   sƒ   t  ƒ  } |  j | j |  j ƒ t d d d d ƒ t d d d d ƒ t d d d d ƒ t d d d d	 ƒ t d d
 d d ƒ g ƒ d  S(   NR   s   http://example.com/sample2.htmlR   u   sample 2s   http://example.com/sample3.htmlu   sample 3 textu   sample 3 repetitions   http://www.google.com/somethingu    s    http://example.com/innertag.htmlu	   inner tag(   R   R   R   R   R   (   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR:   ù  s    	(   R)   R*   R,   R:   (    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyRk   ó  s   	t   RegexLinkExtractorTestCasec           B   s   e  Z d  „  Z d „  Z RS(   c         C   s+   t  d d ƒ } t d d d | ƒ |  _ d  S(   NR   s   sgml_linkextractor.htmlR   s   http://example.com/indexR
   (   R   R   R   (   R   R
   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR,     s    c      
   C   sq   t  ƒ  } |  j | j |  j ƒ t d d d d ƒ t d d d d ƒ t d d d d ƒ t d d	 d d
 ƒ g ƒ d  S(   NR   s   http://example.com/sample2.htmlR   u   sample 2s   http://example.com/sample3.htmlu   sample 3 texts   http://www.google.com/somethingu    s    http://example.com/innertag.htmlu	   inner tag(   R    R   R   R   R   (   R   R   (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyR:   
  s    	(   R)   R*   R,   R:   (    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyRl     s   	t   __main__(   RT   t   unittestt   scrapy.linkextractors.regexR    t   scrapy.httpR   R   t   scrapy.linkR   t    scrapy.linkextractors.htmlparserR   t   scrapy.linkextractors.sgmlR   R   t   scrapy.linkextractors.lxmlhtmlR   t   testsR   t   TestCaseR	   R+   Rj   Rk   Rl   R)   t   main(    (    (    s=   /home/travis/build/scrapy/scrapy/tests/test_linkextractors.pyt   <module>   s    `ÿ „