File: test_html.doctest

package info (click to toggle)
python-mechanize 1%3A0.2.5-3
links: PTS, VCS
area: main
in suites: buster, jessie, jessie-kfreebsd, stretch, wheezy
size: 2,056 kB
ctags: 3,377
sloc: python: 23,140; makefile: 4
file content (259 lines) | stat: -rw-r--r-- 6,287 bytes
>>> import mechanize
>>> from mechanize._response import test_html_response
>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
... MechanizeBs, \
... RobustLinksFactory,  RobustFormsFactory, RobustTitleFactory

mechanize.ParseError should be raised on parsing erroneous HTML.

For backwards compatibility, mechanize.ParseError derives from
exception classes that mechanize used to raise, prior to version
0.1.6.

>>> import sgmllib
>>> import HTMLParser
>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
True
>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
True

>>> def create_response(error=True):
...     """Build a test response for a small HTML page.
...
...     With error true (the default) the markup "<!!!>" is placed inside
...     <head>; the examples below expect parsing such a page to raise
...     ParseError.  With error false nothing is inserted there.
...     """
...     template = """\
... <html>
... <head>
...     <title>Title</title>
...     %s
... </head>
... <body>
...     <p>Hello world
... </body>
... </html>
... """
...     if error:
...         bad_markup = "<!!!>"
...     else:
...         bad_markup = ""
...     return test_html_response(template % bad_markup)

>>> # Each factory is handed the erroneous page.  set_response() itself is
>>> # not expected to raise here; ParseError surfaces only once the links,
>>> # forms or title are actually requested.
>>> f = LinksFactory()
>>> f.set_response(create_response(), "http://example.com", "latin-1")
>>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> f = FormsFactory()
>>> f.set_response(create_response(), "latin-1")
>>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> f = TitleFactory()
>>> f.set_response(create_response(), "latin-1")
>>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:


Accessing attributes on a Factory may also raise ParseError.

>>> def factory_getattr(attr_name):
...    fact = mechanize.DefaultFactory()
...    fact.set_response(create_response())
...    getattr(fact, attr_name)
>>> factory_getattr("title")  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> factory_getattr("global_form")  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:


BeautifulSoup ParseErrors:

XXX If I could come up with examples that break links and forms
parsing, I'd uncomment these!

>>> def create_soup(html):
...     r = test_html_response(html)
...     return MechanizeBs("latin-1", r.read())

#>>> f = RobustLinksFactory()
#>>> html = """\
#... <a href="a">
#... <frame src="b">
#... <a href="c">
#... <iframe src="d">
#... </a>
#... </area>
#... </frame>
#... """
#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
#>>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:

#>>> html = """\
#... <table>
#... <tr><td>
#... <input name='broken'>
#... </td>
#... </form>
#... </tr>
#... </form>
#... """
#>>> f = RobustFormsFactory()
#>>> f.set_response(test_html_response(html), "latin-1")
#>>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:

#>>> f = RobustTitleFactory()
#>>> f.set_soup(create_soup(""), "latin-1")
#>>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:



Utility class for caching forms etc.

>>> from mechanize._html import CachingGeneratorFunction

>>> # Calling the wrapper a second time yields the same items as the first.
>>> i = [1]
>>> func = CachingGeneratorFunction(i)
>>> list(func())
[1]
>>> list(func())
[1]

>>> i = [1, 2, 3]
>>> func = CachingGeneratorFunction(i)
>>> list(func())
[1, 2, 3]

>>> # Items can also be pulled one at a time from the iterator a call returns.
>>> i = func()
>>> i.next()
1
>>> i.next()
2
>>> i.next()
3

>>> # Iterators from separate calls advance independently, even when their
>>> # use is interleaved, and each raises StopIteration after the third item.
>>> i = func()
>>> j = func()
>>> i.next()
1
>>> j.next()
1
>>> i.next()
2
>>> j.next()
2
>>> j.next()
3
>>> i.next()
3
>>> i.next()
Traceback (most recent call last):
...
StopIteration
>>> j.next()
Traceback (most recent call last):
...
StopIteration


Link text parsing

>>> def get_first_link_text_bs(html):
...     factory = RobustLinksFactory()
...     soup = MechanizeBs("utf-8", html)
...     factory.set_soup(soup, "http://example.com/", "utf-8")
...     return list(factory.links())[0].text

>>> def get_first_link_text_sgmllib(html):
...     factory = LinksFactory()
...     response = test_html_response(html)
...     factory.set_response(response, "http://example.com/", "utf-8")
...     return list(factory.links())[0].text

Whitespace gets compressed down to single spaces.  Tags are removed.

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><a href="http://example.com/">The  quick\tbrown fox jumps
...   over the <i><b>lazy</b></i> dog </a>
... </body></html>
... """)
>>> get_first_link_text_bs(html)
'The quick brown fox jumps over the lazy dog'
>>> get_first_link_text_sgmllib(html)
'The quick brown fox jumps over the lazy dog'

Empty <a> links have empty link text

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><a href="http://example.com/"></a>
... </body></html>
... """)
>>> get_first_link_text_bs(html)
''
>>> get_first_link_text_sgmllib(html)
''

But for backwards compatibility, empty non-<a> links have None as their link text.

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><frame src="http://example.com/"></frame>
... </body></html>
... """)
>>> print get_first_link_text_bs(html)
None
>>> print get_first_link_text_sgmllib(html)
None


Title parsing.  We follow Firefox's behaviour with regard to child
elements (haven't tested IE).

>>> def get_title_bs(html):
...     factory = RobustTitleFactory()
...     soup = MechanizeBs("utf-8", html)
...     factory.set_soup(soup, "utf-8")
...     return factory.title()

>>> def get_title_sgmllib(html):
...     factory = TitleFactory()
...     response = test_html_response(html)
...     factory.set_response(response, "utf-8")
...     return factory.title()

>>> html = ("""\
... <html><head>
... <title>Title</title>
... </head><body><p>Blah.<p></body></html>
... """)
>>> get_title_bs(html)
'Title'
>>> get_title_sgmllib(html)
'Title'

>>> html = ("""\
... <html><head>
... <title>  Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
... tle &amp;&#38;
... </title>
... </head><body><p>Blah.<p></body></html>
... """)
>>> get_title_bs(html)
'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
>>> get_title_sgmllib(html)
'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'


A document that ends right after the opening <title> tag used to cause an
exception; its title is now the empty string.

>>> html = ("""\
... <html><head>
... <title>""")
>>> get_title_sgmllib(html)
''