# -*- coding: utf8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from collections import defaultdict

from breadability._compat import (
    to_unicode,
    to_bytes,
    unicode,
)
from breadability.document import (
    convert_breaks_to_paragraphs,
    decode_html,
    OriginalDocument,
)

from .compat import unittest
from .utils import load_snippet


# NOTE(review): the copy of this module under review had been collapsed onto
# a handful of lines and the HTML markup inside its string literals and
# docstrings had been stripped. The tags below were reconstructed from the
# test names, the surviving fragments and the line-break pattern left behind.
# Confirm every HTML literal against the project's version control history
# before relying on these tests.
class TestOriginalDocument(unittest.TestCase):
    """Verify we can process html into a document to work off of."""

    def test_convert_br_tags_to_paragraphs(self):
        """Runs of <br> tags are turned into paragraph boundaries."""
        returned = convert_breaks_to_paragraphs(
            ("<div>HI<br><br>How are you?<br><br> \t \n<br>"
             "Fine\n I guess</div>"))

        # NOTE(review): expected value assumes each run of breaks is replaced
        # by "</p><p>" and that all text is kept -- confirm.
        self.assertEqual(
            returned,
            "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")

    def test_convert_hr_tags_to_paragraphs(self):
        """An <hr> tag is turned into a paragraph boundary as well."""
        returned = convert_breaks_to_paragraphs(
            "<div>HI<br><br>How are you?<hr/> \t \n<br>Fine\n I guess</div>")

        # NOTE(review): same assumption as in the <br> test above -- confirm.
        self.assertEqual(
            returned,
            "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")

    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))

        # NOTE(review): prefix reconstructed; an empty prefix would make this
        # assertion vacuously true.
        self.assertTrue(to_unicode(doc).startswith('<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(to_unicode(doc).startswith('<html>'))

        # find the links on the page and make sure each one starts with our
        # base url we told it to use.
        links = doc.links
        self.assertEqual(len(links), 3)

        # we should have two links that start with our blog url
        # and one link that starts with amazon
        link_counts = defaultdict(int)
        for link in links:
            if link.get('href').startswith('http://blog.mitechie.com'):
                link_counts['blog'] += 1
            else:
                link_counts['other'] += 1

        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)

    def test_no_br_allowed(self):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.dom.find('.//br'))

    def test_empty_title(self):
        """A document whose <title> is empty has an empty title."""
        document = OriginalDocument(
            "<html><head><title></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_title_only_with_tags(self):
        """A <title> holding only empty markup gives an empty title."""
        document = OriginalDocument(
            "<html><head><title><em></em></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_no_title(self):
        """A document without any <title> element has an empty title."""
        document = OriginalDocument("<html><head></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_encoding(self):
        """decode_html returns unicode text for ISO-8859-2 encoded bytes."""
        text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
        html = decode_html(text)
        self.assertEqual(type(html), unicode)

    def test_encoding_short(self):
        """decode_html round-trips a short byte string back to the text."""
        text = to_bytes("ľščťžýáíé")
        html = decode_html(text)
        self.assertEqual(type(html), unicode)
        self.assertEqual(html, "ľščťžýáíé")