File: test_readable.py

Package: breadability 0.1.20-3
# -*- coding: utf8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from breadability._compat import to_unicode
from breadability.readable import (
    Article,
    get_class_weight,
    get_link_density,
    is_bad_link,
    leaf_div_elements_into_paragraphs,
    score_candidates,
)
from breadability.scoring import ScoredNode
from .compat import unittest
from .utils import load_snippet, load_article


class TestReadableDocument(unittest.TestCase):
    """Verify we can process html into a document to work off of."""

    def test_load_doc(self):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        # We get back the document as a div tag currently by default.
        self.assertEqual(doc.readable_dom.tag, 'div')

    def test_title_loads(self):
        """Verify we can fetch the title of the parsed article"""
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(
            doc._original_document.title,
            'Min Document Title'
        )

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
        doc = Article(load_snippet('document_scripts.html'))
        readable = doc.readable_dom
        self.assertEqual(readable.findall(".//script"), [])
        self.assertEqual(readable.findall(".//style"), [])
        self.assertEqual(readable.findall(".//link"), [])

    def test_find_body_exists(self):
        """If the document has a body, we store that as the readable html

        No sense processing anything other than the body content.

        """
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')

    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.

        We build our doc around the rest of the html we parsed.

        """
        doc = Article(load_snippet('document_no_body.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')

    def test_bare_content(self):
        """If the document is just pure content, no html tags we should be ok

        We build our doc around the rest of the html we parsed.

        """
        doc = Article(load_snippet('document_only_content.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')

    def test_no_content(self):
        """Without content we supply an empty unparsed doc."""
        doc = Article('')
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
        self.assertEqual(doc.readable_dom.get('class'), 'parsing-error')


class TestCleaning(unittest.TestCase):
    """Test out our cleaning processing we do."""

    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list."""
        doc = Article(load_snippet('test_readable_unlikely.html'))
        readable = doc.readable_dom
        must_not_appear = [
            'comment', 'community', 'disqus', 'extra', 'foot',
            'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
            'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
            'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']

        want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

        for i in must_not_appear:
            # ideally no class or id with this value survives the cleaning
            by_class = readable.find_class(i)

            for test in by_class:
                # if it's still here, it must also carry one of the
                # want-to-appear classes
                found = False
                for cls in test.get('class').split():
                    if cls in want_to_appear:
                        found = True
                self.assertTrue(found)

            by_ids = readable.get_element_by_id(i, False)
            if by_ids is not False:
                found = False
                for ids in by_ids.get('id').split():
                    if ids in want_to_appear:
                        found = True
                self.assertTrue(found)

    def test_misused_divs_transform(self):
        """Verify we replace leaf node divs with p's

        They should have the same content, just be a p vs a div

        """
        test_html = "<html><body><div>simple</div></body></html>"
        test_doc = document_fromstring(test_html)
        self.assertEqual(
            tounicode(
                leaf_div_elements_into_paragraphs(test_doc)),
            to_unicode("<html><body><p>simple</p></body></html>")
        )

        test_html2 = ('<html><body><div>simple<a href="">link</a>'
                      '</div></body></html>')
        test_doc2 = document_fromstring(test_html2)
        self.assertEqual(
            tounicode(
                leaf_div_elements_into_paragraphs(test_doc2)),
            to_unicode(
                '<html><body><p>simple<a href="">link</a></p></body></html>')
        )

    def test_dont_transform_div_with_div(self):
        """Verify that only child <div> element is replaced by <p>."""
        dom = document_fromstring(
            "<html><body><div>text<div>child</div>"
            "aftertext</div></body></html>"
        )

        self.assertEqual(
            tounicode(
                leaf_div_elements_into_paragraphs(dom)),
            to_unicode(
                "<html><body><div>text<p>child</p>"
                "aftertext</div></body></html>"
            )
        )

    def test_bad_links(self):
        """Some links should just not belong."""
        bad_links = [
            '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
            '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
            '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
        ]

        for l in bad_links:
            link = fragment_fromstring(l)
            self.assertTrue(is_bad_link(link))


class TestCandidateNodes(unittest.TestCase):
    """Candidate nodes are scoring containers we use."""

    def test_candidate_scores(self):
        """We should be getting back objects with some scores."""
        fives = ['<div/>']
        threes = ['<pre/>', '<td/>', '<blockquote/>']
        neg_threes = ['<address/>', '<ol/>']
        neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']

        for n in fives:
            doc = fragment_fromstring(n)
            self.assertEqual(ScoredNode(doc).content_score, 5)

        for n in threes:
            doc = fragment_fromstring(n)
            self.assertEqual(ScoredNode(doc).content_score, 3)

        for n in neg_threes:
            doc = fragment_fromstring(n)
            self.assertEqual(ScoredNode(doc).content_score, -3)

        for n in neg_fives:
            doc = fragment_fromstring(n)
            self.assertEqual(ScoredNode(doc).content_score, -5)

    def test_article_enables_candidate_access(self):
        """Candidates are accessible after document processing."""
        doc = Article(load_article('ars.001.html'))
        self.assertTrue(hasattr(doc, 'candidates'))


class TestClassWeights(unittest.TestCase):
    """Certain ids and classes get us bonus points."""

    def test_positive_class(self):
        """Some classes get us bonus points."""
        node = fragment_fromstring('<p class="article">')
        self.assertEqual(get_class_weight(node), 25)

    def test_positive_ids(self):
        """Some ids get us bonus points."""
        node = fragment_fromstring('<p id="content">')
        self.assertEqual(get_class_weight(node), 25)

    def test_negative_class(self):
        """Some classes get us negative points."""
        node = fragment_fromstring('<p class="comment">')
        self.assertEqual(get_class_weight(node), -25)

    def test_negative_ids(self):
        """Some ids get us negative points."""
        node = fragment_fromstring('<p id="media">')
        self.assertEqual(get_class_weight(node), -25)


class TestScoringNodes(unittest.TestCase):
    """We take out list of potential nodes and score them up."""

    def test_we_get_candidates(self):
        """Processing candidates should get us a list of nodes to try out."""
        doc = document_fromstring(load_article("ars.001.html"))
        test_nodes = tuple(doc.iter("p", "td", "pre"))
        candidates = score_candidates(test_nodes)

        # this might change as we tweak our algorithm, but if it does,
        # it signifies we need to look at what we changed.
        self.assertEqual(len(candidates.keys()), 37)

        # one of these should have a decent score
        scores = sorted(c.content_score for c in candidates.values())
        self.assertTrue(scores[-1] > 100)

    def test_bonus_score_per_100_chars_in_p(self):
        """Nodes get 1 point per 100 characters up to max. 3 points."""
        def build_candidates(length):
            html = "<p>%s</p>" % ("c" * length)
            node = fragment_fromstring(html)

            return [node]

        test_nodes = build_candidates(50)
        candidates = score_candidates(test_nodes)
        pscore_50 = max(c.content_score for c in candidates.values())

        test_nodes = build_candidates(100)
        candidates = score_candidates(test_nodes)
        pscore_100 = max(c.content_score for c in candidates.values())

        test_nodes = build_candidates(300)
        candidates = score_candidates(test_nodes)
        pscore_300 = max(c.content_score for c in candidates.values())

        test_nodes = build_candidates(400)
        candidates = score_candidates(test_nodes)
        pscore_400 = max(c.content_score for c in candidates.values())

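        # Per the docstring above: 50 chars earn 0.5 points, 100 chars earn
        # 1 point, 300 chars reach the 3 point maximum, and 400 chars stay
        # capped at the same 3 points.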
        self.assertAlmostEqual(pscore_50 + 0.5, pscore_100)
        self.assertAlmostEqual(pscore_100 + 2.0, pscore_300)
        self.assertAlmostEqual(pscore_300, pscore_400)


class TestLinkDensityScoring(unittest.TestCase):
    """Link density will adjust out candidate scoresself."""

    def test_link_density(self):
        """Test that we get a link density"""
        doc = document_fromstring(load_article('ars.001.html'))
        for node in doc.iter('p', 'td', 'pre'):
            density = get_link_density(node)

            # the density must be between 0 and 1
            self.assertTrue(density >= 0.0 and density <= 1.0)


class TestSiblings(unittest.TestCase):
    """Siblings will be included if their content is related."""

    @unittest.skip("Not implemented yet.")
    def test_bad_siblings_not_counted(self):
        raise NotImplementedError()

    @unittest.skip("Not implemented yet.")
    def test_good_siblings_counted(self):
        raise NotImplementedError()


class TestMainText(unittest.TestCase):
    def test_empty(self):
        article = Article("")
        annotated_text = article.main_text

        self.assertEqual(annotated_text, [])

    def test_no_annotations(self):
        article = Article("<div><p>This is text with no annotations</p></div>")
        annotated_text = article.main_text

        self.assertEqual(annotated_text,
            [(("This is text with no annotations", None),)])

    def test_one_annotation(self):
        article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
        annotated_text = article.main_text

        expected = [(
            ("This is text\nwith", None),
            ("no", ("del",)),
            ("annotations", None),
        )]
        self.assertEqual(annotated_text, expected)

    def test_simple_snippet(self):
        snippet = Article(load_snippet("annotated_1.html"))
        annotated_text = snippet.main_text

        expected = [
            (
                ("Paragraph is more", None),
                ("better", ("em",)),
                (".\nThis text is very", None),
                ("pretty", ("strong",)),
                ("'cause she's girl.", None),
            ),
            (
                ("This is not", None),
                ("crap", ("big",)),
                ("so", None),
                ("readability", ("dfn",)),
                ("me :)", None),
            )
        ]
        self.assertEqual(annotated_text, expected)
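

# Optional convenience: run this module on its own via ``python -m`` (direct
# file execution won't work because of the relative imports above).
if __name__ == "__main__":
    unittest.main()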