File: test_article.py

package info (click to toggle)
breadability 0.1.20-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 316 kB
  • ctags: 282
  • sloc: python: 1,713; makefile: 18
file content (74 lines) | stat: -rw-r--r-- 2,409 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf8 -*-

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals
)

import os

from operator import attrgetter
from breadability.readable import Article
from breadability.readable import check_siblings
from breadability.readable import prep_article
from ...compat import unittest


class TestArticle(unittest.TestCase):
    """Test the scoring and parsing of the Article"""

    def setUp(self):
        """Load up the article for us"""
        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
        self.article = open(article_path).read()

    def tearDown(self):
        """Drop the article"""
        self.article = None

    def test_parses(self):
        """Verify we can parse the document."""
        doc = Article(self.article)
        self.assertTrue('id="readabilityBody"' in doc.readable)

    def test_content_exists(self):
        """Verify that some content exists."""
        doc = Article(self.article)
        self.assertTrue('Amazon and Google' in doc.readable)
        self.assertFalse('Linkblog updated' in doc.readable)
        self.assertFalse(
            '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)

    @unittest.skip("Test fails because of some weird hash.")
    def test_candidates(self):
        """Verify we have candidates."""
        doc = Article(self.article)
        # from lxml.etree import tounicode
        found = False
        wanted_hash = '04e46055'

        for node in doc.candidates.values():
            if node.hash_id == wanted_hash:
                found = node

        self.assertTrue(found)

        # we have the right node, it must be deleted for some reason if it's
        # not still there when we need it to be.
        # Make sure it's not in our to drop list.
        for node in doc._should_drop:
            self.assertFalse(node == found.node)

        by_score = sorted(
            [c for c in doc.candidates.values()],
            key=attrgetter('content_score'), reverse=True)
        self.assertTrue(by_score[0].node == found.node)

        updated_winner = check_siblings(by_score[0], doc.candidates)
        updated_winner.node = prep_article(updated_winner.node)

        # This article hits up against the img > p conditional filtering
        # because of the many .gif images in the content. We've removed that
        # rule.