File: feed_parser.py

package info (click to toggle)
gnome-feeds 2.2.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 1,520 kB
  • sloc: python: 5,369; sh: 93; xml: 28; makefile: 2
file content (127 lines) | stat: -rw-r--r-- 4,194 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from pathlib import Path
from typing import List, Optional
from gettext import gettext as _
from gfeeds.util.get_favicon import get_favicon
from os.path import isfile
from gfeeds.util.paths import THUMBS_CACHE_PATH
from gfeeds.util.sha import shasum
# from bs4 import UnicodeDammit  # TODO: reimplement it!
from syndom import Feed as SynDomFeed, FeedItem as SynDomFeedItem


class FeedParserRes:
    """Outcome of a feed parsing attempt: feed metadata and/or an error.

    The text fields ``rss_link``, ``title``, ``link`` and ``description``
    are normalised so that a missing (``None``) value is stored as ``''``.
    ``image_url`` and ``favicon_path`` are stored as given and may be
    ``None``.
    """

    def __init__(
        self,
        is_null: bool = False,
        error: Optional[str] = None,
        # syndom types are only needed by type checkers, hence the string
        # (forward reference) annotations
        sd_feed: 'Optional[SynDomFeed]' = None,
        rss_link: Optional[str] = None,
        title: Optional[str] = None,
        link: Optional[str] = None,
        description: Optional[str] = None,
        image_url: Optional[str] = None,
        favicon_path: Optional[str] = None,
        raw_entries: 'Optional[List[SynDomFeedItem]]' = None
    ):
        """Store the parsing outcome.

        :param is_null: flag marking a result that carries no feed data
        :param error: human readable error message, if any
        :param sd_feed: the underlying syndom feed object, if any
        :param rss_link: URL of the feed itself
        :param title: feed title
        :param link: URL of the website the feed belongs to
        :param description: feed description
        :param image_url: URL of the feed image
        :param favicon_path: path of the cached favicon file
        :param raw_entries: feed items; when omitted, a new empty list is
            created for this instance
        """
        self.is_null = is_null
        self.error = error
        self.sd_feed = sd_feed
        self.rss_link = rss_link or ''
        self.title = title or ''
        self.link = link or ''
        self.description = description or ''
        self.image_url = image_url
        self.favicon_path = favicon_path
        # A `[]` default in the signature would be one single list shared by
        # every instance created without `raw_entries`; build a fresh list
        # per instance instead. A list passed by the caller is kept as is.
        self.raw_entries = raw_entries if raw_entries is not None else []

    @property
    def feed_identifier(self) -> str:
        """Return ``rss_link`` concatenated with ``title``.

        Both values must be non-empty, otherwise an ``AssertionError`` is
        raised (only when assertions are enabled, i.e. not under ``-O``).
        """
        assert self.rss_link, 'feed_identifier requires a non-empty rss_link'
        assert self.title, 'feed_identifier requires a non-empty title'
        return self.rss_link + self.title


def _cache_favicon(
        rss_link: str,
        link: Optional[str],
        image_url: Optional[str],
        raw_entries: list
):
    """Make sure a favicon for the feed is present in the thumbs cache.

    The cache file name is derived from ``rss_link``. If the file does not
    exist yet, the sources are tried in this order: the feed image
    (``image_url``), the feed URL (``rss_link``), then the website ``link``
    or, when that is empty, the URI of the first feed item.

    :return: a ``(favicon_path, image_url)`` tuple. ``favicon_path`` is
        ``None`` when looking up a favicon failed; ``image_url`` is reset
        to ``None`` when downloading it raised an error.
    """
    favicon_path = str(THUMBS_CACHE_PATH.joinpath(
        shasum(rss_link+'v2')+'.png'
    ))
    if isfile(favicon_path):
        # already cached, nothing to download
        return favicon_path, image_url
    if image_url:
        try:
            get_favicon(image_url, favicon_path, direct=True)
        except Exception:
            print('Invalid image url for feed `{0}` ({1})'.format(
                rss_link, image_url
            ))
            # forget the broken image so that the fallbacks below are used
            image_url = None
    if image_url:
        return favicon_path, image_url
    try:
        get_favicon(rss_link, favicon_path)
        if isfile(favicon_path):
            return favicon_path, image_url
        if not link and not raw_entries:
            # no website link and no item to borrow a URI from: report it
            # explicitly rather than relying on an IndexError being caught
            print(f'No favicon for feed `{rss_link}`')
            return None, image_url
        get_favicon(link or raw_entries[0].uri, favicon_path)
    except Exception:
        print(f'No favicon for feed `{rss_link}`')
        return None, image_url
    return favicon_path, image_url


def parse_feed(
        feedpath: Optional[Path],
        rss_link_: Optional[str] = None,
        failed: bool = False,
        error: Optional[str] = None
) -> FeedParserRes:
    """Parse the feed file at ``feedpath`` into a ``FeedParserRes``.

    :param feedpath: path of the feed file; it is handed to syndom as a
        string
    :param rss_link_: URL of the feed; when empty, the URL reported by the
        parsed feed is used instead
    :param failed: when true nothing is parsed: ``error`` is printed and a
        null result carrying it is returned
    :param error: error message used together with ``failed``
    :return: the parsed feed data, or a result with ``error`` set when the
        feed could not be parsed or does not look like a feed
    """
    if failed:
        print(error)
        return FeedParserRes(is_null=True, error=(error or '<NULL ERROR>'))
    try:
        sd_feed = SynDomFeed(str(feedpath))
    except Exception:
        # NOTE(review): the message mentions an HTML fallback, but this
        # function performs none; it returns an error result right away.
        print('Error parsing feed (caught); will try extracting from HTML')
        return FeedParserRes(
            is_null=True, error=_(
                'Errors while parsing feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )
    try:
        title = sd_feed.get_title()
        raw_entries = sd_feed.get_items()
        link = sd_feed.get_url()
        rss_link = rss_link_ or sd_feed.get_rss_url()
        image_url = sd_feed.get_img_url()
        description = sd_feed.get_description()
    except UnicodeDecodeError:
        return FeedParserRes(
            is_null=True,
            error=_(
                'Error decoding unicode data from feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )
    except Exception:
        return FeedParserRes(
            is_null=True,
            error=_(
                'Error extracting data from feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )
    if not title and not raw_entries:
        # if these conditions are met, there's reason to believe
        # this is not an rss/atom feed
        # NOTE(review): unlike the other error results, this one keeps
        # is_null=False; confirm that callers rely on it.
        return FeedParserRes(
            is_null=False, error=_(
                '`{0}` may not be an RSS or Atom feed'
            ).format(rss_link_)
        )
    if not title:
        # untitled feed: fall back to its URL as the title
        title = rss_link
    favicon_path, image_url = _cache_favicon(
        rss_link, link, image_url, raw_entries
    )
    return FeedParserRes(
        is_null=False, error=None, sd_feed=sd_feed,
        rss_link=rss_link,
        title=title,
        link=link,
        description=description,
        image_url=image_url,
        favicon_path=favicon_path,
        raw_entries=raw_entries
    )