1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
|
from pathlib import Path
from typing import List, Optional
from gettext import gettext as _
from gfeeds.util.get_favicon import get_favicon
from os.path import isfile
from gfeeds.util.paths import THUMBS_CACHE_PATH
from gfeeds.util.sha import shasum
# from bs4 import UnicodeDammit # TODO: reimplement it!
from syndom import Feed as SynDomFeed, FeedItem as SynDomFeedItem
class FeedParserRes:
    """Outcome of a feed parsing attempt.

    Holds either the metadata extracted from a feed or, when parsing did not
    succeed, an error description.

    Attributes:
        is_null: flag set by the producer of this result; the parser in this
            module sets it to True on the failure paths it reports.
        error: human-readable error message, or None.
        sd_feed: the underlying SynDom feed object, or None.
        rss_link: feed URL; stored as '' when not provided.
        title: feed title; stored as '' when not provided.
        link: website link; stored as '' when not provided.
        description: feed description; stored as '' when not provided.
        image_url: feed image URL, stored as given (may be None).
        favicon_path: path of the favicon file, stored as given (may be None).
        raw_entries: list of feed items; a new empty list when not provided.
    """

    def __init__(
            self,
            is_null: bool = False,
            error: Optional[str] = None,
            sd_feed: 'Optional[SynDomFeed]' = None,
            rss_link: Optional[str] = None,
            title: Optional[str] = None,
            link: Optional[str] = None,
            description: Optional[str] = None,
            image_url: Optional[str] = None,
            favicon_path: Optional[str] = None,
            raw_entries: 'Optional[List[SynDomFeedItem]]' = None
    ):
        self.is_null = is_null
        self.error = error
        self.sd_feed = sd_feed
        # Textual fields are normalised so that they are always strings.
        self.rss_link = rss_link or ''
        self.title = title or ''
        self.link = link or ''
        self.description = description or ''
        self.image_url = image_url
        self.favicon_path = favicon_path
        # A fresh list per instance: a literal `[]` default would be shared
        # by every result constructed without `raw_entries`.
        self.raw_entries = raw_entries if raw_entries is not None else []

    @property
    def feed_identifier(self) -> str:
        """Return `rss_link` concatenated with `title`.

        Raises:
            AssertionError: if `rss_link` or `title` is empty.
        """
        assert self.rss_link
        assert self.title
        return self.rss_link + self.title
def parse_feed(
        feedpath: Optional[Path],
        rss_link_: Optional[str] = None,
        failed: bool = False,
        error: Optional[str] = None
) -> FeedParserRes:
    """Parse the feed stored at `feedpath` and collect its metadata.

    Args:
        feedpath: location of the feed file; converted to `str` and handed
            to `SynDomFeed`.
        rss_link_: feed URL; when truthy it takes precedence over the URL
            reported by the parsed feed.
        failed: when True nothing is parsed: `error` is printed and a null
            result is returned.
        error: message used for the null result when `failed` is True.

    Returns:
        A `FeedParserRes`. Every failure path returns a result with
        `is_null=True` and an `error` message; on success `is_null` is False
        and the extracted fields are filled in.
    """
    if failed:
        print(error)
        return FeedParserRes(is_null=True, error=(error or '<NULL ERROR>'))

    sd_feed = None
    try:
        sd_feed = SynDomFeed(str(feedpath))
    except Exception:
        # NOTE(review): the message mentions an HTML fallback, but no such
        # fallback happens in this function - confirm against callers.
        print('Error parsing feed (caught); will try extracting from HTML')
    if sd_feed is None:
        return FeedParserRes(
            is_null=True, error=_(
                'Errors while parsing feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )

    try:
        title = sd_feed.get_title()
        raw_entries = sd_feed.get_items()
        link = sd_feed.get_url()
        rss_link = rss_link_ or sd_feed.get_rss_url()
        image_url = sd_feed.get_img_url()
        description = sd_feed.get_description()
    except UnicodeDecodeError:
        return FeedParserRes(
            is_null=True,
            error=_(
                'Error decoding unicode data from feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )
    except Exception:
        return FeedParserRes(
            is_null=True,
            error=_(
                'Error extracting data from feed `{0}`, URL: `{1}`'
            ).format(feedpath, rss_link_)
        )

    if not title and len(raw_entries) == 0:
        # if these conditions are met, there's reason to believe
        # this is not an rss/atom feed.
        # The result is flagged as null like every other failure path: it
        # carries no rss_link/title, so it must not be treated as a feed.
        return FeedParserRes(
            is_null=True, error=_(
                '`{0}` may not be an RSS or Atom feed'
            ).format(rss_link_)
        )

    if not title:
        # Untitled feed: use its URL as the title.
        title = rss_link

    # The cached icon file name is derived from the feed URL plus a 'v2'
    # suffix.
    favicon_path: Optional[str] = str(THUMBS_CACHE_PATH.joinpath(
        shasum(rss_link+'v2')+'.png'
    ))
    if not isfile(favicon_path):
        if image_url:
            # First choice: the image advertised by the feed itself.
            try:
                get_favicon(image_url, favicon_path, direct=True)
            except Exception:
                print('Invalid image url for feed `{0}` ({1})'.format(
                    rss_link, image_url
                ))
                # Drop the unusable image so the fallback below is tried.
                image_url = None
        if not image_url:
            try:
                get_favicon(rss_link, favicon_path)
                if not isfile(favicon_path):
                    # Nothing was written for the feed URL: retry with the
                    # website link or, lacking that, the first entry.
                    get_favicon(
                        link or raw_entries[0].uri,
                        favicon_path
                    )
            except Exception:
                print(f'No favicon for feed `{rss_link}`')
                favicon_path = None

    return FeedParserRes(
        is_null=False, error=None, sd_feed=sd_feed,
        rss_link=rss_link,
        title=title,
        link=link,
        description=description,
        image_url=image_url,
        favicon_path=favicon_path,
        raw_entries=raw_entries
    )
|