File: 06-feed.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,672 kB
  • sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (33 lines) | stat: -rw-r--r-- 1,209 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.db  import date

# This example reads a given RSS or Atom newsfeed channel.
# Some example feeds to try out:
NATURE  = "http://feeds.nature.com/nature/rss/current"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
NYT     = "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.text) # Remove HTML formatting.
    print result.url
    print result.date
    print

# News item URL's lead to the page with the full article.
# This page can have any kind of formatting.
# There is no default way to read it.
# But we could just download the source HTML and convert it to plain text:

#html = URL(result.url).download()
#print plaintext(html)

# The resulting text may contain a lot of garbage.
# A better way is to use a DOM parser to select the HTML elements we want.
# This is demonstrated in one of the next examples.