1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
"""
Purpose
-------
This fdw can be used to access items from an rss feed.
The column names are mapped to the elements inside an item.
An rss item has the following structure:
.. code-block:: xml
<item>
<title>Title</title>
<pubDate>2011-01-02</pubDate>
<link>http://example.com/test</link>
<guid>http://example.com/test</guid>
<description>Small description</description>
</item>
You can access every element by defining a column with the same name. Be
careful to match the case! Example: pubDate should be double-quoted like
this: ``"pubDate"`` to preserve the uppercased ``D``.
.. api_compat::
:read:
Dependencies
------------
You will need the `lxml`_ library.
.. _lxml: http://lxml.de/
Required options
-----------------
``url`` (string)
The RSS feed URL.
Usage Example
-------------
.. _Radicale: http://radicale.org
If you want to parse the `radicale`_ rss feed, you can use the following
definition:
.. code-block:: sql
CREATE SERVER rss_srv foreign data wrapper multicorn options (
wrapper 'multicorn.rssfdw.RssFdw'
);
CREATE FOREIGN TABLE radicalerss (
"pubDate" timestamp,
description character varying,
title character varying,
link character varying
) server rss_srv options (
url 'http://radicale.org/rss/'
);
select "pubDate", title, link from radicalerss limit 10;
.. code-block:: bash
pubDate | title | link
---------------------+----------------------------------+----------------------------------------------
2011-09-27 06:07:42 | Radicale 0.6.2 | http://radicale.org/news#2011-09-27@06:07:42
2011-08-28 13:20:46 | Radicale 0.6.1, Changes, Future | http://radicale.org/news#2011-08-28@13:20:46
2011-08-01 08:54:43 | Radicale 0.6 Released | http://radicale.org/news#2011-08-01@08:54:43
2011-07-02 20:13:29 | Feature Freeze for 0.6 | http://radicale.org/news#2011-07-02@20:13:29
2011-05-01 17:24:33 | Ready for WSGI | http://radicale.org/news#2011-05-01@17:24:33
2011-04-30 10:21:12 | Apple iCal Support | http://radicale.org/news#2011-04-30@10:21:12
2011-04-25 22:10:59 | Two Features and One New Roadmap | http://radicale.org/news#2011-04-25@22:10:59
2011-04-10 20:04:33 | New Features | http://radicale.org/news#2011-04-10@20:04:33
2011-04-02 12:11:37 | Radicale 0.5 Released | http://radicale.org/news#2011-04-02@12:11:37
2011-02-03 23:35:55 | Jabber Room and iPhone Support | http://radicale.org/news#2011-02-03@23:35:55
(10 rows)
"""
from . import ForeignDataWrapper
from datetime import datetime, timedelta
from lxml import etree
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
from logging import ERROR, WARNING
from multicorn.utils import log_to_postgres
import json
def element_to_dict(element):
    """
    Convert an lxml element into a plain dict that ``json.dumps`` accepts.

    The returned dict has exactly four keys:

    ``tag``
        Local name of the element (any namespace part is stripped).
    ``text``
        Text located before the first child, or ``''`` when there is none.
    ``attributes``
        A dict copy of the element attributes.
    ``children``
        The child elements, converted recursively with this function.

    Tail text (text that follows a child's closing tag) is not included.
    Comments and processing instructions found among the children are
    skipped: their ``tag`` is not a name, so it cannot be turned into a
    ``QName``.

    >>> element_to_dict(etree.fromstring('<t a1="v1"/>')) == {
    ...     'tag': 't', 'text': '', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring('<t a1="v1">Txt</t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring(
    ...     '<t>Txt<s1 a1="v1">Sub1</s1>Txt2<!-- note --></t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {},
    ...     'children': [{'tag': 's1', 'text': 'Sub1',
    ...                   'attributes': {'a1': 'v1'}, 'children': []}]}
    True
    """
    return {
        'tag': etree.QName(element.tag).localname,
        'text': element.text or '',
        'attributes': dict(element.attrib),
        # Plain iteration over an element also yields comments and
        # processing instructions; filtering on the Element factory keeps
        # only real elements, which are the only nodes QName can describe.
        'children': [element_to_dict(child)
                     for child in element.iterchildren(tag=etree.Element)],
    }
class RssFdw(ForeignDataWrapper):
    """An rss foreign data wrapper.

    The following options are accepted:

    url
        The rss feed url (required).
    cache_duration
        Optional number of seconds during which the rows of the last
        successful fetch are reused instead of downloading the feed again.
    default_namespace_prefix
        Optional prefix under which the document's default (unprefixed)
        namespace is made available to the xpath expressions.
    item_root
        Name of the element that represents one row (defaults to ``item``).

    The column names are used as xpath expressions on each item xml node.
    Example: a column named "pubDate" would return the pubDate element of
    an rss item.
    """

    def __init__(self, options, columns):
        """Read the table options and remember the column definitions."""
        super(RssFdw, self).__init__(options, columns)
        self.url = options.get('url', None)
        # (time of fetch, parsed rows) of the last successful download.
        self.cache = (None, None)
        self.cache_duration = options.get('cache_duration', None)
        if self.cache_duration is not None:
            self.cache_duration = timedelta(seconds=int(self.cache_duration))
        if self.url is None:
            log_to_postgres("You MUST set an url when creating the table!",
                            ERROR)
        self.columns = columns
        self.default_namespace_prefix = options.pop(
            'default_namespace_prefix', None)
        self.item_root = options.pop('item_root', 'item')

    def get_namespaces(self, xml):
        """Return the prefix -> uri mapping to use for xpath on ``xml``.

        The mapping is a copy of ``xml.nsmap``. The default namespace is
        stored there under the ``None`` key, which xpath evaluation cannot
        use: it is re-registered under ``default_namespace_prefix`` when
        that option is set, and left out of the mapping otherwise.
        """
        ns = dict(xml.nsmap)
        default_uri = ns.pop(None, None)
        if default_uri is not None and self.default_namespace_prefix:
            ns[self.default_namespace_prefix] = default_uri
        return ns

    @staticmethod
    def _node_text(node):
        """Return ``node.text`` for elements, or ``node`` itself otherwise.

        xpath expressions selecting attributes or text nodes produce plain
        string results, which have no ``text`` attribute.
        """
        return getattr(node, 'text', node)

    def make_item_from_xml(self, xml_elem):
        """Internal method used for parsing item xml element from the
        columns definition.

        Every column name is evaluated as an xpath expression relative to
        ``xml_elem``. Columns whose expression yields nothing are left out
        of the returned dict. Otherwise the value depends on the column
        type: ``json*`` columns get a json array describing the matched
        elements, array columns (type name ending in ``[]``) get the text
        of every match, and any other column gets the text of the first
        match.
        """
        item = {}
        # The mapping only depends on the element: compute it once rather
        # than once per column.
        namespaces = self.get_namespaces(xml_elem)
        for prop, column in self.columns.items():
            value = xml_elem.xpath(prop, namespaces=namespaces)
            if not value:
                continue
            if not isinstance(value, list):
                # Expressions such as string(...) or count(...) return a
                # bare scalar instead of a node list.
                value = [value]
            if column.type_name.startswith('json'):
                item[prop] = json.dumps([
                    element_to_dict(val) for val in value])
            # There should be a better way
            # oid is 1009 ?
            elif column.type_name.endswith('[]'):
                item[prop] = [self._node_text(elem) for elem in value]
            else:
                item[prop] = self._node_text(value[0])
        return item

    def execute(self, quals, columns):
        """Return the feed items as a list of dicts. Quals are ignored.

        When ``cache_duration`` is set and the previous successful fetch is
        recent enough, its rows are returned without any network access.
        If the feed cannot be retrieved or parsed, a message is logged and
        an empty list is returned.
        """
        if self.cache_duration is not None:
            date, values = self.cache
            if (values is not None
                    and (datetime.now() - date) < self.cache_duration):
                return values
        try:
            response = urlopen(self.url)
            try:
                payload = response.read()
            finally:
                # Release the connection even when reading fails.
                response.close()
            xml = etree.fromstring(payload)
            items = [self.make_item_from_xml(elem)
                     for elem in xml.xpath(
                         '//%s' % self.item_root,
                         namespaces=self.get_namespaces(xml))]
            self.cache = (datetime.now(), items)
            return items
        except etree.ParseError:
            log_to_postgres("Malformed xml, returning nothing")
        except IOError:
            log_to_postgres("Cannot retrieve '%s'" % self.url, WARNING)
        # Both failure paths end up here: hand back an empty row set.
        return []
|