Title

"""
Purpose
-------

This fdw can be used to access items from an rss feed.
The column names are mapped to the elements inside an item.
An rss item has the following structure:

.. code-block:: xml

    <item>
      <title>Title</title>
      <pubDate>2011-01-02</pubDate>
      <link>http://example.com/test</link>
      <guid>http://example.com/test</guid>
      <description>Small description</description>
    </item>

You can access every element by defining a column with the same name. Be
careful to match the case! Example: pubDate should be quoted like this:
``"pubDate"`` to preserve the uppercased ``D``.

.. api_compat::
    :read:

Dependencies
------------

You will need the `lxml`_ library.

.. _lxml: http://lxml.de/

Required options
-----------------

``url`` (string)
  The RSS feed URL.

Usage Example
-------------

.. _Radicale: http://radicale.org

If you want to parse the `radicale`_ rss feed, you can use the following
definition:

.. code-block:: sql

    CREATE SERVER rss_srv foreign data wrapper multicorn options (
        wrapper 'multicorn.rssfdw.RssFdw'
    );

    CREATE FOREIGN TABLE radicalerss (
        "pubDate" timestamp,
        description character varying,
        title character varying,
        link character varying
    ) server rss_srv options (
        url     'http://radicale.org/rss/'
    );

    select "pubDate", title, link from radicalerss limit 10;

.. code-block:: bash

           pubDate       |              title               |                     link
    ---------------------+----------------------------------+----------------------------------------------
     2011-09-27 06:07:42 | Radicale 0.6.2                   | http://radicale.org/news#2011-09-27@06:07:42
     2011-08-28 13:20:46 | Radicale 0.6.1, Changes, Future  | http://radicale.org/news#2011-08-28@13:20:46
     2011-08-01 08:54:43 | Radicale 0.6 Released            | http://radicale.org/news#2011-08-01@08:54:43
     2011-07-02 20:13:29 | Feature Freeze for 0.6           | http://radicale.org/news#2011-07-02@20:13:29
     2011-05-01 17:24:33 | Ready for WSGI                   | http://radicale.org/news#2011-05-01@17:24:33
     2011-04-30 10:21:12 | Apple iCal Support               | http://radicale.org/news#2011-04-30@10:21:12
     2011-04-25 22:10:59 | Two Features and One New Roadmap | http://radicale.org/news#2011-04-25@22:10:59
     2011-04-10 20:04:33 | New Features                     | http://radicale.org/news#2011-04-10@20:04:33
     2011-04-02 12:11:37 | Radicale 0.5 Released            | http://radicale.org/news#2011-04-02@12:11:37
     2011-02-03 23:35:55 | Jabber Room and iPhone Support   | http://radicale.org/news#2011-02-03@23:35:55
    (10 lignes)

"""

from . import ForeignDataWrapper
from contextlib import closing
from datetime import datetime, timedelta
from lxml import etree
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from logging import ERROR, WARNING
from multicorn.utils import log_to_postgres
import json


def element_to_dict(element):
    """Convert an lxml element into a plain, JSON-serializable dict.

    The returned dict has four keys:

    - ``tag``: the element's local name (any namespace is stripped),
    - ``text``: the element's leading text, or ``''`` when there is none,
    - ``attributes``: a dict copy of the element's attributes,
    - ``children``: a list with the same structure for every child node.

    Only ``element.text`` is read, so text that follows a child element
    (its "tail") does not appear in the result.

    >>> element_to_dict(etree.fromstring('<t a1="v1"/>')) == {
    ...     'tag': 't', 'text': '', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring('<t a1="v1">Txt</t>')) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {'a1': 'v1'},
    ...     'children': []}
    True
    >>> element_to_dict(etree.fromstring(
    ...     '<t>Txt<s1 a1="v1">Sub1</s1>Txt2<s2 a2="v2">Sub2</s2>Txt3</t>'
    ... )) == {
    ...     'tag': 't', 'text': 'Txt', 'attributes': {},
    ...     'children': [
    ...         {'tag': 's1', 'text': 'Sub1', 'attributes': {'a1': 'v1'},
    ...          'children': []},
    ...         {'tag': 's2', 'text': 'Sub2', 'attributes': {'a2': 'v2'},
    ...          'children': []}]}
    True

    """
    return {
        'tag': etree.QName(element.tag).localname,
        'text': element.text or '',
        'attributes': dict(element.attrib),
        'children': [element_to_dict(e) for e in element]
    }


class RssFdw(ForeignDataWrapper):
    """An rss foreign data wrapper.

    The following options are accepted:

    url                      -- The rss feed url (required).
    cache_duration           -- Optional number of seconds during which the
                                last successfully fetched items are served
                                again without re-downloading the feed.
    default_namespace_prefix -- Optional prefix under which the document's
                                default (prefix-less) namespace is exposed
                                to the xpath expressions.
    item_root                -- Name of the element holding one row
                                (defaults to ``item``).

    The column names are used as xpath expressions evaluated on each item
    xml node.

    Example: a column named "pubDate" would return the pubDate element of an
    rss item.

    """

    def __init__(self, options, columns):
        super(RssFdw, self).__init__(options, columns)
        self.url = options.get('url', None)
        # (fetch time, items) of the last successful download.
        self.cache = (None, None)
        self.cache_duration = options.get('cache_duration', None)
        if self.cache_duration is not None:
            self.cache_duration = timedelta(seconds=int(self.cache_duration))
        if self.url is None:
            log_to_postgres("You MUST set an url when creating the table!",
                            ERROR)
        self.columns = columns
        self.default_namespace_prefix = options.pop(
            'default_namespace_prefix', None)
        self.item_root = options.pop('item_root', 'item')

    def get_namespaces(self, xml):
        """Return a prefix -> uri mapping usable by ``xpath()`` for ``xml``.

        The mapping is a copy of the element's ``nsmap``. The default
        namespace (stored by lxml under the ``None`` key) is re-registered
        under ``default_namespace_prefix`` when that option is set, and
        dropped otherwise: lxml refuses empty prefixes in xpath namespace
        maps, so keeping it would make every xpath call fail.
        """
        ns = dict(xml.nsmap)
        if None in ns:
            default_uri = ns.pop(None)
            if self.default_namespace_prefix:
                ns[self.default_namespace_prefix] = default_uri
        return ns

    def make_item_from_xml(self, xml_elem):
        """Internal method used for parsing item xml element from the
        columns definition.

        Each column name is evaluated as an xpath expression relative to
        ``xml_elem``. Columns whose expression matches nothing are left out
        of the returned dict. Json columns receive a json array describing
        every match, array columns a list with the text of every match, and
        any other column the text of the first match.
        """
        # The namespace map only depends on the item, not on the column.
        namespaces = self.get_namespaces(xml_elem)
        item = {}
        for prop, column in self.columns.items():
            value = xml_elem.xpath(prop, namespaces=namespaces)
            if not value:
                continue
            if column.type_name.startswith('json'):
                item[prop] = json.dumps([
                    element_to_dict(val) for val in value])
            # There should be a better way
            # oid is 1009 ?
            elif column.type_name.endswith('[]'):
                # xpath may return plain strings (attributes, text()) which
                # have no ``text`` attribute: use them as they are.
                item[prop] = [getattr(elem, 'text', elem) for elem in value]
            else:
                item[prop] = getattr(value[0], 'text', value[0])
        return item

    def execute(self, quals, columns):
        """Return the feed items as a list of dicts. Quals are ignored.

        When ``cache_duration`` is set and the cached items are younger than
        that duration, they are returned without fetching the feed again.
        If the feed cannot be retrieved or parsed, a message is logged and
        an empty list is returned.
        """
        if self.cache_duration is not None:
            date, values = self.cache
            if values is not None:
                if (datetime.now() - date) < self.cache_duration:
                    return values
        try:
            # closing() works with both the python 2 and python 3 urlopen
            # and makes sure the connection is released.
            with closing(urlopen(self.url)) as response:
                payload = response.read()
            xml = etree.fromstring(payload)
            items = [self.make_item_from_xml(elem)
                     for elem in xml.xpath(
                         '//%s' % self.item_root,
                         namespaces=self.get_namespaces(xml))]
            self.cache = (datetime.now(), items)
            return items
        except etree.ParseError:
            log_to_postgres("Malformed xml, returning nothing")
        except IOError:
            log_to_postgres("Cannot retrieve '%s'" % self.url, WARNING)
        # Always hand an iterable back to the caller, even on failure.
        return []