File: lxmldocument.py

Package: python-scrapy 0.24.2-1
"""
This module contains a simple class (LxmlDocument) which provides cache and
garbage collection to lxml element tree documents.
"""

import weakref
from lxml import etree
from scrapy.utils.trackref import object_ref


def _factory(response, parser_cls):
    # Parse the response body into an lxml tree, falling back to a minimal
    # '<html/>' document when the body is empty so parsing always succeeds.
    url = response.url
    body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
    parser = parser_cls(recover=True, encoding='utf8')
    return etree.fromstring(body, parser=parser, base_url=url)


class LxmlDocument(object_ref):

    # Weakly keyed by response, so cached trees are dropped as soon as the
    # response itself is garbage collected.
    cache = weakref.WeakKeyDictionary()
    __slots__ = ['__weakref__']

    def __new__(cls, response, parser=etree.HTMLParser):
        # One cached tree per (response, parser class) pair.
        cache = cls.cache.setdefault(response, {})
        if parser not in cache:
            obj = object_ref.__new__(cls)
            cache[parser] = _factory(response, parser)
        return cache[parser]

    def __str__(self):
        return "<LxmlDocument %s>" % self.root.tag