File: document.py

package info (click to toggle)
python-scrapy 0.14.4-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 3,064 kB
  • sloc: python: 19,468; xml: 199; sh: 134; makefile: 67
file content (40 lines) | stat: -rw-r--r-- 1,279 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
This module contains a simple class (Libxml2Document) which provides caching
and garbage collection for libxml2 documents (xmlDoc).
"""

import weakref

from scrapy.utils.trackref import object_ref
from .factories import xmlDoc_from_html

class Libxml2Document(object_ref):
    """Cached wrapper around a libxml2 document (xmlDoc) and its XPath context.

    Instances are shared: at most one document is built per
    (response, factory) pair, and later requests for the same pair get the
    same object back. The libxml2 resources held by an instance are released
    in ``__del__``.
    """

    # response -> {factory: Libxml2Document}. Keys are held weakly, so an
    # entry goes away when its response is garbage collected.
    cache = weakref.WeakKeyDictionary()
    # '__weakref__' keeps instances weak-referenceable despite __slots__.
    __slots__ = ['xmlDoc', 'xpathContext', '__weakref__']

    def __new__(cls, response, factory=xmlDoc_from_html):
        """Return the document for ``response`` built by ``factory``.

        ``response`` is used as a key of a ``WeakKeyDictionary``, so it must
        be hashable and weak-referenceable. ``factory`` is called with the
        response and must return an object providing ``xpathNewContext()``.
        The document is built on the first request only; if building it
        raises, the exception propagates and no document is cached.
        """
        cache = cls.cache.setdefault(response, {})
        if factory not in cache:
            obj = object_ref.__new__(cls)
            obj.xmlDoc = factory(response)
            obj.xpathContext = obj.xmlDoc.xpathNewContext()
            # register only once fully built, so a failed build is retried
            cache[factory] = obj
        return cache[factory]

    def __del__(self):
        """Free the libxml2 document and its XPath context (best effort)."""
        # we must call both cleanup functions, so errors from one must not
        # prevent the other from being called
        # these calls sometimes raise a "NoneType is not callable" TypeError
        # so the try/except blocks silence them
        # Exception (rather than a bare except) is caught so that
        # KeyboardInterrupt/SystemExit are not silently swallowed; the
        # finally clause still attempts the second cleanup in that case
        try:
            self.xmlDoc.freeDoc()
        except Exception:
            pass
        finally:
            try:
                self.xpathContext.xpathFreeContext()
            except Exception:
                pass

    def __str__(self):
        """Return a short description including the xmlDoc's name."""
        return "<Libxml2Document %s>" % self.xmlDoc.name