File: serialize.py

package info (click to toggle)
python-scrapy 0.24.2-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 3,240 kB
  • ctags: 4,259
  • sloc: python: 21,170; xml: 199; makefile: 67; sh: 44
file content (123 lines) | stat: -rw-r--r-- 4,295 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
import datetime
import decimal
import json

from twisted.internet import defer

from scrapy.spider import Spider
from scrapy.http import Request, Response
from scrapy.item import BaseItem


class SpiderReferencer(object):
    """Class to serialize (and deserialize) objects (typically dicts)
    containing references to running spiders (ie. Spider objects). This is
    required because json library fails to serialize dicts containing
    non-primitive types as keys, even when you override
    ScrapyJSONEncoder.default() with a custom encoding mechanism.

    Spiders are encoded as strings of the form ``spider:<hex id>:<name>``.
    When decoding, both the id and the name part are optional and a running
    spider matches if either its id or its name matches.
    """

    spider_ref_re = re.compile('^spider:([0-9a-f]+)?:?(.+)?$')

    # Text type recognised by decode_references(). ``basestring`` only exists
    # on Python 2; fall back to ``str`` so decoding also works on Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, crawler):
        """Store the crawler whose engine's open spiders are looked up when
        decoding. ``crawler`` may be None, in which case no spider is
        considered to be running.
        """
        self.crawler = crawler

    def _open_spiders(self):
        """Return the spiders currently open in the crawler's engine, or an
        empty tuple when no crawler was given."""
        if self.crawler is None:
            return ()
        return self.crawler.engine.open_spiders

    def get_reference_from_spider(self, spider):
        """Return the reference string (``spider:<hex id>:<name>``) for the
        given spider."""
        return 'spider:%x:%s' % (id(spider), spider.name)

    def get_spider_from_reference(self, ref):
        """Returns the Spider referenced by text, if text is a spider
        reference. Otherwise it returns the text itself. If the text references
        a non-running spider it raises a RuntimeError.
        """
        m = self.spider_ref_re.search(ref)
        if not m:
            return ref
        spid, spname = m.groups()
        for spider in self._open_spiders():
            # Either part of the reference is enough to identify the spider.
            if "%x" % id(spider) == spid or spider.name == spname:
                return spider
        raise RuntimeError("Spider not running: %s" % ref)

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references.

        Dicts (keys and values), lists and tuples are walked recursively;
        lists and tuples both come back as lists. Any other object is
        returned unchanged.
        """
        if isinstance(obj, Spider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.encode_references(k)
                v = self.encode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.encode_references(x) for x in obj]
        else:
            return obj

    def decode_references(self, obj):
        """Look for spider references and replace them with Spider objects.

        Dicts (keys and values), lists and tuples are walked recursively;
        lists and tuples both come back as lists. Strings that are not spider
        references, and any other object, are returned unchanged. Raises
        RuntimeError if a reference points to a spider that is not running.
        """
        if isinstance(obj, self._string_types):
            return self.get_spider_from_reference(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.decode_references(k)
                v = self.decode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.decode_references(x) for x in obj]
        else:
            return obj


class ScrapyJSONEncoder(json.JSONEncoder):
    """JSON encoder that replaces Spider objects with spider references
    before encoding and knows how to serialize a few extra types (dates,
    times, decimals, deferreds, items, requests and responses).

    Accepts two extra keyword arguments, both removed before the remaining
    arguments are handed to json.JSONEncoder: ``crawler`` and ``spref``. When
    no (truthy) ``spref`` is given, a SpiderReferencer is built from
    ``crawler``.
    """

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def __init__(self, *a, **kw):
        crawler = kw.pop('crawler', None)
        referencer = kw.pop('spref', None)
        if not referencer:
            referencer = SpiderReferencer(crawler)
        self.spref = referencer
        super(ScrapyJSONEncoder, self).__init__(*a, **kw)

    def encode(self, o):
        """Encode ``o`` to a JSON string, after substituting spider
        references for any Spider objects found by the referencer."""
        target = self.spref.encode_references(o) if self.spref else o
        return super(ScrapyJSONEncoder, self).encode(target)

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``; objects of
        unsupported types are delegated to the base class."""
        # datetime must be tested before date: it is a subclass of date.
        if isinstance(o, datetime.datetime):
            datetime_format = "%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)
            return o.strftime(datetime_format)
        if isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        if isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        if isinstance(o, decimal.Decimal):
            return str(o)
        if isinstance(o, defer.Deferred):
            return str(o)
        if isinstance(o, BaseItem):
            return dict(o)
        if isinstance(o, Request):
            return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
        if isinstance(o, Response):
            return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
        return super(ScrapyJSONEncoder, self).default(o)


class ScrapyJSONDecoder(json.JSONDecoder):
    """JSON decoder that replaces spider references found in the decoded
    document with the Spider objects they refer to.

    Accepts two extra keyword arguments, both removed before the remaining
    arguments are handed to json.JSONDecoder: ``crawler`` and ``spref``. When
    no (truthy) ``spref`` is given, a SpiderReferencer is built from
    ``crawler``.
    """

    def __init__(self, *a, **kw):
        crawler = kw.pop('crawler', None)
        referencer = kw.pop('spref', None)
        if not referencer:
            referencer = SpiderReferencer(crawler)
        self.spref = referencer
        super(ScrapyJSONDecoder, self).__init__(*a, **kw)

    def decode(self, s):
        """Decode the JSON document ``s`` and then resolve any spider
        references in the result through the referencer."""
        decoded = super(ScrapyJSONDecoder, self).decode(s)
        if not self.spref:
            return decoded
        return self.spref.decode_references(decoded)