File: pages.py

package info (click to toggle)
weboob 0.c-4.1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 5,008 kB
  • sloc: python: 28,678; perl: 244; sh: 198; makefile: 111; sql: 17
file content (120 lines) | stat: -rw-r--r-- 4,200 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-

# Copyright(C) 2012 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


import re
from decimal import Decimal
from dateutil.parser import parse as parse_date

from weboob.tools.browser import BasePage
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.housing import Housing


# Public names exported on a wildcard import of this module.
__all__ = ['SearchResultsPage', 'HousingPage']


class SearchResultsPage(BasePage):
    """Search results page: one ad summary per ``div.annonce-resume``."""

    # Publication line "Annonce <word> du <date>"; group 1 is the date text.
    # Raw string so that \w is a regex escape rather than a (invalid) string
    # escape; re.UNICODE makes \w match accented letters on Python 2 as well
    # (it is already the default for str patterns on Python 3).
    DATE_RE = re.compile(r'Annonce \w+ du (.*)', re.UNICODE)

    # Ad title; group 3 is the integer located right before
    # "<U+00A0>m<U+00B2>" (no-break space, 'm', superscript two).
    _TITLE_RE = re.compile(r'(\w+) (.+) (\d+)\xa0m\xb2 (.*)', re.UNICODE)

    # Characters trimmed from both ends of the price cell: blanks, no-break
    # space and the euro sign.
    _PRICE_STRIP = u' \t\u20ac\xa0\n\r'

    # French month names and the English names they are replaced with before
    # the date text is handed to the date parser.
    MONTHS = {
        u'janvier': 'january',
        u'février': 'february',
        u'mars': 'march',
        u'avril': 'april',
        u'mai': 'may',
        u'juin': 'june',
        u'juillet': 'july',
        u'août': 'august',
        u'septembre': 'september',
        u'octobre': 'october',
        u'novembre': 'november',
        u'décembre': 'december',
    }

    def iter_housings(self):
        """Yield a Housing object for every ad summary of the page.

        For each ad the following attributes are set: ``title``, ``cost``,
        ``currency``, ``station`` (NotAvailable without a ``p.metro``
        element), ``text`` and ``photos`` (always NotAvailable).  ``area``
        is set only when the title matches the title pattern, ``date`` only
        when the publication line matches DATE_RE, and ``location`` only
        when the summary paragraph contains a ``<b>`` child.
        """
        for div in self.document.getroot().cssselect('div.annonce-resume'):
            link = div.cssselect('td.lien-annonce')[0].find('a')
            # The identifier is whatever follows the last '-' of the href.
            housing_id = link.attrib['href'].split('-')[-1]
            housing = Housing(housing_id)
            housing.title = link.text.strip()

            title_match = self._TITLE_RE.match(housing.title)
            if title_match:
                housing.area = Decimal(title_match.group(3))

            price = div.cssselect('td.prix')[0].text.strip(self._PRICE_STRIP)
            # Dots are dropped and the comma becomes the decimal point
            # ("1.200,50" -> "1200.50") so that Decimal can parse the value.
            housing.cost = Decimal(price.replace('.', '').replace(',', '.'))
            housing.currency = u'€'

            published = div.cssselect('p.date-publication')[0].text.strip()
            date_match = self.DATE_RE.match(published)
            if date_match:
                date = date_match.group(1)
                # items() instead of iteritems(): available on both
                # Python 2 and Python 3.
                for fr, en in self.MONTHS.items():
                    date = date.replace(fr, en)
                housing.date = parse_date(date)

            metro = div.cssselect('p.metro')
            housing.station = metro[0].text.strip() if metro else NotAvailable

            summary = div.cssselect('p.annonce-resume-texte')[0]
            bold = summary.find('b')
            if bold is not None:
                # The first <b> child holds the location and the text that
                # follows it is the description.  lxml gives None, not '',
                # for a missing text node, hence the "or" fallbacks.
                housing.text = (bold.tail or u'').strip()
                housing.location = bold.text
            else:
                housing.text = (summary.text or u'').strip()

            housing.photos = NotAvailable

            yield housing

class HousingPage(BasePage):
    """Detail page of a single ad, rooted at ``div#annonce_detail``."""

    # Ad title; group 3 is the integer located right before
    # "<U+00A0>m<U+00B2>" (no-break space, 'm', superscript two).
    # Raw string so that \w and \d are regex escapes rather than (invalid)
    # string escapes; re.UNICODE makes \w match accented letters on Python 2
    # as well (it is already the default for str patterns on Python 3).
    _TITLE_RE = re.compile(r'(\w+) (.+) (\d+)\xa0m\xb2 (.*)', re.UNICODE)

    # Characters trimmed from both ends of the price: blanks, no-break space
    # and the euro sign.
    _PRICE_STRIP = u' \t\u20ac\xa0\n\r'

    def get_housing(self):
        """Build and return the Housing described by this page.

        The identifier is whatever follows the last '-' of the page URL.
        The ``<h1>`` text is split on ' - ': the first part is the title and
        the second one the price.  ``details`` and ``photos`` are always
        NotAvailable; ``date`` is never extracted here; ``area``,
        ``station``, ``location`` and ``phone`` are NotAvailable unless the
        matching piece of markup is found.  ``cost`` is NotAvailable when
        the heading contains no ' - ' separator.
        """
        div = self.parser.select(self.document.getroot(), 'div#annonce_detail', 1)
        housing = Housing(self.url.split('-')[-1])

        # Optional fields start as NotAvailable and are overwritten below
        # when the corresponding information is present on the page.
        housing.date = housing.station = housing.location = housing.phone = NotAvailable
        housing.area = NotAvailable

        parts = div.find('h1').text.split(' - ')
        housing.title = parts[0].strip()
        if len(parts) > 1:
            price = parts[1].strip(self._PRICE_STRIP)
            # Dots are dropped and the comma becomes the decimal point
            # ("1.200,50" -> "1200.50") so that Decimal can parse the value.
            housing.cost = Decimal(price.replace('.', '').replace(',', '.'))
        else:
            # Heading without a price part: report it as missing instead of
            # failing with IndexError.
            housing.cost = NotAvailable
        housing.currency = u'€'

        title_match = self._TITLE_RE.match(housing.title)
        if title_match:
            housing.area = Decimal(title_match.group(3))

        metro = div.cssselect('p.metro')
        if metro:
            housing.station = metro[0].text.strip()

        detail = div.cssselect('p.annonce-detail-texte')[0]
        bolds = detail.findall('b')
        if bolds:
            # The first <b> child holds the location and the text that
            # follows it is the description; a second <b>, when present,
            # holds the phone.  lxml gives None, not '', for a missing text
            # node, hence the "or" fallbacks.
            housing.text = (bolds[0].tail or u'').strip()
            housing.location = bolds[0].text
            if len(bolds) > 1:
                housing.phone = bolds[1].text
        else:
            housing.text = (detail.text or u'').strip()

        housing.details = NotAvailable
        housing.photos = NotAvailable

        return housing