File: 12-dom.py

package info (click to toggle)
python-pattern 2.6%2Bgit20180818-2
  • links: PTS
  • area: main
  • in suites: bullseye
  • size: 93,888 kB
  • sloc: python: 28,119; xml: 15,085; makefile: 194
file content (163 lines) | stat: -rw-r--r-- 5,757 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import URL, DOM, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The pattern.web module has a number of convenient search engines, as demonstrated.
# But often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="_1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah">
#     ...
#     <span class="y8HYJ-y_lTUHkQIc1mdCq yj3st6-1 kYJFRo">
#     ...
#         <a class="SQnoC3ObvgnGjWt90zD9Z " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print(dom.body.content)
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah")[:5]: # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    print(link)

# The DOM object is a tree of nested Element and Text objects.
# All objects inherit from Node (check the source code).

# Node.type       : NODE, TEXT, COMMENT, ELEMENT or DOM
# Node.parent     : Parent Node object.
# Node.children   : List of child Node objects.
# Node.next       : Next Node in Node.parent.children.
# Node.previous   : Previous Node in Node.parent.children.

# DOM.head        : Element with tag name "head".
# DOM.body        : Element with tag name "body".

# Element.tag     : Element tag name, e.g. "body".
# Element.attrs   : Dictionary of tag attributes, e.g. {"class": "header"}
# Element.content : Element HTML content as a string.
# Element.source  : Element tag + content

# Element.get_element_by_id(value)
# Element.get_elements_by_tagname(value)
# Element.get_elements_by_classname(value)
# Element.get_elements_by_attribute(name=value)

# You can also use shorter aliases (we prefer them):
# Element.by_id(), by_tag(), by_class(), by_attr().

# The tag name passed to Element.by_tag() can include
# a class (e.g., "div.message") or an id (e.g., "div#header").

# For example:
# In the <head> tag, retrieve the <meta name="keywords"> element.
# Get the string value of its "content" attribute and split into a list:
dom = DOM(URL("https://www.apple.com/uk/").download(cached=True))
kw = dom.head.by_attr(name="Description")[0]
kw = kw.attrs["content"]
print(kw)
print("")

# If you know CSS, you can also use short and handy CSS selectors:
# http://www.w3.org/TR/CSS2/selector.html
# Element(selector) will return a list of nested elements that match the given string.
dom = DOM(URL("http://www.clips.ua.ac.be").download())
for e in dom("div#ContentPlaceHolder1_ctl00_ctl01_Omkadering span div:contents p"):
    print(plaintext(e.content))
    print("")



######################################## Test Techcrunch - https://techcrunch.com/ ####################################

print("#"*40, "Test Techcrunch", "#"*40)
url = URL("https://techcrunch.com/startups/")
dom = DOM(url.download(cached=True))

for e in dom.by_tag("header.post-block__header")[:5]:
    for a in e.by_tag("h2.post-block__title")[:1]:
        print(plaintext(a.content))
        for h in a.by_tag("a.post-block__title__link")[:1]:
            print(h.attrs["href"])
        print("")
print("\n")

header = dom.by_class("river__title")[0]
print(header.content)
print("\n")


title_image = dom.by_attr(name="msapplication-TileImage")[0]
print(title_image.attrs['content'])
print("\n")


url = URL("https://techcrunch.com")
dom = DOM(url.download(cached=True))
for k in dom.by_class("post-block__title__link"):
    print(k.content.strip())
    print("")

print("\n")

for e in dom("header:post-block__header h2:post-block__title a:post-block__title__link"):
    print(e.content.strip())
    print(e.attrs["href"])
    print("")


################################ Test Habr - https://habr.com ####################################

print("#"*40, "Test Habr", "#"*40)
url = URL("https://habr.com")
dom = DOM(url.download(cached=True))

for e in dom.by_tag("h2.post__title")[:5]:
    for a in e.by_tag("a.post__title_link")[:1]:
        print(plaintext(a.content))
        print("")
print("\n")

for k in dom.by_class("post__hubs inline-list"):
    for p in k.by_tag("li.inline-list__item inline-list__item_hub"):
        for t in p.by_tag("a.inline-list__item-link hub-link "):
            print(t.content)
print("\n")


descr = dom.by_attr(name="description")[0]
print(descr.attrs['content'])
print("\n")

for p in dom("div#broadcast_tabs_posts"):
    for e in p.by_class("content-list content-list_most-read"):
        for k in e.by_tag("a.post-info__title post-info__title_large"):
            print(plaintext(k.content))
        print("")