File: 10-dbpedia.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,672 kB
  • sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (148 lines) | stat: -rw-r--r-- 4,720 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 *-*
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import DBPedia

dbp = DBPedia()

# DBPedia is a database of structured information mined from Wikipedia.
# DBPedia data is stored as RDF triples: (subject, predicate, object),
# e.g., X is-a Actor, Y is-a Country, Z has-birthplace Country, ...
# If you know about pattern.graph (or graphs in general),
# this triple format should look familiar. 

# DBPedia can be queried using SPARQL: 
# http://dbpedia.org/sparql
# http://www.w3.org/TR/rdf-sparql-query/
# A SPARQL query yields rows that match all triples in the WHERE clause.
# A SPARQL query uses ?wildcards in triple subject/object to select fields.

# 1) Search DBPedia for actors.

# Variables are indicated with a "?" prefix.
# Variables will be bound to the corresponding part of each matched triple.
# The "a" is short for "is of the class".
# The "prefix" statement creates a shorthand for a given namespace.
# To see what semantic constraints are available in "dbo" (for example):
# http://dbpedia.org/ontology/
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor where { 
    ?actor a dbo:Actor.
}
"""
for result in dbp.search(q, start=1, count=10):
    print result.actor
print
    
# You may notice that each Result.actor is of the form: 
# "http://dbpedia.org/resource/[NAME]"
# This kind of string is a subclass of unicode: DBPediaResource.
# DBPediaResource has a DBPediaResource.name property (see below).

# 2) Search DBPedia for actors and their place of birth.

q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where { 
    ?actor a dbo:Actor.
    ?actor dbo:birthPlace ?place.
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.place.name)
print

# You will notice that the results now include duplicates,
# the same actor with a city name, and with a country name.
# We could refine ?place by including the following triple:
# "?place a dbo:Country."

# 3) Search DBPedia for actors born in 1970.

# Each result must match both triples, i.e.,
# X is an actor + X is born on Y.
# We don't want to filter by month and day (e.g., "1970-12-31"),
# so we use a regular expression instead with filter():
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?date where { 
    ?actor a dbo:Actor.
    ?actor dbo:birthDate ?date. 
    filter(regex(str(?date), "1970-..-.."))
}
order by ?date
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.date)
print

# We could also make this query shorter,
# by combining the two ?actor triples into one:
# "?actor a dbo:Actor; dbo:birthDate ?date."

# 4) A more advanced example, in German:

q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?actor ?place where { 
    ?_actor a dbo:Actor.
    ?_actor dbo:birthPlace ?_place.
    ?_actor rdfs:label ?actor.
    ?_place rdfs:label ?place.
    filter(lang(?actor) = "de" && lang(?place) = "de")
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor, r.place)
print

# This extracts a German label for each matched DBPedia resource.
# - X is an actor,
# - X is born in Y,
# - X has a German label A,
# - Y has a German label B,
# - Retrieve A and B.

# For example, say one of the matched resources was:
# "<http://dbpedia.org/page/Erwin_Schrödinger>"
# If you open this URL in a browser, 
# you will see all the available semantic properties and their values.
# One of the properties is "rdfs:label": a human-readable & multilingual label.

# 5) Find triples involving cats.

# <http://purl.org/dc/terms/subject> 
# means: "is in the category of".
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?cat ?relation ?concept where {
    ?cat <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Felis>.
    ?cat ?_relation ?_concept.
    ?_relation rdfs:label ?relation.
    ?_concept rdfs:label ?concept.
    filter(lang(?relation) = "en" && lang(?concept) = "en")
} order by ?cat
"""
for r in dbp.search(q, start=1, count=10):
    print "%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept)
print

# 6) People whose first name includes "Édouard"

q = u"""
prefix dbo: <http://dbpedia.org/ontology/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
select ?person ?name where { 
    ?person a dbo:Person.
    ?person foaf:givenName ?name.
    filter(regex(?name, "Édouard"))
}
"""
for result in dbp.search(q, start=1, count=10, cached=False):
    print "%s (%s)" % (result.person.name, result.name)
print