File: commonsense.py

package info (click to toggle)
python-pattern 2.6+git20150109-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,672 kB
  • sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (281 lines) | stat: -rw-r--r-- 11,504 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#### PATTERN | COMMONSENSE #########################################################################
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern

####################################################################################################

from codecs    import BOM_UTF8
from urllib    import urlopen
from itertools import chain

from __init__ import Graph, Node, Edge, bfs
from __init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

import os

# Directory that contains this module; commonsense.csv is looked up here by default.
# Stays "" when the location can't be determined (e.g., __file__ is not defined).
MODULE = ""
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    pass

#### COMMONSENSE SEMANTIC NETWORK ##################################################################

#--- CONCEPT ---------------------------------------------------------------------------------------

class Concept(Node):

    def __init__(self, *args, **kwargs):
        """ A concept in the semantic network.
            All arguments are passed on to Node.
        """
        Node.__init__(self, *args, **kwargs)
        # Cache for Concept.properties (computed on first access).
        self._properties = None

    @property
    def halo(self, depth=2):
        """ Returns the concept halo: a list with this concept + surrounding concepts.
            The halo is useful to reason more fluidly about the concept,
            since it includes latent properties linked to nearby concepts.
            Accessed as a property, so depth is always the default (2).
        """
        surrounding = self.flatten(depth=depth)
        return surrounding

    @property
    def properties(self):
        """ Returns the top properties in the concept halo, sorted by centrality (highest first).
            The return value is a list of concept id's instead of Concepts (for performance).
            The list is computed once and then cached.
        """
        if self._properties is not None:
            return self._properties
        # Subgraph with only the concepts in the halo.
        subgraph = self.graph.copy(nodes=self.halo)
        # Keep the nodes whose id is a known property in the full graph,
        # sorted by ascending centrality and then reversed.
        ranked = sorted(
            (n for n in subgraph.nodes if n.id in self.graph.properties),
            key=lambda n: n.centrality)
        ranked.reverse()
        self._properties = [n.id for n in ranked]
        return self._properties

def halo(concept, depth=2):
    """ Returns the halo of the given concept:
        the result of concept.flatten() for the given depth.
    """
    surrounding = concept.flatten(depth=depth)
    return surrounding

def properties(concept, depth=2, centrality=BETWEENNESS):
    """ Returns a list of id's of the nodes in the halo of the given concept (with the given depth)
        that are known properties in the concept's graph.
        The list is sorted by the node attribute named by centrality, highest first.
    """
    graph = concept.graph
    # Subgraph with only the concepts in the halo.
    subgraph = graph.copy(nodes=halo(concept, depth))
    # Sort ascending by the requested attribute, then reverse.
    ranked = sorted(
        (n for n in subgraph.nodes if n.id in graph.properties),
        key=lambda n: getattr(n, centrality))
    ranked.reverse()
    return [n.id for n in ranked]

#--- RELATION --------------------------------------------------------------------------------------

class Relation(Edge):

    def __init__(self, *args, **kwargs):
        """ A relation between two concepts, with an optional context.
            For example, "Felix is-a cat" is in the "media" context, "tiger is-a cat" in "nature".
            The optional "context" keyword argument is stored as Relation.context (default None);
            all other arguments are passed on to Edge.
        """
        # Take "context" out of the keyword arguments before they are handed to Edge.
        if "context" in kwargs:
            self.context = kwargs.pop("context")
        else:
            self.context = None
        Edge.__init__(self, *args, **kwargs)

#--- HEURISTICS ------------------------------------------------------------------------------------
# Similarity between concepts is measured using a featural approach:
# a comparison of the features/properties that are salient in each concept's halo.
# Commonsense.similarity() takes an optional "heuristic" parameter to tweak this behavior.
# It is a tuple of two functions:
# 1) function(concept) returns a list of salient properties (or other),
# 2) function(edge) returns the cost to traverse this edge (0.0-1.0).

def _commonality_properties(concept):
    # The salient features of a concept are its properties.
    return concept.properties

def _commonality_cost(edge):
    # Relations in the "properties" context cost nothing to traverse,
    # except "is-opposite-of" relations. All other relations cost 1.
    if edge.context == "properties" and edge.type != "is-opposite-of":
        return 0
    return 1

# Similarity heuristic that only traverses relations between properties.
COMMONALITY = (_commonality_properties, _commonality_cost)

#--- COMMONSENSE -----------------------------------------------------------------------------------

class Commonsense(Graph):
    
    def __init__(self, data=os.path.join(MODULE, "commonsense.csv"), **kwargs):
        """ A semantic network of commonsense, using different relation types:
            - is-a,
            - is-part-of,
            - is-opposite-of,
            - is-property-of,
            - is-related-to,
            - is-same-as,
            - is-effect-of.
            The given data is the path to a CSV-file of
            (concept1, relation, concept2, context, weight)-items to load,
            or None to start with an empty network.
            Other keyword arguments are passed on to Graph.
        """
        Graph.__init__(self, **kwargs)
        # Cache for Commonsense.properties, reset whenever the network is modified.
        self._properties = None
        if data is not None:
            # Always close the file, also if reading fails.
            f = open(data)
            try:
                s = f.read()
            finally:
                f.close()
            # Remove a leading byte order mark only.
            # Note: strip(BOM_UTF8) would treat the BOM as a set of bytes to remove
            # from both ends, which can damage the first or last character in the data.
            if s.startswith(BOM_UTF8):
                s = s[len(BOM_UTF8):]
            s = s.decode("utf-8")
            for r in s.splitlines():
                if not r.strip():
                    # Skip blank lines (e.g., a trailing newline at the end of the file).
                    continue
                concept1, relation, concept2, context, weight = \
                    [v.strip("\"") for v in r.split(",")]
                # The weight in the file is an integer; scale it by 0.1 and cap it at 1.0.
                self.add_edge(concept1, concept2,
                       type = relation,
                    context = context,
                     weight = min(int(weight) * 0.1, 1.0))

    @property
    def concepts(self):
        """ Returns the nodes in the network (alias for Graph.nodes).
        """
        return self.nodes
        
    @property
    def relations(self):
        """ Returns the edges in the network (alias for Graph.edges).
        """
        return self.edges
        
    @property
    def properties(self):
        """ Returns the set of id's of all concepts that are properties (i.e., adjectives).
            For example: "cold is-property-of winter" => "cold".
            The set is cached until the network is modified.
        """
        if self._properties is None:
            # Every concept that takes part in a relation with the "properties" context counts
            # (both ends of the relation), not only the source of "is-property-of" relations.
            # Build the set in a local variable, so the cache is never left half-built.
            ids = set()
            for e in self.edges:
                if e.context == "properties":
                    ids.add(e.node1.id)
                    ids.add(e.node2.id)
            self._properties = ids
        return self._properties
    
    def add_node(self, id, *args, **kwargs):
        """ Returns a Concept (Node subclass).
        """
        self._properties = None # Invalidate the cache.
        kwargs.setdefault("base", Concept)
        return Graph.add_node(self, id, *args, **kwargs)
        
    def add_edge(self, id1, id2, *args, **kwargs):
        """ Returns a Relation between two concepts (Edge subclass).
        """
        self._properties = None # Invalidate the cache.
        kwargs.setdefault("base", Relation)
        return Graph.add_edge(self, id1, id2, *args, **kwargs)
        
    def remove(self, x):
        """ Removes the given item from the network (see Graph.remove()).
        """
        self._properties = None # Invalidate the cache.
        Graph.remove(self, x)

    def similarity(self, concept1, concept2, k=3, heuristic=COMMONALITY):
        """ Returns the similarity of the given concepts,
            by cross-comparing shortest path distance between k concept properties.
            A given concept can be a Concept, a concept id,
            or a flat list of properties, e.g. ["creepy"].
            The given heuristic is a tuple of two functions:
            1) function(concept) returns a list of salient properties,
            2) function(edge) returns the cost for traversing this edge (0.0-1.0).
        """
        def salient(concept):
            # Concept id => Concept => list of salient properties => list of Concepts.
            if isinstance(concept, basestring):
                concept = self[concept]
            if isinstance(concept, Node):
                concept = heuristic[0](concept)
            if isinstance(concept, list):
                concept = [n if isinstance(n, Node) else self[n] for n in concept]
            return concept
        concept1 = salient(concept1)
        concept2 = salient(concept2)
        h = lambda id1, id2: heuristic[1](self.edge(id1, id2))
        w = 0.0
        for p1 in concept1[:k]:
            for p2 in concept2[:k]:
                p = self.shortest_path(p1, p2, heuristic=h)
                # The shorter the path, the higher the score.
                # No path (or an empty path) contributes virtually nothing,
                # instead of raising a ZeroDivisionError.
                w += 1.0 / (len(p) if p else 1e10)
        return w / k
        
    def nearest_neighbors(self, concept, concepts=[], k=3):
        """ Returns the given list of concepts, sorted by similarity to the given concept
            (most similar first). The list is not truncated:
            k is the number of properties to compare, see Commonsense.similarity().
        """
        # The default list is never modified (sorted() returns a new list).
        return sorted(concepts, key=lambda candidate: self.similarity(concept, candidate, k), reverse=True)
        
    similar = neighbors = nn = nearest_neighbors

    def taxonomy(self, concept, depth=3, fringe=2):
        """ Returns a list of concepts that are descendants of the given concept, using "is-a" relations.
            Creates a subgraph of "is-a" related concepts up to the given depth,
            then takes the fringe (i.e., leaves) of the subgraph.
            The given concept can be a Concept or a concept id.
        """
        def traversable(node, edge):
            # Follow parent-child edges.
            return edge.node2 == node and edge.type == "is-a"
        if not isinstance(concept, Node):
            concept = self[concept]
        g = self.copy(nodes=concept.flatten(depth, traversable))
        g = g.fringe(depth=fringe)
        # Return the concepts from this network (not from the subgraph copy),
        # without the given concept itself.
        g = [self[n.id] for n in g if n != concept]
        return g
        
    field = semantic_field = taxonomy

#g = Commonsense()
#print(g.nn("party", g.field("animal")))
#print(g.nn("creepy", g.field("animal")))

#### COMMONSENSE DATA ##############################################################################

#--- NODEBOX.NET/PERCEPTION ------------------------------------------------------------------------

def download(path=os.path.join(MODULE, "commonsense.csv"), threshold=50):
    """ Downloads commonsense data from http://nodebox.net/perception.
        Saves the data as a CSV-file at the given path (commonsense.csv by default),
        with (concept1, relation, concept2, context, weight)-rows.
        This file can be used as the data for Commonsense().
        Only authors with at least threshold contributions can define relations;
        the weight of a relation is the sum of the weights given by all authors.
    """
    s = "http://nodebox.net/perception?format=txt&robots=1"
    s = urlopen(s).read()
    s = s.decode("utf-8")
    s = s.replace("\\'", "'")
    # Group relations by author.
    # Each valid line has 7 fields:
    # concept1, relation, concept2, context, weight, author, date.
    a = {}
    for line in s.split("\n"):
        row = [v.strip("'") for v in line.split(", ")]
        if len(row) == 7:
            a.setdefault(row[-2], []).append(row)
    # Iterate authors sorted by number of contributions (most contributions first).
    # 1) Authors with threshold+ contributions can define new relations and context.
    # 2) Authors with fewer contributions (or robots) can only reinforce existing relations.
    a = sorted(a.items(), key=lambda v: len(v[1]), reverse=True)
    r = {} # {(concept1, relation, concept2): [context, weight]}
    for author, relations in a:
        if author == "" or author.startswith("robots@"):
            continue
        if len(relations) < threshold:
            # Authors are sorted by contributions, so all following authors are below threshold.
            break
        # Sort latest-first (we prefer more recent relation types).
        # Note: the previous cmp-function returned True/False (i.e., 1 or 0, never negative),
        # which is not a valid comparison and did not order the relations by date.
        # Assumes that the date strings sort chronologically -- TODO confirm.
        relations = sorted(relations, key=lambda v: v[-1], reverse=True)
        # 1) Define new relations.
        for concept1, relation, concept2, context, _weight, _author, _date in relations:
            k = (concept1, relation, concept2)
            if k not in r:
                r[k] = [None, 0]
            # The first context encountered for a relation is kept.
            if r[k][0] is None and context is not None:
                r[k][0] = context
    for author, relations in a:
        # 2) Reinforce existing relations.
        for concept1, relation, concept2, _context, weight, _author, _date in relations:
            k = (concept1, relation, concept2)
            if k in r:
                r[k][1] += int(weight)
    # Export CSV-file.
    s = []
    for (concept1, relation, concept2), (context, weight) in r.items():
        s.append("\"%s\",\"%s\",\"%s\",\"%s\",%s" % (
            concept1, relation, concept2, context, weight))
    # Always close the file, also if writing fails.
    f = open(path, "w")
    try:
        f.write(BOM_UTF8)
        f.write("\n".join(s).encode("utf-8"))
    finally:
        f.close()
    
def json():
    """ Returns the data from commonsense.csv as a string of the form "commonsense = [...];".
        Each relation is encoded as a [concept1, relation, concept2, context, weight] list,
        with string values in single quotes and the weight with two decimals.
    """
    def escape(s):
        # Values are wrapped in single quotes, so single quotes inside them are escaped.
        return s.replace("'", "\\'").encode("utf-8")
    graph = Commonsense()
    rows = [
        "\n\t['%s', '%s', '%s', '%s', %.2f]" % (
            escape(e.node1.id),
            escape(e.type),
            escape(e.node2.id),
            escape(e.context),
            e.weight)
        for e in graph.edges]
    return "commonsense = [%s];" % ", ".join(rows)

#download("commonsense.csv", threshold=50)
#open("commonsense.js", "w").write(json())