File: similarity.py

package info (click to toggle)
python-wn 1.0.0-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,100 kB
  • sloc: python: 8,429; xml: 566; sql: 238; makefile: 12
file content (234 lines) | stat: -rw-r--r-- 8,072 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""Synset similarity metrics."""

import math

import wn
from wn._core import Synset
from wn.constants import ADJ, ADJ_SAT
from wn.ic import Freq, information_content


def path(synset1: Synset, synset2: Synset, simulate_root: bool = False) -> float:
    """Compute the Path similarity between *synset1* and *synset2*.

    The score is ``1 / (d + 1)``, where *d* is the length of the
    shortest path between the two synsets. When no path can be found
    the distance is treated as infinite and the score is ``0.0``.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        simulate_root: When :python:`True`, a fake root node connects
            all other roots; default: :python:`False`.

    Raises:
        wn.Error: When the synsets differ in part of speech.

    Example:
        >>> import wn
        >>> from wn.similarity import path
        >>> ewn = wn.Wordnet("ewn:2020")
        >>> spatula = ewn.synsets("spatula")[0]
        >>> path(spatula, ewn.synsets("pancake")[0])
        0.058823529411764705
        >>> path(spatula, ewn.synsets("utensil")[0])
        0.2
        >>> path(spatula, spatula)
        1.0
        >>> flip = ewn.synsets("flip", pos="v")[0]
        >>> turn_over = ewn.synsets("turn over", pos="v")[0]
        >>> path(flip, turn_over)
        0.0
        >>> path(flip, turn_over, simulate_root=True)
        0.16666666666666666

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    try:
        route = synset1.shortest_path(synset2, simulate_root=simulate_root)
    except wn.Error:
        # Unconnected synsets are infinitely far apart: 1 / (inf + 1) == 0.0
        return 0.0
    hops = len(route)
    return 1 / (hops + 1)


def wup(synset1: Synset, synset2: Synset, simulate_root: bool = False) -> float:
    """Return the Wu-Palmer similarity of *synset1* and *synset2*.

    The score is ``2k / (i + j + 2k)``, where *i* and *j* are the
    lengths of the shortest paths from each synset to their least
    common subsumer and *k* is that subsumer's maximum depth plus one.
    If there are several least common subsumers, the first one
    returned is used.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        simulate_root: When :python:`True`, a fake root node connects
            all other roots; default: :python:`False`.

    Raises:
        wn.Error: When the synsets differ in part of speech or when
            they share no common hypernym.

    Example:
        >>> import wn
        >>> from wn.similarity import wup
        >>> ewn = wn.Wordnet("ewn:2020")
        >>> spatula = ewn.synsets("spatula")[0]
        >>> wup(spatula, ewn.synsets("pancake")[0])
        0.2
        >>> wup(spatula, ewn.synsets("utensil")[0])
        0.8
        >>> wup(spatula, spatula)
        1.0
        >>> flip = ewn.synsets("flip", pos="v")[0]
        >>> turn_over = ewn.synsets("turn over", pos="v")[0]
        >>> wup(flip, turn_over, simulate_root=True)
        0.2857142857142857

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    # The helper raises if the list would be empty, so index 0 is safe.
    lcs = _least_common_subsumers(synset1, synset2, simulate_root)[0]
    i = len(synset1.shortest_path(lcs, simulate_root=simulate_root))
    j = len(synset2.shortest_path(lcs, simulate_root=simulate_root))
    # Add one to the subsumer's depth so that k is never zero.
    k = lcs.max_depth() + 1
    return (2 * k) / (i + j + 2 * k)


def lch(
    synset1: Synset, synset2: Synset, max_depth: int, simulate_root: bool = False
) -> float:
    """Return the Leacock-Chodorow similarity between *synset1* and *synset2*.

    The score is ``-log((d + 1) / (2 * max_depth))``, where *d* is the
    length of the shortest path between the two synsets.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        max_depth: The taxonomy depth (see :func:`wn.taxonomy.taxonomy_depth`);
            must be greater than 0.
        simulate_root: When :python:`True`, a fake root node connects
            all other roots; default: :python:`False`.

    Raises:
        wn.Error: When the synsets differ in part of speech, when
            *max_depth* is not greater than 0, or when the shortest
            path lookup fails.

    Example:
        >>> import wn, wn.taxonomy
        >>> from wn.similarity import lch
        >>> ewn = wn.Wordnet("ewn:2020")
        >>> n_depth = wn.taxonomy.taxonomy_depth(ewn, "n")
        >>> spatula = ewn.synsets("spatula")[0]
        >>> lch(spatula, ewn.synsets("pancake")[0], n_depth)
        0.8043728156701697
        >>> lch(spatula, ewn.synsets("utensil")[0], n_depth)
        2.0281482472922856
        >>> lch(spatula, spatula, n_depth)
        3.6375861597263857
        >>> v_depth = wn.taxonomy.taxonomy_depth(ewn, "v")
        >>> flip = ewn.synsets("flip", pos="v")[0]
        >>> turn_over = ewn.synsets("turn over", pos="v")[0]
        >>> lch(flip, turn_over, v_depth, simulate_root=True)
        1.3862943611198906

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    # Reject a bad depth before doing the path search, so invalid
    # arguments fail fast instead of after a graph traversal.
    if max_depth <= 0:
        raise wn.Error("max_depth must be greater than 0")
    distance = len(synset1.shortest_path(synset2, simulate_root=simulate_root))
    return -math.log((distance + 1) / (2 * max_depth))


def res(synset1: Synset, synset2: Synset, ic: Freq) -> float:
    """Compute the Resnik similarity between *synset1* and *synset2*.

    The score is the information content of the most informative
    least common subsumer of the two synsets.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        ic: Information Content weights.

    Raises:
        wn.Error: When the synsets differ in part of speech or when
            they share no common hypernym.

    Example:
        >>> import wn, wn.ic, wn.taxonomy
        >>> from wn.similarity import res
        >>> pwn = wn.Wordnet("pwn:3.0")
        >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn)
        >>> spatula = pwn.synsets("spatula")[0]
        >>> res(spatula, pwn.synsets("pancake")[0], ic)
        0.8017591149538994
        >>> res(spatula, pwn.synsets("utensil")[0], ic)
        5.87738923441087

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    return information_content(_most_informative_lcs(synset1, synset2, ic), ic)


def jcn(synset1: Synset, synset2: Synset, ic: Freq) -> float:
    """Return the Jiang-Conrath similarity of two synsets.

    The score is ``1 / (ic1 + ic2 - 2 * ic_lcs)``, where *ic1* and
    *ic2* are the information content of each synset and *ic_lcs* is
    that of their most informative least common subsumer.

    Two special cases avoid a division by zero:

    * when all three information content values are 0, the result
      is ``0.0``;
    * otherwise, when ``ic1 + ic2`` equals ``2 * ic_lcs``, the result
      is positive infinity.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        ic: Information Content weights.

    Raises:
        wn.Error: When the synsets differ in part of speech or when
            they share no common hypernym.

    Example:
        >>> import wn, wn.ic, wn.taxonomy
        >>> from wn.similarity import jcn
        >>> pwn = wn.Wordnet("pwn:3.0")
        >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn)
        >>> spatula = pwn.synsets("spatula")[0]
        >>> jcn(spatula, pwn.synsets("pancake")[0], ic)
        0.04061799236354239
        >>> jcn(spatula, pwn.synsets("utensil")[0], ic)
        0.10794048564613007

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    lcs = _most_informative_lcs(synset1, synset2, ic)
    ic_lcs = information_content(lcs, ic)
    if ic1 == ic2 == ic_lcs == 0:
        # Return a float (not the int 0) to honor the declared return type.
        return 0.0
    if ic1 + ic2 == 2 * ic_lcs:
        # Zero distance between the synsets: similarity is unbounded.
        return float("inf")
    return 1 / (ic1 + ic2 - 2 * ic_lcs)


def lin(synset1: Synset, synset2: Synset, ic: Freq) -> float:
    """Compute the Lin similarity of two synsets.

    The score is ``2 * ic_lcs / (ic1 + ic2)``, where *ic1* and *ic2*
    are the information content of each synset and *ic_lcs* is that of
    their most informative least common subsumer. If either synset has
    an information content of 0, the result is ``0.0``.

    Arguments:
        synset1: The first synset to compare.
        synset2: The second synset to compare.
        ic: Information Content weights.

    Raises:
        wn.Error: When the synsets differ in part of speech or when
            they share no common hypernym.

    Example:
        >>> import wn, wn.ic, wn.taxonomy
        >>> from wn.similarity import lin
        >>> pwn = wn.Wordnet("pwn:3.0")
        >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn)
        >>> spatula = pwn.synsets("spatula")[0]
        >>> lin(spatula, pwn.synsets("pancake")[0], ic)
        0.061148956278604116
        >>> lin(spatula, pwn.synsets("utensil")[0], ic)
        0.5592415686750427

    """
    _check_if_pos_compatible(synset1.pos, synset2.pos)
    # The subsumer is looked up first so that a missing common
    # hypernym is reported before any zero-content shortcut applies.
    subsumer = _most_informative_lcs(synset1, synset2, ic)
    ic_first = information_content(synset1, ic)
    ic_second = information_content(synset2, ic)
    if ic_first == 0 or ic_second == 0:
        return 0.0
    ic_subsumer = information_content(subsumer, ic)
    return 2 * ic_subsumer / (ic_first + ic_second)


# Helper functions


def _least_common_subsumers(
    synset1: Synset, synset2: Synset, simulate_root: bool
) -> list[Synset]:
    """Return the lowest common hypernyms of *synset1* and *synset2*.

    Raises:
        wn.Error: When the two synsets have no common hypernyms.
    """
    found = synset1.lowest_common_hypernyms(synset2, simulate_root=simulate_root)
    if found:
        return found
    raise wn.Error(f"no common hypernyms for {synset1!r} and {synset2!r}")


def _most_informative_lcs(synset1: Synset, synset2: Synset, ic: Freq) -> Synset:
    """Return the least common subsumer with the largest weight in *ic*.

    Weights are read from the *ic* table for the part of speech of
    *synset1*, keyed by synset id. No simulated root is used when
    finding the subsumers.
    """
    weights = ic[synset1.pos]
    candidates = _least_common_subsumers(synset1, synset2, False)

    def _weight(ss: Synset) -> float:
        return weights[ss.id]

    return max(candidates, key=_weight)


def _check_if_pos_compatible(pos1: str, pos2: str) -> None:
    """Raise :class:`wn.Error` unless *pos1* and *pos2* are compatible.

    ``ADJ_SAT`` is treated as ``ADJ`` before the two values are compared.
    """
    normalized = [ADJ if pos == ADJ_SAT else pos for pos in (pos1, pos2)]
    if normalized[0] != normalized[1]:
        raise wn.Error("synsets must have the same part of speech")