1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
"""Information Content is a corpus-based metrics of synset or sense
specificity.
"""
from collections import Counter
from collections.abc import Callable, Iterable, Iterator
from math import log
from pathlib import Path
from typing import TextIO, TypeAlias
from wn import Synset, Wordnet
from wn._types import AnyPath
from wn.constants import ADJ, ADJ_SAT, ADV, NOUN, VERB
from wn.util import synset_id_formatter
# Just use a subset of all available parts of speech
IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV))
Freq: TypeAlias = dict[str, dict[str | None, float]]
def information_content(synset: Synset, freq: Freq) -> float:
    """Return the Information Content (IC) value of *synset*.

    The information content of a synset is the negative log of its
    probability under the weights in *freq* (see
    :func:`synset_probability`).
    """
    probability = synset_probability(synset, freq)
    return -log(probability)
def synset_probability(synset: Synset, freq: Freq) -> float:
    """Return the probability of *synset* given the weights in *freq*.

    The synset probability is defined as freq(ss)/N, where freq(ss) is
    the IC weight for the synset and N is the total IC weight (stored
    under the ``None`` key) for all synsets with the same part of
    speech.

    Note: this function is not generally used directly, but indirectly
    through :func:`information_content`.
    """
    weights = freq[synset.pos]
    total = weights[None]
    return weights[synset.id] / total
def _initialize(
    wordnet: Wordnet,
    smoothing: float,
) -> Freq:
    """Build a fresh Information Content weight mapping.

    Every synset in *wordnet* (for each part of speech in
    ``IC_PARTS_OF_SPEECH``) is mapped to the *smoothing* value, as is
    the per-POS total stored under the ``None`` key.
    """
    freq: Freq = {}
    for pos in IC_PARTS_OF_SPEECH:
        freq[pos] = {ss.id: smoothing for ss in wordnet.synsets(pos=pos)}
    # satellite adjectives are folded into the plain ADJ table
    for ss in wordnet.synsets(pos=ADJ_SAT):
        freq[ADJ][ss.id] = smoothing
    # the per-POS totals live under the None key
    for pos in IC_PARTS_OF_SPEECH:
        freq[pos][None] = smoothing
    return freq
def compute(
    corpus: Iterable[str],
    wordnet: Wordnet,
    distribute_weight: bool = True,
    smoothing: float = 1.0,
) -> Freq:
    """Compute Information Content weights from a corpus.

    Arguments:
        corpus: An iterable of string tokens. This is a flat list of
            words and the order does not matter. Tokens may be single
            words or multiple words separated by a space.
        wordnet: An instantiated :class:`wn.Wordnet` object, used to
            look up synsets from words.
        distribute_weight: If :python:`True`, the counts for a word
            are divided evenly among all synsets for the word.
        smoothing: The initial value given to each synset.

    Example:
        >>> import wn, wn.ic, wn.morphy
        >>> ewn = wn.Wordnet("ewn:2020", lemmatizer=wn.morphy.morphy)
        >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn)
        >>> dog = ewn.synsets("dog", pos="n")[0]
        >>> cat = ewn.synsets("cat", pos="n")[0]
        >>> frog = ewn.synsets("frog", pos="n")[0]
        >>> freq["n"][dog.id]
        1.125
        >>> freq["n"][cat.id]
        1.1
        >>> freq["n"][frog.id] # no occurrence; smoothing value only
        1.0
        >>> carnivore = dog.lowest_common_hypernyms(cat)[0]
        >>> freq["n"][carnivore.id]
        1.3250000000000002
    """
    # every synset (and each per-POS total) starts at the smoothing value
    freq = _initialize(wordnet, smoothing)
    # counting distinct tokens first means each word's synsets are
    # looked up and traversed once, no matter how often the word occurs
    counts = Counter(corpus)
    hypernym_cache: dict[Synset, list[Synset]] = {}
    for word, count in counts.items():
        synsets = wordnet.synsets(word)
        num = len(synsets)
        if num == 0:
            # token not in the wordnet; it contributes no weight
            continue
        weight = float(count / num if distribute_weight else count)
        for synset in synsets:
            pos = synset.pos
            # pretend ADJ_SAT is just ADJ
            if pos == ADJ_SAT:
                pos = ADJ
            if pos not in IC_PARTS_OF_SPEECH:
                continue
            # each observed synset adds to the per-POS total (None key)
            freq[pos][None] += weight
            # The following while-loop is equivalent to:
            #
            #     freq[pos][synset.id] += weight
            #     for path in synset.hypernym_paths():
            #         for ss in path:
            #             freq[pos][ss.id] += weight
            #
            # ...but it caches hypernym lookups for speed
            agenda: list[tuple[Synset, set[Synset]]] = [(synset, set())]
            while agenda:
                ss, seen = agenda.pop()
                # avoid cycles
                if ss in seen:
                    continue
                freq[pos][ss.id] += weight
                if ss not in hypernym_cache:
                    hypernym_cache[ss] = ss.hypernyms()
                # each hypernym is explored with its own copy of the
                # path's seen-set, so an ancestor reachable via several
                # distinct paths is weighted once per path — matching
                # the hypernym_paths() formulation above
                agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss])
    return freq
def load(
    source: AnyPath,
    wordnet: Wordnet,
    get_synset_id: Callable | None = None,
) -> Freq:
    """Load an Information Content mapping from a file.

    Arguments:
        source: A path to an information content weights file.
        wordnet: A :class:`wn.Wordnet` instance with synset
            identifiers matching the offsets in the weights file.
        get_synset_id: A callable that takes a synset offset and part
            of speech and returns a synset ID valid in *wordnet*.

    Raises:
        :class:`wn.Error`: If *wordnet* does not have exactly one
            lexicon.

    Example:
        >>> import wn, wn.ic
        >>> pwn = wn.Wordnet("pwn:3.0")
        >>> path = "~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat"
        >>> freq = wn.ic.load(path, pwn)
    """
    source = Path(source).expanduser().resolve(strict=True)
    lexicons = wordnet.lexicons()
    # The docstring promises wn.Error here; a bare assert would raise
    # AssertionError instead and is stripped entirely under `python -O`,
    # so validate explicitly.
    if len(lexicons) != 1:
        import wn  # local import avoids a circular import at module load
        raise wn.Error(
            f'expected a wordnet with exactly one lexicon; got {len(lexicons)}'
        )
    lexid = lexicons[0].id
    if get_synset_id is None:
        get_synset_id = synset_id_formatter(prefix=lexid)
    # weights start at 0.0 so file values can be assigned directly
    freq = _initialize(wordnet, 0.0)
    with source.open() as icfile:
        for offset, pos, weight, is_root in _parse_ic_file(icfile):
            ssid = get_synset_id(offset=offset, pos=pos)
            freq[pos][ssid] = weight
            # only root synsets contribute to the per-POS total (None key)
            if is_root:
                freq[pos][None] += weight
    return freq
def _parse_ic_file(icfile: TextIO) -> Iterator[tuple[int, str, float, bool]]:
    """Parse the Information Content file.

    Yields an ``(offset, pos, weight, is_root)`` tuple for each data
    line. A sample of the format is::

        wnver::eOS9lXC6GvMWznF1wkZofDdtbBU
        1740n 1915712 ROOT
        1930n 859272
        2137n 1055337
    """
    # Skip the "wnver::..." header. The default argument guards against
    # an empty file: a bare next() would raise StopIteration inside this
    # generator, which PEP 479 converts into a RuntimeError.
    next(icfile, None)
    for line in icfile:
        ssinfo, value, *isroot = line.split()
        # e.g. "1740n" -> offset 1740, pos "n"; a trailing "ROOT" token
        # marks root synsets (isroot is then a non-empty list)
        yield (int(ssinfo[:-1]), ssinfo[-1], float(value), bool(isroot))
|