1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
|
"""Wordnet lexicon validation.
This module is for checking whether the contents of a lexicon are
valid according to a series of checks. Those checks are:
==== ==========================================================
Code Message
==== ==========================================================
E101 ID is not unique within the lexicon.
W201 Lexical entry has no senses.
W202 Redundant sense between lexical entry and synset.
W203 Redundant lexical entry with the same lemma and synset.
E204 Synset of sense is missing.
W301 Synset is empty (not associated with any lexical entries).
W302 ILI is repeated across synsets.
W303 Proposed ILI is missing a definition.
W304 Existing ILI has a spurious definition.
W305 Synset has a blank definition.
W306 Synset has a blank example.
W307 Synset repeats an existing definition.
E401 Relation target is missing or invalid.
W402 Relation type is invalid for the source and target.
W403 Redundant relation between source and target.
W404 Reverse relation is missing.
W501 Synset's part-of-speech is different from its hypernym's.
W502 Relation is a self-loop.
==== ==========================================================
"""
from collections import Counter
from collections.abc import (
Callable,
Iterator,
Sequence,
)
from itertools import chain
from typing import TypedDict, cast
from wn import lmf
from wn.constants import (
REVERSE_RELATIONS,
SENSE_RELATIONS,
SENSE_SYNSET_RELATIONS,
SYNSET_RELATIONS,
)
from wn.util import ProgressBar, ProgressHandler
# Counters of the IDs found in a lexicon; validate() fills the keys
# "entry", "sense", and "synset".
_Ids = dict[str, Counter]
# Outcome of one check: offending item (usually an ID) -> detail dict.
_Result = dict[str, dict]
# Signature shared by every check function.
_CheckFunction = Callable[[lmf.Lexicon, _Ids], _Result]
class _Check(TypedDict):
    """Report entry for a single check."""

    # description of the check (validate() fills this from the check
    # function's docstring)
    message: str
    # offending items returned by the check function
    items: _Result
# Full validation report: check code (e.g., "E101") -> _Check.
_Report = dict[str, _Check]
def _non_unique_id(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """ID is not unique within the lexicon"""
    # NOTE: the docstring above is read through ``__doc__`` and used as the
    # message of this check in the report.
    #
    # All IDs are pooled into one namespace: the lexicon's own ID, form IDs,
    # IDs of the "frames" items, and the pre-counted entry, sense, and
    # synset IDs. Counter.elements() repeats each ID once per occurrence so
    # duplicates are still visible when everything is re-counted.
    own_id = [lex["id"]]
    form_ids = (
        form["id"]
        for entry in _entries(lex)
        for form in _forms(entry)
        if form.get("id")
    )
    frame_ids = (
        frame["id"] for frame in lex.get("frames", []) if frame.get("id")
    )
    counted_ids = chain(
        ids["entry"].elements(),
        ids["sense"].elements(),
        ids["synset"].elements(),
    )
    return _multiples(chain(own_id, form_ids, frame_ids, counted_ids))
def _has_no_senses(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """lexical entry has no senses"""
    # NOTE: the docstring above is used as this check's report message.
    # *ids* is unused here; it is accepted so that every check function
    # has the same signature.
    senseless: _Result = {}
    for entry in _entries(lex):
        if _senses(entry):
            continue
        senseless[entry["id"]] = {}
    return senseless
def _redundant_sense(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """redundant sense between lexical entry and synset"""
    # NOTE: the docstring above is used as this check's report message.
    flagged: _Result = {}
    for entry in _entries(lex):
        senses = _senses(entry)
        # synsets referenced by more than one sense of this entry
        repeated = _multiples(sense["synset"] for sense in senses)
        for sense in senses:
            if sense["synset"] in repeated:
                flagged[sense["id"]] = {
                    "entry": entry["id"],
                    "synset": sense["synset"],
                }
    return flagged
def _redundant_entry(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """redundant lexical entry with the same lemma and synset"""
    # NOTE: the docstring above is used as this check's report message.
    lemma_synset_pairs = [
        (entry["lemma"]["writtenForm"], sense["synset"])
        for entry in _entries(lex)
        for sense in _senses(entry)
    ]
    # The result is keyed by written form (not by an ID), so when one form
    # repeats for several synsets only the last such synset is kept.
    result: _Result = {}
    for written_form, synset_id in _multiples(lemma_synset_pairs):
        result[written_form] = {"synset": synset_id}
    return result
def _missing_synset(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset of sense is missing"""
    # NOTE: the docstring above is used as this check's report message.
    known_synsets = ids["synset"]
    missing: _Result = {}
    for entry in _entries(lex):
        for sense in _senses(entry):
            if sense["synset"] in known_synsets:
                continue
            missing[sense["id"]] = {"synset": sense["synset"]}
    return missing
def _empty_synset(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset is empty (not associated with any lexical entries)"""
    # NOTE: the docstring above is used as this check's report message.
    # Collect every synset ID that at least one sense points to.
    referenced = set()
    for entry in _entries(lex):
        for sense in _senses(entry):
            referenced.add(sense["synset"])
    empty: _Result = {}
    for synset in _synsets(lex):
        if synset["id"] not in referenced:
            empty[synset["id"]] = {}
    return empty
def _repeated_ili(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """ILI is repeated across synsets"""
    # NOTE: the docstring above is used as this check's report message.
    candidate_ilis = []
    for synset in _synsets(lex):
        ili = synset["ili"]
        # empty ILIs and the "in" marker are never counted as repeats
        if not ili or ili == "in":
            continue
        candidate_ilis.append(ili)
    repeated = _multiples(candidate_ilis)

    result: _Result = {}
    for synset in _synsets(lex):
        if synset["ili"] in repeated:
            result[synset["id"]] = {"ili": synset["ili"]}
    return result
def _missing_ili_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """proposed ILI is missing a definition"""
    # NOTE: the docstring above is used as this check's report message.
    undefined: _Result = {}
    for synset in _synsets(lex):
        # only synsets whose ILI is the "in" marker are considered
        if synset["ili"] != "in":
            continue
        if synset.get("ili_definition"):
            continue
        undefined[synset["id"]] = {}
    return undefined
def _spurious_ili_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """existing ILI has a spurious definition"""
    # NOTE: the docstring above is used as this check's report message.
    #
    # Flags synsets that carry an ILI definition even though their ILI is
    # set to something other than the "in" marker. The detail dict reports
    # the definition under the key "ili_definition" (previously misspelled
    # as "ili_definitin").
    spurious: _Result = {}
    for synset in _synsets(lex):
        ili = synset["ili"]
        definition = synset.get("ili_definition")
        if ili and ili != "in" and definition:
            spurious[synset["id"]] = {"ili_definition": definition}
    return spurious
def _blank_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset has a blank definition"""
    # NOTE: the docstring above is used as this check's report message.
    blank: _Result = {}
    for synset in _synsets(lex):
        for definition in synset.get("definitions", []):
            # blank means empty or whitespace-only text
            if not definition["text"].strip():
                blank[synset["id"]] = {}
                break  # one blank definition is enough to flag the synset
    return blank
def _blank_synset_example(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset has a blank example"""
    # NOTE: the docstring above is used as this check's report message.
    blank: _Result = {}
    for synset in _synsets(lex):
        for example in synset.get("examples", []):
            # blank means empty or whitespace-only text
            if not example["text"].strip():
                blank[synset["id"]] = {}
                break  # one blank example is enough to flag the synset
    return blank
def _repeated_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset repeats an existing definition"""
    # NOTE: the docstring above is used as this check's report message.
    all_texts = [
        definition["text"]
        for synset in _synsets(lex)
        for definition in synset.get("definitions", [])
    ]
    # definition texts occurring more than once anywhere in the lexicon
    repeated = _multiples(all_texts)

    result: _Result = {}
    for synset in _synsets(lex):
        for definition in synset.get("definitions", []):
            if definition["text"] in repeated:
                result[synset["id"]] = {}
                break
    return result
def _missing_relation_target(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """relation target is missing or invalid"""
    # NOTE: the docstring above is used as this check's report message.
    missing: _Result = {}

    # A sense relation may point at either a sense or a synset.
    for sense, rel in _sense_relations(lex):
        target = rel["target"]
        if target in ids["sense"] or target in ids["synset"]:
            continue
        missing[sense["id"]] = {"type": rel["relType"], "target": target}

    # A synset relation may only point at a synset.
    for synset, rel in _synset_relations(lex):
        target = rel["target"]
        if target in ids["synset"]:
            continue
        missing[synset["id"]] = {"type": rel["relType"], "target": target}

    return missing
def _invalid_relation_type(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """relation type is invalid for the source and target"""
    # NOTE: the docstring above is used as this check's report message.
    invalid: _Result = {}

    for sense, rel in _sense_relations(lex):
        target = rel["target"]
        if target in ids["sense"] and rel["relType"] not in SENSE_RELATIONS:
            pass  # sense -> sense with a type not allowed between senses
        elif (
            target in ids["synset"]
            and rel["relType"] not in SENSE_SYNSET_RELATIONS
        ):
            pass  # sense -> synset with a type not allowed for that pair
        else:
            continue
        invalid[sense["id"]] = {"type": rel["relType"], "target": target}

    for synset, rel in _synset_relations(lex):
        if rel["relType"] in SYNSET_RELATIONS:
            continue
        invalid[synset["id"]] = {
            "type": rel["relType"],
            "target": rel["target"],
        }

    return invalid
def _redundant_relation(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """redundant relation between source and target"""
    # NOTE: the docstring above is used as this check's report message.
    every_relation = chain(_sense_relations(lex), _synset_relations(lex))
    # Two relations are the same when source, type, target, and the
    # metadata "type" value all match.
    repeated = _multiples(
        (source["id"], rel["relType"], rel["target"], _get_dc_type(rel))
        for source, rel in every_relation
    )

    result: _Result = {}
    for source_id, rel_type, target, dc_type in repeated:
        details = {"type": rel_type, "target": target}
        if dc_type:
            details["dc:type"] = dc_type
        result[source_id] = details
    return result
def _missing_reverse_relation(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """reverse relation is missing"""
    # NOTE: the docstring above is used as this check's report message.
    # Gather (source, type, target) triples; sense relations are included
    # only when they point at another sense.
    known = set()
    for sense, rel in _sense_relations(lex):
        if rel["target"] in ids["sense"]:
            known.add((sense["id"], rel["relType"], rel["target"]))
    for synset, rel in _synset_relations(lex):
        known.add((synset["id"], rel["relType"], rel["target"]))

    # The item reported is the *target*, i.e., where the reverse relation
    # would have to be added.
    missing: _Result = {}
    for source, rel_type, target in known:
        if rel_type not in REVERSE_RELATIONS:
            continue
        if (target, REVERSE_RELATIONS[rel_type], source) in known:
            continue
        missing[target] = {"type": REVERSE_RELATIONS[rel_type], "target": source}
    return missing
def _hypernym_wrong_pos(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset's part-of-speech is different from its hypernym's"""
    # NOTE: the docstring above is used as this check's report message.
    pos_by_synset = {ss["id"]: ss.get("partOfSpeech") for ss in _synsets(lex)}

    result: _Result = {}
    for synset, rel in _synset_relations(lex):
        if rel["relType"] != "hypernym":
            continue
        target = rel["target"]
        # A hypernym that is not a synset of this lexicon has no known
        # part-of-speech to compare against. Skip it rather than raising
        # KeyError and aborting the validation; the dangling target itself
        # is reported by the missing-relation-target check.
        if target not in pos_by_synset:
            continue
        if synset.get("partOfSpeech") != pos_by_synset[target]:
            result[synset["id"]] = {"type": rel["relType"], "target": target}
    return result
def _self_loop(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """relation is a self-loop"""
    # NOTE: the docstring above is used as this check's report message.
    loops: _Result = {}
    for relations in (_sense_relations(lex), _synset_relations(lex)):
        for source, rel in relations:
            # a self-loop is a relation whose target is its own source
            if source["id"] == rel["target"]:
                loops[source["id"]] = {
                    "type": rel["relType"],
                    "target": rel["target"],
                }
    return loops
# Helpers
def _multiples(iterable):
    """Map each item seen more than once in *iterable* to ``{"count": n}``.

    Items must be hashable. Items occurring only once are left out, and
    the result keeps the order in which items were first seen.
    """
    repeated = {}
    for item, occurrences in Counter(iterable).items():
        if occurrences > 1:
            repeated[item] = {"count": occurrences}
    return repeated
def _entries(lex: lmf.Lexicon) -> list[lmf.LexicalEntry]:
    """Return the lexical entries of *lex*, or an empty list if none are set."""
    if "entries" in lex:
        return lex["entries"]
    return []
def _forms(e: lmf.LexicalEntry) -> list[lmf.Form]:
    """Return the forms of entry *e*, or an empty list if none are set."""
    if "forms" in e:
        return e["forms"]
    return []
def _senses(e: lmf.LexicalEntry) -> list[lmf.Sense]:
    """Return the senses of entry *e*, or an empty list if none are set."""
    if "senses" in e:
        return e["senses"]
    return []
def _synsets(lex: lmf.Lexicon) -> list[lmf.Synset]:
    """Return the synsets of *lex*, or an empty list if none are set."""
    if "synsets" in lex:
        return lex["synsets"]
    return []
def _sense_relations(lex: lmf.Lexicon) -> Iterator[tuple[lmf.Sense, lmf.Relation]]:
    """Yield ``(sense, relation)`` for each relation of each sense in *lex*."""
    pairs = (
        (sense, relation)
        for entry in _entries(lex)
        for sense in _senses(entry)
        for relation in sense.get("relations", [])
    )
    # ``yield from`` keeps this a generator function, so nothing is read
    # from *lex* until iteration starts.
    yield from pairs
def _synset_relations(lex: lmf.Lexicon) -> Iterator[tuple[lmf.Synset, lmf.Relation]]:
    """Yield ``(synset, relation)`` for each relation of each synset in *lex*."""
    pairs = (
        (synset, relation)
        for synset in _synsets(lex)
        for relation in synset.get("relations", [])
    )
    # ``yield from`` keeps this a generator function, so nothing is read
    # from *lex* until iteration starts.
    yield from pairs
def _get_dc_type(r: lmf.Relation) -> str | None:
    """Return the ``"type"`` value of relation *r*'s metadata, if any.

    ``None`` is returned when *r* has no ``"meta"`` key, when its value is
    empty or ``None``, or when the metadata has no ``"type"`` key.
    """
    meta = r.get("meta")
    if not meta:
        return None
    return meta.get("type")
# Check codes and messages
#
# Each code maps to the function implementing that check. The function's
# docstring serves as the check's message in the report, and checks are
# selected in the order they are listed here.
#
# categories:
#   E - errors
#   W - warnings
# subcategories:
#   100 - general
#   200 - words and senses
#   300 - synsets and ilis
#   400 - relations
#   500 - graph and taxonomy
_codes: dict[str, _CheckFunction] = {
    # 100 - general
    "E101": _non_unique_id,
    # 200 - words and senses
    "W201": _has_no_senses,
    "W202": _redundant_sense,
    "W203": _redundant_entry,
    "E204": _missing_synset,
    # 300 - synsets and ilis
    "W301": _empty_synset,
    "W302": _repeated_ili,
    "W303": _missing_ili_definition,
    "W304": _spurious_ili_definition,
    "W305": _blank_synset_definition,
    "W306": _blank_synset_example,
    "W307": _repeated_synset_definition,
    # 400 - relations
    "E401": _missing_relation_target,
    "W402": _invalid_relation_type,
    "W403": _redundant_relation,
    "W404": _missing_reverse_relation,
    # 500 - graph and taxonomy
    "W501": _hypernym_wrong_pos,
    "W502": _self_loop,
}
def _select_checks(select: Sequence[str]) -> list[tuple[str, _CheckFunction, str]]:
    """Return ``(code, function, message)`` for each check matching *select*.

    A check matches when its full code (e.g., ``E101``) or its first
    character (e.g., ``E``) is in *select*. The message is the check
    function's docstring, or an empty string if it has none. Checks are
    returned in the order of the code table, not the order of *select*.
    """
    wanted = set(select)
    selected: list[tuple[str, _CheckFunction, str]] = []
    for code, func in _codes.items():
        if code in wanted or code[0] in wanted:
            selected.append((code, func, func.__doc__ or ""))
    return selected
# Main function
def validate(
    lex: lmf.Lexicon | lmf.LexiconExtension,
    select: Sequence[str] = ("E", "W"),
    progress_handler: type[ProgressHandler] | None = ProgressBar,
) -> _Report:
    """Check *lex* for validity and return a report of the results.

    The *select* argument is a sequence of check codes (e.g.,
    ``E101``) or categories (``E`` or ``W``).

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`. An instance of the class will be
    created, used, and closed by this function. It is closed even if
    one of the checks raises an exception.

    The report maps each selected check code to a dictionary with the
    check's ``message`` and the offending ``items`` it found. Lexicon
    extensions are not validated: a notice is printed and an empty
    report is returned.
    """
    if lex.get("extends"):
        print("validation of lexicon extensions is not supported")
        return {}
    lex = cast("lmf.Lexicon", lex)

    if progress_handler is None:
        progress_handler = ProgressHandler

    # Count the IDs once up front; the checks share these counters.
    ids: _Ids = {
        "entry": Counter(entry["id"] for entry in _entries(lex)),
        "sense": Counter(
            sense["id"] for entry in _entries(lex) for sense in _senses(entry)
        ),
        "synset": Counter(synset["id"] for synset in _synsets(lex)),
    }

    checks = _select_checks(select)
    progress = progress_handler(message="Validate", total=len(checks))
    report: _Report = {}
    try:
        for code, func, message in checks:
            progress.set(
                status=getattr(func, "__name__", "(unknown test)").replace("_", " ")
            )
            report[code] = _Check(message=message, items=func(lex, ids))
            progress.update()
        progress.set(status="")
    finally:
        # Close the handler even when a check fails on malformed data.
        progress.close()
    return report
|