1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
|
"""Interlingual Indices
This module provides classes and functions for inspecting Interlingual
Index (ILI) objects, both existing and proposed, including their
definitions and any metadata, for synsets and lexicons.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from itertools import zip_longest
from pathlib import Path
from typing import TYPE_CHECKING, Literal, Protocol, overload
from wn._lexicon import Lexicon, LexiconElementWithMetadata
from wn._metadata import HasMetadata
from wn._queries import (
find_ilis,
find_proposed_ilis,
get_ili,
)
from wn._wordnet import Wordnet
if TYPE_CHECKING:
from collections.abc import Iterator
from wn._core import Synset
from wn._metadata import Metadata
from wn._types import AnyPath
class ILIStatus(str, Enum):
    """Enumeration of the statuses an ILI may have.

    Members are also :class:`str` instances, so each one compares equal
    to its lowercase string value.
    """

    __module__ = "wn"

    # no information available
    UNKNOWN = "unknown"
    # attested in ILI file and marked as active
    ACTIVE = "active"
    # used by lexicon, ILI file not loaded
    PRESUPPOSED = "presupposed"
    # proposed by lexicon for addition to ILI
    PROPOSED = "proposed"
@dataclass(slots=True)
class ILIDefinition(HasMetadata):
    """The text of an ILI definition together with its metadata.

    Only :attr:`text` takes part in comparisons and in the repr; the
    metadata and the lexicon specifier are excluded from both.
    """

    __module__ = "wn"

    text: str
    _metadata: Metadata | None = field(default=None, repr=False, compare=False)
    _lexicon: str | None = field(default=None, repr=False, compare=False)

    def metadata(self) -> Metadata:
        """Return the definition's metadata, or an empty dict if unset."""
        if self._metadata is None:
            return {}
        return self._metadata

    def confidence(self) -> float:
        """Return the confidence score of the definition.

        An explicit ``confidenceScore`` metadata value takes precedence.
        Without one, a definition that carries a lexicon specifier uses
        that lexicon's confidence, and any other definition gets 1.0.
        """
        score = self.metadata().get("confidenceScore")
        if score is not None:
            return float(score)
        if not self._lexicon:
            # Regular ILIs are not lexicon elements
            return 1.0
        # ProposedILIs are lexicon elements and inherit their
        # lexicon's confidence value
        return float(Lexicon.from_specifier(self._lexicon).confidence())
class ILIProtocol(Protocol):
    """Structural interface for ILI-like objects.

    Implementations provide the ``_definition_text`` and
    ``_definition_metadata`` attributes along with the :attr:`id` and
    :attr:`status` properties; :meth:`definition` is implemented here
    in terms of those attributes.
    """

    _definition_text: str | None
    _definition_metadata: Metadata | None

    @property
    def id(self) -> str | None:
        """The ILI identifier."""
        ...

    @property
    def status(self) -> ILIStatus:
        """The status of the ILI."""
        ...

    @overload
    def definition(self, *, data: Literal[False] = False) -> str | None: ...

    @overload
    def definition(self, *, data: Literal[True] = True) -> ILIDefinition | None: ...

    # fallback for non-literal bool argument
    @overload
    def definition(self, *, data: bool) -> str | ILIDefinition | None: ...

    def definition(self, *, data: bool = False) -> str | ILIDefinition | None:
        """Return the ILI's definition.

        If the *data* argument is :python:`False` (the default), the
        definition is returned as a :class:`str` type. If it is
        :python:`True`, a :class:`wn.ILIDefinition` object is used
        instead, or :python:`None` when there is no definition text
        (missing or empty).

        Note that :class:`ILI` objects will not have definitions unless
        an ILI resource has been added, but :class:`ProposedILI` objects
        will have definitions if one is provided by the proposing lexicon.
        """
        text = self._definition_text
        if not data:
            return text
        if not text:
            # Never leak a bare (empty) string when an ILIDefinition
            # object was requested; the declared result is then None.
            return None
        return ILIDefinition(
            text,
            _metadata=self._definition_metadata,
            # lexicon is defined only for proposed ILIs
            _lexicon=getattr(self, "_lexicon", None),
        )
@dataclass(frozen=True, slots=True)
class ILI(ILIProtocol):
    """An interlingual index (ILI) entry.

    Only :attr:`id` takes part in the repr, equality, and hashing; the
    status and the definition fields are excluded from all three.
    """

    __module__ = "wn"

    id: str
    status: ILIStatus = field(
        default=ILIStatus.UNKNOWN,
        compare=False,
        hash=False,
        repr=False,
    )
    _definition_text: str | None = field(
        default=None,
        compare=False,
        hash=False,
        repr=False,
    )
    _definition_metadata: Metadata | None = field(
        default=None,
        compare=False,
        hash=False,
        repr=False,
    )
@dataclass(frozen=True, slots=True)
class ProposedILI(LexiconElementWithMetadata, ILIProtocol):
    """An ILI proposed by a lexicon for one of its synsets.

    The synset identifier and lexicon specifier identify the proposal;
    the definition fields are excluded from the repr, equality, and
    hashing.
    """

    __module__ = "wn"

    _synset: str
    _lexicon: str
    _definition_text: str | None = field(
        default=None,
        compare=False,
        hash=False,
        repr=False,
    )
    _definition_metadata: Metadata | None = field(
        default=None,
        compare=False,
        hash=False,
        repr=False,
    )

    @property
    def id(self) -> Literal[None]:
        """Always return :python:`None`.

        A proposed ILI has no identifier; the property exists only so
        the interface matches that of other ILI objects.
        """
        return None

    @property
    def status(self) -> Literal[ILIStatus.PROPOSED]:
        """Always return :attr:`ILIStatus.PROPOSED`.

        Objects of this class only ever represent proposed ILIs.
        """
        return ILIStatus.PROPOSED

    def synset(self) -> Synset:
        """Return the synset object associated with the proposed ILI."""
        wordnet = Wordnet(self._lexicon)
        return wordnet.synset(self._synset)
def get(id: str) -> ILI | None:
    """Look up a single ILI object by its identifier.

    The *id* argument is a string ILI identifier. :python:`None` is
    returned when *id* does not match a known ILI. A :python:`None`
    result does not necessarily mean the ILI does not exist, only that
    no resource declaring it has been loaded into Wn's database.

    Example:

        >>> from wn import ili
        >>> ili.get("i12345")
        ILI(id='i12345')
        >>> ili.get("i0") is None
        True

    """
    row = get_ili(id=id)
    if not row:
        return None
    ili_id, raw_status, text, metadata = row
    return ILI(
        ili_id,
        status=ILIStatus(raw_status),
        _definition_text=text,
        _definition_metadata=metadata,
    )
def get_all(
    *,
    status: ILIStatus | str | None = None,
    lexicon: str | None = None,
) -> list[ILI]:
    """Return every ILI object matching the given filters.

    The *status* argument may be an :class:`ILIStatus` or a string
    naming one; strings are converted to :class:`ILIStatus` before the
    query is made. The *lexicon* argument is a space-separated string
    of lexicon specifiers. The ILIs found for that status and those
    lexicons are returned.

    Example:

        >>> from wn import ili
        >>> len(ili.get_all())
        117442

    """
    if isinstance(status, str):
        status = ILIStatus(status)
    lexicons = lexicon.split() if lexicon else []

    ilis: list[ILI] = []
    for ili_id, raw_status, text, metadata in find_ilis(
        status=status, lexicons=lexicons
    ):
        ilis.append(
            ILI(
                ili_id,
                status=ILIStatus(raw_status),
                _definition_text=text,
                _definition_metadata=metadata,
            )
        )
    return ilis
def get_proposed(synset: Synset) -> ProposedILI | None:
    """Return the proposed ILI for *synset*, or :python:`None`.

    A synset does not itself indicate whether it has a proposed ILI:
    :attr:`wn.Synset.ili` is :python:`None` both when an ILI has been
    proposed and when there is no ILI at all. It is usually easier to
    list a lexicon's proposed ILIs with :func:`get_all_proposed` and
    then retrieve their associated synsets.

    Example:

        >>> import wn
        >>> from wn import ili
        >>> en = wn.Wordnet("oewn:2024")
        >>> en.synset("oewn-00002935-r").ili is None
        True
        >>> ili.get_proposed(en.synset("oewn-00002935-r"))
        ProposedILI(_synset='oewn-00002935-r', _lexicon='oewn:2024')

    """
    synset_id = synset.id
    specifier = synset.lexicon().specifier()
    matches = find_proposed_ilis(synset_id=synset_id, lexicons=(specifier,))
    # Only the first match, if any, is used.
    first = next(matches, None)
    return ProposedILI(*first) if first else None
def get_all_proposed(lexicon: str | None = None) -> list[ProposedILI]:
    """Return all proposed ILI objects, optionally limited by lexicon.

    The *lexicon* argument is a space-separated string of lexicon
    specifiers; the proposed ILIs found for those lexicons are
    returned.

    Example:

        >>> from wn import ili
        >>> proposed = ili.get_all_proposed("oewn:2024")
        >>> proposed[0]
        ProposedILI(_synset='oewn-00002935-r', _lexicon='oewn:2024')
        >>> proposed[0].synset()
        Synset('oewn-00002935-r')

    """
    if lexicon:
        lexicons = lexicon.split()
    else:
        lexicons = []

    proposed: list[ProposedILI] = []
    for row in find_proposed_ilis(lexicons=lexicons):
        proposed.append(ProposedILI(*row))
    return proposed
def is_ili_tsv(source: AnyPath) -> bool:
    """Return True if *source* is an ILI tab-separated-value file.

    This only checks that the first column, split by tabs, of the
    first line is 'ili' or 'ILI'. It does not check if each line has
    the correct number of columns. The line terminator is ignored, so
    a header consisting of that single column is recognized as well.

    False is returned when *source* is not an existing regular file or
    when the file is empty.
    """
    path = Path(source).expanduser()
    if not path.is_file():
        return False
    # Read as bytes so that no text decoding is needed for the check.
    with path.open("rb") as fh:
        first_line = fh.readline()  # b"" for an empty file
    # Strip the newline first; otherwise a one-column header such as
    # b"ili\n" would never compare equal to b"ili".
    first_column = first_line.rstrip(b"\r\n").partition(b"\t")[0]
    return first_column in (b"ili", b"ILI")
def load_tsv(source: AnyPath) -> Iterator[dict[str, str]]:
    """Yield data from an ILI tab-separated-value file.

    This function yields dictionaries mapping field names to values.
    The *source* argument is a path to an ILI file. Field names come
    from the first line of the file and are lowercased. A row with
    fewer values than there are fields is padded with empty strings;
    surplus values in a longer row are keyed by the empty string.
    Nothing is yielded for an empty file.

    Example:

        >>> from wn import ili
        >>> obj = next(ili.load_tsv("cili.tsv"))
        >>> obj.keys()
        dict_keys(['ili', 'definition'])
        >>> obj["ili"]
        'i1'

    """
    source = Path(source).expanduser()
    with source.open(encoding="utf-8") as fh:
        header = next(fh, None)
        if header is None:
            # A bare next(fh) would raise StopIteration here, which a
            # generator turns into RuntimeError; an empty file simply
            # has no rows to yield.
            return
        fields = tuple(
            name.lower() for name in header.rstrip("\r\n").split("\t")
        )
        for line in fh:
            values = line.rstrip("\r\n").split("\t")
            yield dict(zip_longest(fields, values, fillvalue=""))
|