File: context.py

package info (click to toggle)
python-precis-i18n 1.1.1-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,836 kB
  • sloc: python: 1,825; sh: 28; makefile: 3
file content (239 lines) | stat: -rw-r--r-- 7,241 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
"""Implements PRECIS rules for derived properties CONTEXTJ and CONTEXTO."""


def context_rule_error(value, offset, ucd):
    """Check the PRECIS context rule that governs `value[offset]`.

    The rule is selected from the code point at `offset`: the two digit
    rules are chosen through the `ucd` predicates, every other code point
    is looked up in the `_RULES` table.

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        str: '' when the rule passes; otherwise the name of the failing
            rule function with its 'rule_' prefix stripped.

    Raises:
        KeyError: If the code point is neither kind of Arabic-Indic digit
            and has no entry in `_RULES`.
    """
    codepoint = ord(value[offset])

    if ucd.arabic_indic(codepoint):
        checker = rule_arabic_indic
    elif ucd.extended_arabic_indic(codepoint):
        checker = rule_extended_arabic_indic
    else:
        checker = _RULES[codepoint]

    try:
        passed = checker(value, offset, ucd)
    except IndexError:
        # _before/_after raise IndexError when the neighbouring character
        # does not exist; a missing neighbour means the rule fails.
        passed = False

    if not passed:
        # Report the rule by its function name, minus the 'rule_' prefix.
        prefix = "rule_"
        name = checker.__name__
        return name[len(prefix):] if name.startswith(prefix) else name

    return ""


# These rules test a character at a given offset in the string.


def rule_zero_width_nonjoiner(value, offset, ucd):
    """Decide whether ZERO WIDTH NON-JOINER (U+200C) is allowed at `offset`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.1:

      "This may occur in a formally cursive script (such as Arabic) in a
      context where it breaks a cursive connection as required for
      orthographic rules, as in the Persian language, for example.  It
      also may occur in Indic scripts in a consonant-conjunct context
      (immediately following a virama), to control required display of
      such conjuncts."

    The character is accepted when the preceding code point satisfies
    `ucd.combining_virama`, or failing that, when
    `ucd.valid_jointype(value, offset)` accepts the surrounding context.

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed.

    Raises:
        IndexError: If there is no character before `offset`.
    """
    assert value[offset] == "\u200c"
    previous_cp = _before(value, offset)
    # The virama test comes first; the joining-type test is only consulted
    # when the preceding character is not a virama.
    allowed = ucd.combining_virama(previous_cp) or ucd.valid_jointype(
        value, offset
    )
    return True if allowed else False


def rule_zero_width_joiner(value, offset, ucd):
    """Decide whether ZERO WIDTH JOINER (U+200D) is allowed at `offset`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.2:

      "This may occur in Indic scripts in a consonant-conjunct context
      (immediately following a virama), to control required display of
      such conjuncts."

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed, i.e. the result of
            `ucd.combining_virama` for the preceding code point.

    Raises:
        IndexError: If there is no character before `offset`.
    """
    assert value[offset] == "\u200d"
    previous_cp = _before(value, offset)
    return ucd.combining_virama(previous_cp)


def rule_middle_dot(value, offset, ucd):
    """Decide whether MIDDLE DOT (U+00B7) is allowed at `offset`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.3:

      "Between 'l' (U+006C) characters only, used to permit the Catalan
      character ela geminada to be expressed."

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database (not used by this
            rule; accepted so all rules share one signature).

    Returns:
        bool: True if value is allowed.

    Raises:
        IndexError: If there is no character before `offset`, or the
            preceding character is 'l' and there is no character after
            `offset`.
    """
    # pylint: disable=unused-argument
    assert value[offset] == "\u00b7"
    lower_l = 0x06C
    # The following character is only inspected once the preceding one is
    # known to be 'l'.
    if _before(value, offset) != lower_l:
        return False
    return _after(value, offset) == lower_l


def rule_greek_keraia(value, offset, ucd):
    """Decide whether GREEK LOWER NUMERAL SIGN (U+0375) is allowed at `offset`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.4:

      "The script of the following character MUST be Greek."

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed, i.e. the result of
            `ucd.greek_script` for the following code point.

    Raises:
        IndexError: If there is no character after `offset`.
    """
    assert value[offset] == "\u0375"
    following_cp = _after(value, offset)
    return ucd.greek_script(following_cp)


def rule_hebrew_punctuation(value, offset, ucd):
    """Decide whether HEBREW PUNCTUATION GERESH or GERSHAYIM (U+05F3,
    U+05F4) is allowed at `offset`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.5,
         https://tools.ietf.org/html/rfc5892#appendix-A.6:

      "The script of the preceding character MUST be Hebrew."

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed, i.e. the result of
            `ucd.hebrew_script` for the preceding code point.

    Raises:
        IndexError: If there is no character before `offset`.
    """
    assert value[offset] in "\u05f3\u05f4"
    previous_cp = _before(value, offset)
    return ucd.hebrew_script(previous_cp)


# These rules ignore the offset argument; they test the entire string. A string
# only needs to be tested once, irrespective of the number of times the rule is
# triggered.


def rule_katakana_middle_dot(value, offset, ucd):
    """Decide whether KATAKANA MIDDLE DOT (U+30FB) is allowed in `value`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.7:

      "Note that the Script of Katakana Middle Dot is not any of
      "Hiragana", "Katakana", or "Han".  The effect of this rule is to
      require at least one character in the label to be in one of those
      scripts."

    Apart from the sanity assertion, `offset` plays no part: the whole
    string is scanned.

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed.
    """
    assert value[offset] == "\u30fb"
    # Lazily map each character to its code point and stop at the first
    # one the database accepts.
    codepoints = map(ord, value)
    return any(map(ucd.hiragana_katakana_han_script, codepoints))


def rule_arabic_indic(value, offset, ucd):
    """Decide whether ARABIC-INDIC DIGITS (U+0660..U+0669) are allowed in
    `value`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.8:

      "Can not be mixed with Extended Arabic-Indic Digits."

    Apart from the sanity assertion, `offset` plays no part: the whole
    string is scanned.

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed, i.e. no character of `value`
            satisfies `ucd.extended_arabic_indic`.
    """
    assert ucd.arabic_indic(ord(value[offset]))
    # Valid only while every character stays out of the extended digit set.
    return all(not ucd.extended_arabic_indic(ord(ch)) for ch in value)


def rule_extended_arabic_indic(value, offset, ucd):
    """Decide whether EXTENDED ARABIC-INDIC DIGITS (U+06F0..U+06F9) are
    allowed in `value`.

    From https://tools.ietf.org/html/rfc5892#appendix-A.9:

      "Can not be mixed with Arabic-Indic Digits."

    Apart from the sanity assertion, `offset` plays no part: the whole
    string is scanned.

    Args:
        value (str): String value to check.
        offset (int): Position within `value`.
        ucd (UnicodeData): Unicode character database.

    Returns:
        bool: True if value is allowed, i.e. no character of `value`
            satisfies `ucd.arabic_indic`.
    """
    assert ucd.extended_arabic_indic(ord(value[offset]))
    for ch in value:
        # A single plain Arabic-Indic digit anywhere makes the mix invalid.
        if ucd.arabic_indic(ord(ch)):
            return False
    return True


# Dispatch table: code point -> context rule function. The Arabic-Indic digit
# rules are absent on purpose; `context_rule_error` selects those through the
# `ucd` predicates before consulting this table.
_RULES = {
    ord("\u200c"): rule_zero_width_nonjoiner,  # ZERO WIDTH NON-JOINER
    ord("\u200d"): rule_zero_width_joiner,  # ZERO WIDTH JOINER
    ord("\u00b7"): rule_middle_dot,  # MIDDLE DOT
    ord("\u0375"): rule_greek_keraia,  # GREEK LOWER NUMERAL SIGN
    ord("\u05f3"): rule_hebrew_punctuation,  # HEBREW PUNCTUATION GERESH
    ord("\u05f4"): rule_hebrew_punctuation,  # HEBREW PUNCTUATION GERSHAYIM
    ord("\u30fb"): rule_katakana_middle_dot,  # KATAKANA MIDDLE DOT
}


def _before(value, offset):
    """Return the code point preceding `value[offset]`.

    Args:
        value (str): String value to read from.
        offset (int): Position within `value`.

    Returns:
        int: Code point of `value[offset - 1]`.

    Raises:
        IndexError: If `offset` is zero or negative (no preceding
            character), or `offset - 1` is past the end of `value`.
    """
    if offset > 0:
        return ord(value[offset - 1])
    # Without this explicit failure, offset 0 would index value[-1] and
    # silently wrap around to the last character.
    raise IndexError(offset - 1)


def _after(value, offset):
    """Return code point after `value[offset]` or raise IndexError.

    Args:
        value (str): String value to read from.
        offset (int): Position within `value`; must not be negative.

    Returns:
        int: Code point of `value[offset + 1]`.

    Raises:
        IndexError: If `offset` is negative, or `value[offset]` is the last
            character (or beyond), so there is no following character.
    """
    # Reject negative offsets explicitly, mirroring the guard in `_before`.
    # Otherwise `offset == -1` would index value[0] and silently report the
    # first character as the one "after" the last character.
    if offset < 0:
        raise IndexError(offset + 1)
    return ord(value[offset + 1])