File: character.py

package info (click to toggle)
firefox 147.0.4-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,683,532 kB
  • sloc: cpp: 7,607,356; javascript: 6,533,348; ansic: 3,775,236; python: 1,415,508; xml: 634,561; asm: 438,949; java: 186,241; sh: 62,760; makefile: 18,079; objc: 13,092; perl: 12,808; yacc: 4,583; cs: 3,846; pascal: 3,448; lex: 1,720; ruby: 1,003; php: 436; lisp: 258; awk: 247; sql: 66; sed: 54; csh: 10; exp: 6
file content (125 lines) | stat: -rw-r--r-- 4,665 bytes parent folder | download | duplicates (34)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
# Copyright JS Foundation and other contributors, https://js.foundation/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import absolute_import, unicode_literals

import sys

import unicodedata
from collections import defaultdict

from .compat import uchr, xrange

# Bucket every code point up to sys.maxunicode by its Unicode general
# category, so the identifier classes below can be assembled from whole
# categories.
# http://stackoverflow.com/questions/14245893/efficiently-list-all-characters-in-a-given-unicode-category
U_CATEGORIES = defaultdict(list)
for _code in xrange(sys.maxunicode + 1):
    _char = uchr(_code)
    U_CATEGORIES[unicodedata.category(_char)].append(_char)

# Letters: the five letter categories plus letter-numbers (Nl).
UNICODE_LETTER = set()
for _category in ('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'):
    UNICODE_LETTER.update(U_CATEGORIES[_category])

UNICODE_OTHER_ID_START = set([
    # Other_ID_Start
    '\u1885', '\u1886', '\u2118', '\u212E', '\u309B', '\u309C',
    # New in Unicode 8.0
    '\u08B3', '\u0AF9', '\u13F8', '\u9FCD', '\uAB60',
    '\U00010CC0', '\U000108E0', '\U0002B820',
    # New in Unicode 9.0
    '\u1C80', '\U000104DB', '\U0001E922',
    '\U0001EE00', '\U0001EE06', '\U0001EE0A',
])

UNICODE_OTHER_ID_CONTINUE = set([
    # Other_ID_Continue
    '\xB7', '\u0387', '\u1369', '\u136A', '\u136B', '\u136C',
    '\u136D', '\u136E', '\u136F', '\u1370', '\u1371', '\u19DA',
    # New in Unicode 8.0
    '\u08E3', '\uA69E', '\U00011730',
    # New in Unicode 9.0
    '\u08D4', '\u1DFB', '\uA8C5', '\U00011450',
    '\U0001EE03', '\U0001EE0B',
])

# Non-spacing (Mn) and spacing (Mc) combining marks.
UNICODE_COMBINING_MARK = set(U_CATEGORIES['Mn'])
UNICODE_COMBINING_MARK.update(U_CATEGORIES['Mc'])

UNICODE_DIGIT = set(U_CATEGORIES['Nd'])
UNICODE_CONNECTOR_PUNCTUATION = set(U_CATEGORIES['Pc'])

# Characters allowed to begin an identifier.
# NOTE(review): the backslash is presumably here so that a `\u` escape can
# open an identifier -- confirm against the scanner.
IDENTIFIER_START = (
    UNICODE_LETTER
    | UNICODE_OTHER_ID_START
    | set(['$', '_', '\\'])
)

# Characters allowed after the first one: everything in IDENTIFIER_START plus
# marks, digits, connector punctuation, ZWJ (U+200D) / ZWNJ (U+200C) and the
# Other_ID_Continue extras.
IDENTIFIER_PART = (
    IDENTIFIER_START
    | UNICODE_COMBINING_MARK
    | UNICODE_DIGIT
    | UNICODE_CONNECTOR_PUNCTUATION
    | set(['\u200D', '\u200C'])
    | UNICODE_OTHER_ID_CONTINUE
)

# Characters treated as white space (looked up by Character.isWhiteSpace).
WHITE_SPACE = set([
    '\x09',    # CHARACTER TABULATION
    '\x0B',    # LINE TABULATION
    '\x0C',    # FORM FEED
    '\x20',    # SPACE
    '\xA0',    # NO-BREAK SPACE
    '\u1680',  # OGHAM SPACE MARK
    '\u180E',  # MONGOLIAN VOWEL SEPARATOR
    '\u2000',  # EN QUAD
    '\u2001',  # EM QUAD
    '\u2002',  # EN SPACE
    '\u2003',  # EM SPACE
    '\u2004',  # THREE-PER-EM SPACE
    '\u2005',  # FOUR-PER-EM SPACE
    '\u2006',  # SIX-PER-EM SPACE
    '\u2007',  # FIGURE SPACE
    '\u2008',  # PUNCTUATION SPACE
    '\u2009',  # THIN SPACE
    '\u200A',  # HAIR SPACE
    '\u202F',  # NARROW NO-BREAK SPACE
    '\u205F',  # MEDIUM MATHEMATICAL SPACE
    '\u3000',  # IDEOGRAPHIC SPACE
    '\uFEFF',  # ZERO WIDTH NO-BREAK SPACE (byte order mark)
])

# Characters treated as line terminators (looked up by
# Character.isLineTerminator).
LINE_TERMINATOR = set([
    '\x0A',    # LINE FEED
    '\x0D',    # CARRIAGE RETURN
    '\u2028',  # LINE SEPARATOR
    '\u2029',  # PARAGRAPH SEPARATOR
])

# Digit character -> numeric value, one table per radix.
DECIMAL_CONV = dict(zip('0123456789', range(10)))
OCTAL_CONV = dict(zip('01234567', range(8)))
HEX_CONV = dict(zip('0123456789abcdef', range(16)))
# Upper-case hex letters map to the same values as their lower-case forms.
HEX_CONV.update(zip('ABCDEF', range(10, 16)))

# Membership sets holding exactly the keys of the tables above.
DECIMAL_DIGIT = set(DECIMAL_CONV)
OCTAL_DIGIT = set(OCTAL_CONV)
HEX_DIGIT = set(HEX_CONV)


class Character:
    """Namespace of static helpers that classify a single character.

    Every predicate is a plain membership test against one of the
    module-level lookup sets; no instance state is used.
    """

    @staticmethod
    def fromCodePoint(code):
        """Return ``uchr(code)``, the character for the code point *code*."""
        return uchr(code)

    # https://tc39.github.io/ecma262/#sec-white-space

    @staticmethod
    def isWhiteSpace(ch):
        """Return True if *ch* is a member of ``WHITE_SPACE``."""
        return ch in WHITE_SPACE

    # https://tc39.github.io/ecma262/#sec-line-terminators

    @staticmethod
    def isLineTerminator(ch):
        """Return True if *ch* is a member of ``LINE_TERMINATOR``."""
        return ch in LINE_TERMINATOR

    # https://tc39.github.io/ecma262/#sec-names-and-keywords

    @staticmethod
    def isIdentifierStart(ch):
        """Return True if *ch* is a member of ``IDENTIFIER_START``."""
        return ch in IDENTIFIER_START

    @staticmethod
    def isIdentifierPart(ch):
        """Return True if *ch* is a member of ``IDENTIFIER_PART``."""
        return ch in IDENTIFIER_PART

    # https://tc39.github.io/ecma262/#sec-literals-numeric-literals

    @staticmethod
    def isDecimalDigit(ch):
        """Return True if *ch* is one of the characters ``0``-``9``."""
        return ch in DECIMAL_DIGIT

    @staticmethod
    def isHexDigit(ch):
        """Return True if *ch* is one of ``0``-``9``, ``a``-``f`` or ``A``-``F``."""
        return ch in HEX_DIGIT

    @staticmethod
    def isOctalDigit(ch):
        """Return True if *ch* is one of the characters ``0``-``7``."""
        return ch in OCTAL_DIGIT