1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
Description: Use unicode-data instead of embedded copies or downloads
Upstream embeds the generated files in their release tarballs.
As they are giant blobs which are impossible to review, we will just use
the script provided by upstream to regenerate them – just with data
we already have in the unicode-data package instead of trying to download it
Author: David Kalnischkies <donkult@debian.org>
Forwarded: not-needed
--- a/update_unicode.py
+++ b/update_unicode.py
@@ -18,19 +18,13 @@
import pprint
import sys
+import bz2
from collections import defaultdict, OrderedDict
from os import path as p
from io import StringIO
-
-
-DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
-DIR_OF_THIRD_PARTY = p.join( DIR_OF_THIS_SCRIPT, 'third_party' )
-
-sys.path[ 0:0 ] = [ p.join( DIR_OF_THIRD_PARTY, 'regex-build' ) ]
-
import regex as re
-import urllib.request
+DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
DIR_OF_CPP_SOURCES = p.join( DIR_OF_THIS_SCRIPT, 'cpp', 'ycm' )
UNICODE_TABLE_TEMPLATE = (
"""// This file was automatically generated with the update_unicode.py script
@@ -109,9 +103,14 @@
HANGUL_LVT_COUNT = HANGUL_L_COUNT * HANGUL_VT_COUNT
-def Download( url ):
- with urllib.request.urlopen( url ) as response:
- return response.read().decode( 'utf8' ).splitlines()
+def ReadFile( filepath ):
+ with open( filepath, mode='rt', encoding = 'utf8' ) as f:
+ return f.read().splitlines()
+
+
+def ReadBz2File( filepath ):
+ with bz2.open( filepath, mode='rt', encoding = 'utf8' ) as f:
+ return f.read().splitlines()
# Encode a Unicode code point in UTF-8 binary form.
@@ -179,8 +178,7 @@
def GetUnicodeVersion():
- readme = Download(
- 'https://www.unicode.org/Public/UCD/latest/ReadMe.txt' )
+ readme = ReadFile( '/usr/share/unicode/ReadMe.txt' )
for line in readme:
match = UNICODE_VERSION_REGEX.search( line )
if match:
@@ -190,8 +188,7 @@
# See https://www.unicode.org/reports/tr44#UnicodeData.txt
def GetUnicodeData():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' )
+ data = ReadFile( '/usr/share/unicode/UnicodeData.txt' )
unicode_data = OrderedDict()
@@ -235,7 +232,7 @@
# https://www.unicode.org/reports/tr44#GraphemeBreakProperty.txt
# https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
def GetBreakProperty( data_url, break_property_regex ):
- data = Download( data_url )
+ data = ReadFile( data_url )
nb_code_points = 0
break_data = {}
@@ -278,8 +275,7 @@
# See https://www.unicode.org/reports/tr44#SpecialCasing.txt
def GetSpecialFolding():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt' )
+ data = ReadFile( '/usr/share/unicode/SpecialCasing.txt' )
folding_data = {}
for line in data:
@@ -304,8 +300,7 @@
# See https://www.unicode.org/reports/tr44#CaseFolding.txt
def GetCaseFolding():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt' )
+ data = ReadFile( '/usr/share/unicode/CaseFolding.txt' )
folding_data = {}
for line in data:
@@ -325,8 +320,7 @@
def GetEmojiData():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt' )
+ data = ReadFile( '/usr/share/unicode/emoji/emoji-data.txt' )
nb_code_points = 0
emoji_data = defaultdict( list )
@@ -434,11 +428,10 @@
code_points = []
unicode_data = GetUnicodeData()
grapheme_break_data = GetBreakProperty(
- 'https://www.unicode.org/Public/UCD/latest'
- '/ucd/auxiliary/GraphemeBreakProperty.txt',
+ '/usr/share/unicode/auxiliary/GraphemeBreakProperty.txt',
GRAPHEME_BREAK_PROPERTY_REGEX )
indic_conjunct_break_data = GetBreakProperty(
- 'https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt',
+ '/usr/share/unicode/DerivedCoreProperties.txt',
INDIC_CONJUNCT_BREAK_PROPERTY_REGEX )
special_folding = GetSpecialFolding()
case_folding = GetCaseFolding()
@@ -587,8 +580,7 @@
def GenerateNormalizationTestCases( output_file ):
- test_contents = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt' )
+ test_contents = ReadBz2File( '/usr/share/unicode/NormalizationTest.txt.bz2' )
hex_codepoint = '(?:[A-F0-9]{4,} ?)+'
pattern = f'(?:{ hex_codepoint };){{5}}'
pattern = re.compile( pattern )
@@ -612,8 +604,7 @@
def GenerateGraphemeBreakTestCases( output_file ):
- test_contents = Download( 'https://www.unicode.org/'
- 'Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt' )
+ test_contents = ReadFile( '/usr/share/unicode/auxiliary/GraphemeBreakTest.txt' )
res = []
for line in test_contents:
|