File: 12-use-system-unicode-data.patch

package info
ycmd 0+20240823+git8b61f19+ds-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 8,040 kB
  • sloc: python: 44,018; cpp: 6,138; java: 486; sh: 378; cs: 207; javascript: 150; ansic: 82; makefile: 45; xml: 18; objc: 10
file content (143 lines) | stat: -rw-r--r-- 4,711 bytes
Description: Use unicode-data instead of embedded copies or downloads
 Upstream embeds the generated files in their release tarballs. As they are
 giant blobs which are impossible to review, we instead use the script
 provided by upstream to regenerate them, fed with the data we already have
 in the unicode-data package instead of downloading it from unicode.org.
Author: David Kalnischkies <donkult@debian.org>
Forwarded: not-needed
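Comment: A minimal illustrative sketch (not part of the patch) of a sanity
 check one might run before regenerating: confirm that the unicode-data
 package actually ships every file the patched script reads. The path list
 simply mirrors the substitutions made in the hunks below; running this with
 python3 before update_unicode.py fails early instead of midway through
 table generation.
 .
   from os import path as p
 .
   # Every path substituted into update_unicode.py by this patch.
   SYSTEM_UNICODE_FILES = [
     '/usr/share/unicode/ReadMe.txt',
     '/usr/share/unicode/UnicodeData.txt',
     '/usr/share/unicode/auxiliary/GraphemeBreakProperty.txt',
     '/usr/share/unicode/DerivedCoreProperties.txt',
     '/usr/share/unicode/SpecialCasing.txt',
     '/usr/share/unicode/CaseFolding.txt',
     '/usr/share/unicode/emoji/emoji-data.txt',
     '/usr/share/unicode/NormalizationTest.txt.bz2',
     '/usr/share/unicode/auxiliary/GraphemeBreakTest.txt',
   ]
 .
   missing = [ f for f in SYSTEM_UNICODE_FILES if not p.isfile( f ) ]
   if missing:
     raise SystemExit( 'unicode-data files missing: ' + ', '.join( missing ) )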

--- a/update_unicode.py
+++ b/update_unicode.py
@@ -18,19 +18,13 @@
 
 import pprint
 import sys
+import bz2
 from collections import defaultdict, OrderedDict
 from os import path as p
 from io import StringIO
-
-
-DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
-DIR_OF_THIRD_PARTY = p.join( DIR_OF_THIS_SCRIPT, 'third_party' )
-
-sys.path[ 0:0 ] = [ p.join( DIR_OF_THIRD_PARTY, 'regex-build' ) ]
-
 import regex as re
-import urllib.request
 
+DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
 DIR_OF_CPP_SOURCES = p.join( DIR_OF_THIS_SCRIPT, 'cpp', 'ycm' )
 UNICODE_TABLE_TEMPLATE = (
   """// This file was automatically generated with the update_unicode.py script
@@ -109,9 +103,14 @@
 HANGUL_LVT_COUNT = HANGUL_L_COUNT * HANGUL_VT_COUNT
 
 
-def Download( url ):
-  with urllib.request.urlopen( url ) as response:
-    return response.read().decode( 'utf8' ).splitlines()
+def ReadFile( filepath ):
+  with open( filepath, mode='rt', encoding = 'utf8' ) as f:
+    return f.read().splitlines()
+
+
+def ReadBz2File( filepath ):
+  with bz2.open( filepath, mode='rt', encoding = 'utf8' ) as f:
+    return f.read().splitlines()
 
 
 # Encode a Unicode code point in UTF-8 binary form.
@@ -179,8 +178,7 @@
 
 
 def GetUnicodeVersion():
-  readme = Download(
-    'https://www.unicode.org/Public/UCD/latest/ReadMe.txt' )
+  readme = ReadFile( '/usr/share/unicode/ReadMe.txt' )
   for line in readme:
     match = UNICODE_VERSION_REGEX.search( line )
     if match:
@@ -190,8 +188,7 @@
 
 # See https://www.unicode.org/reports/tr44#UnicodeData.txt
 def GetUnicodeData():
-  data = Download(
-    'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' )
+  data = ReadFile( '/usr/share/unicode/UnicodeData.txt' )
 
   unicode_data = OrderedDict()
 
@@ -235,7 +232,7 @@
 # https://www.unicode.org/reports/tr44#GraphemeBreakProperty.txt
 # https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
 def GetBreakProperty( data_url, break_property_regex ):
-  data = Download( data_url )
+  data = ReadFile( data_url )
 
   nb_code_points = 0
   break_data = {}
@@ -278,8 +275,7 @@
 
 # See https://www.unicode.org/reports/tr44#SpecialCasing.txt
 def GetSpecialFolding():
-  data = Download(
-    'https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt' )
+  data = ReadFile( '/usr/share/unicode/SpecialCasing.txt' )
 
   folding_data = {}
   for line in data:
@@ -304,8 +300,7 @@
 
 # See https://www.unicode.org/reports/tr44#CaseFolding.txt
 def GetCaseFolding():
-  data = Download(
-    'https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt' )
+  data = ReadFile( '/usr/share/unicode/CaseFolding.txt' )
 
   folding_data = {}
   for line in data:
@@ -325,8 +320,7 @@
 
 
 def GetEmojiData():
-  data = Download(
-    'https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt' )
+  data = ReadFile( '/usr/share/unicode/emoji/emoji-data.txt' )
 
   nb_code_points = 0
   emoji_data = defaultdict( list )
@@ -434,11 +428,10 @@
   code_points = []
   unicode_data = GetUnicodeData()
   grapheme_break_data = GetBreakProperty(
-    'https://www.unicode.org/Public/UCD/latest'
-      '/ucd/auxiliary/GraphemeBreakProperty.txt',
+    '/usr/share/unicode/auxiliary/GraphemeBreakProperty.txt',
     GRAPHEME_BREAK_PROPERTY_REGEX )
   indic_conjunct_break_data = GetBreakProperty(
-    'https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt',
+    '/usr/share/unicode/DerivedCoreProperties.txt',
     INDIC_CONJUNCT_BREAK_PROPERTY_REGEX )
   special_folding = GetSpecialFolding()
   case_folding = GetCaseFolding()
@@ -587,8 +580,7 @@
 
 
 def GenerateNormalizationTestCases( output_file ):
-  test_contents = Download(
-      'https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt' )
+  test_contents = ReadBz2File( '/usr/share/unicode/NormalizationTest.txt.bz2' )
   hex_codepoint = '(?:[A-F0-9]{4,} ?)+'
   pattern = f'(?:{ hex_codepoint };){{5}}'
   pattern = re.compile( pattern )
@@ -612,8 +604,7 @@
 
 
 def GenerateGraphemeBreakTestCases( output_file ):
-  test_contents = Download( 'https://www.unicode.org/'
-      'Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt' )
+  test_contents = ReadFile( '/usr/share/unicode/auxiliary/GraphemeBreakTest.txt' )
 
   res = []
   for line in test_contents: