1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
Description: Use unicode-data instead of embedded copies or downloads
Upstream embeds the generated files in their release tarballs.
As they are giant blobs which are impossible to review, we will just use
the script provided by upstream to regenerate them – just with data
we already have in the unicode-data package instead of trying to download it
Author: David Kalnischkies <donkult@debian.org>
Forwarded: not-needed
--- a/update_unicode.py
+++ b/update_unicode.py
@@ -18,19 +18,13 @@
import pprint
import sys
+import bz2
from collections import defaultdict, OrderedDict
from os import path as p
from io import StringIO
-
-
-DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
-DIR_OF_THIRD_PARTY = p.join( DIR_OF_THIS_SCRIPT, 'third_party' )
-
-sys.path[ 0:0 ] = [ p.join( DIR_OF_THIRD_PARTY, 'regex-build' ) ]
-
import regex as re
-import urllib.request
+DIR_OF_THIS_SCRIPT = p.dirname( p.abspath( __file__ ) )
DIR_OF_CPP_SOURCES = p.join( DIR_OF_THIS_SCRIPT, 'cpp', 'ycm' )
UNICODE_TABLE_TEMPLATE = (
"""// This file was automatically generated with the update_unicode.py script
@@ -109,9 +103,14 @@
HANGUL_LVT_COUNT = HANGUL_L_COUNT * HANGUL_VT_COUNT
-def Download( url ):
- with urllib.request.urlopen( url ) as response:
- return response.read().decode( 'utf8' ).splitlines()
+def ReadFile( filepath ):
+ with open( filepath, mode='rt', encoding = 'utf8' ) as f:
+ return f.read().splitlines()
+
+
+def ReadBz2File( filepath ):
+ with bz2.open( filepath, mode='rt', encoding = 'utf8' ) as f:
+ return f.read().splitlines()
# Encode a Unicode code point in UTF-8 binary form.
@@ -179,8 +178,7 @@
def GetUnicodeVersion():
- readme = Download(
- 'https://www.unicode.org/Public/UCD/latest/ReadMe.txt' )
+ readme = ReadFile( '/usr/share/unicode/ReadMe.txt' )
for line in readme:
match = UNICODE_VERSION_REGEX.search( line )
if match:
@@ -190,8 +188,7 @@
# See https://www.unicode.org/reports/tr44#UnicodeData.txt
def GetUnicodeData():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' )
+ data = ReadFile( '/usr/share/unicode/UnicodeData.txt' )
unicode_data = OrderedDict()
@@ -235,7 +232,7 @@
# https://www.unicode.org/reports/tr44#GraphemeBreakProperty.txt
# https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
def GetBreakProperty( data_url, break_property_regex ):
- data = Download( data_url )
+ data = ReadFile( data_url )
nb_code_points = 0
break_data = {}
@@ -278,8 +275,7 @@
# See https://www.unicode.org/reports/tr44#SpecialCasing.txt
def GetSpecialFolding():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt' )
+ data = ReadFile( '/usr/share/unicode/SpecialCasing.txt' )
folding_data = {}
for line in data:
@@ -304,8 +300,7 @@
# See https://www.unicode.org/reports/tr44#CaseFolding.txt
def GetCaseFolding():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt' )
+ data = ReadFile( '/usr/share/unicode/CaseFolding.txt' )
folding_data = {}
for line in data:
@@ -325,8 +320,7 @@
def GetEmojiData():
- data = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt' )
+ data = ReadFile( '/usr/share/unicode/emoji/emoji-data.txt' )
nb_code_points = 0
emoji_data = defaultdict( list )
@@ -434,11 +428,10 @@
code_points = []
unicode_data = GetUnicodeData()
grapheme_break_data = GetBreakProperty(
- 'https://www.unicode.org/Public/UCD/latest'
- '/ucd/auxiliary/GraphemeBreakProperty.txt',
+ '/usr/share/unicode/auxiliary/GraphemeBreakProperty.txt',
GRAPHEME_BREAK_PROPERTY_REGEX )
indic_conjunct_break_data = GetBreakProperty(
- 'https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt',
+ '/usr/share/unicode/DerivedCoreProperties.txt',
INDIC_CONJUNCT_BREAK_PROPERTY_REGEX )
special_folding = GetSpecialFolding()
case_folding = GetCaseFolding()
@@ -587,8 +580,7 @@
def GenerateNormalizationTestCases( output_file ):
- test_contents = Download(
- 'https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt' )
+ test_contents = ReadBz2File( '/usr/share/unicode/NormalizationTest.txt.bz2' )
hex_codepoint = '(?:[A-F0-9]{4,} ?)+'
pattern = f'(?:{ hex_codepoint };){{5}}'
pattern = re.compile( pattern )
@@ -612,8 +604,7 @@
def GenerateGraphemeBreakTestCases( output_file ):
- test_contents = Download( 'https://www.unicode.org/'
- 'Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt' )
+ test_contents = ReadFile( '/usr/share/unicode/auxiliary/GraphemeBreakTest.txt' )
res = []
for line in test_contents:
|