From 32e42c96281ebddcce5ae1ea79b80a0ac07cb19e Mon Sep 17 00:00:00 2001
From: Boris Staletic <boris.staletic@protonmail.com>
Date: Tue, 8 Oct 2024 21:50:50 +0200
Subject: [PATCH] Upgrade to Unicode 16

---
 cpp/ycm/Character.cpp                |   6 +-
 cpp/ycm/Character.h                  |   2 +-
 cpp/ycm/CodePoint.h                  |   2 +-
 cpp/ycm/UnicodeTable.inc             |  42 +-
 cpp/ycm/tests/CodePoint_test.cpp     |  11 +-
 cpp/ycm/tests/GraphemeBreakCases.inc | 446 ++++++-------
 cpp/ycm/tests/NormalizationCases.inc | 893 ++++++++++++++++++++++++++-
 third_party/mrab-regex-github        |   2 +-
 update_unicode.py                    |   4 +-
 9 files changed, 1105 insertions(+), 303 deletions(-)

diff --git a/cpp/ycm/Character.cpp b/cpp/ycm/Character.cpp
index 1233f11e..2ea42fa5 100644
--- a/cpp/ycm/Character.cpp
+++ b/cpp/ycm/Character.cpp
@@ -31,7 +31,7 @@ bool CodePointCompare( const CodePoint *left, const CodePoint *right ) {
 
 
 // Sort the code points according to the Canonical Ordering Algorithm.
-// See https://www.unicode.org/versions/latest/ch03.pdf#G49591
+// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591
 CodePointSequence CanonicalSort( CodePointSequence code_points ) {
   auto code_point_start = code_points.begin();
   auto code_point_end = code_points.end();
@@ -64,7 +64,7 @@ CodePointSequence CanonicalSort( CodePointSequence code_points ) {
 
 // Decompose a UTF-8 encoded string into a sequence of code points according to
 // Canonical Decomposition. See
-// https://www.unicode.org/versions/latest/ch03.pdf#G733
+// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G733
 CodePointSequence CanonicalDecompose( std::string_view text ) {
   assert( NormalizeInput( text ) == text );
   return CanonicalSort( BreakIntoCodePoints( text ) );
@@ -78,7 +78,7 @@ Character::Character( std::string_view character )
     is_punctuation_( false ),
     is_uppercase_( false ) {
   // Normalize the character through NFD (Normalization Form D). See
-  // https://www.unicode.org/versions/latest/ch03.pdf#G49621
+  // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621
   CodePointSequence code_points = CanonicalDecompose( character );
 
   for ( const auto &code_point : code_points ) {
diff --git a/cpp/ycm/Character.h b/cpp/ycm/Character.h
index 87b6eb04..8cf3fbeb 100644
--- a/cpp/ycm/Character.h
+++ b/cpp/ycm/Character.h
@@ -27,7 +27,7 @@ namespace YouCompleteMe {
 // This class represents a UTF-8 character. It takes a UTF-8 encoded string
 // corresponding to a grapheme cluster (see
 // https://www.unicode.org/glossary/#grapheme_cluster), normalize it through NFD
-// (see https://www.unicode.org/versions/latest/ch03.pdf#G49621), and
+// (see https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621), and
 // compute the folded and swapped case versions of the normalized character. It
 // also holds some properties like if the character is a letter or a
 // punctuation, and if it is uppercase.
diff --git a/cpp/ycm/CodePoint.h b/cpp/ycm/CodePoint.h
index 9ce82f60..199747ea 100644
--- a/cpp/ycm/CodePoint.h
+++ b/cpp/ycm/CodePoint.h
@@ -91,7 +91,7 @@ struct RawCodePoint {
 //  - its breaking property: used to split a word into characters.
 //  - its combining class: used to sort a sequence of code points according to
 //    the Canonical Ordering algorithm (see
-//    https://www.unicode.org/versions/latest/ch03.pdf#G49591).
+//    https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591).
 class CodePoint {
 public:
   YCM_EXPORT explicit CodePoint( std::string_view code_point );
diff --git a/update_unicode.py b/update_unicode.py
index d13b4a3d..04e9a65f 100755
--- a/update_unicode.py
+++ b/update_unicode.py
@@ -364,7 +364,7 @@ def GetEmojiData():
 
 
 # Decompose a hangul syllable using the algorithm described in
-# https://www.unicode.org/versions/latest/ch03.pdf#G61399
+# https://www.unicode.org/versions/latest/core-spec/chapter-3/#G61399
 def DecomposeHangul( code_point ):
   index = int( code_point, 16 ) - HANGUL_BASE
   if index < 0 or index >= HANGUL_LVT_COUNT:
@@ -381,7 +381,7 @@ def DecomposeHangul( code_point ):
 
 # Recursively decompose a Unicode code point into a list of code points
 # according to canonical decomposition.
-# See https://www.unicode.org/versions/latest/ch03.pdf#G733
+# See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G733
 def Decompose( code_point, unicode_data ):
   code_points = DecomposeHangul( code_point )
   if code_points:
-- 
2.45.2