1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
|
%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...
# Constant for the input directory (the directory holding the Unicode data files).
# The first command-line argument overrides the default location.
InputDataDir = ARGV[0] || 'enc/unicode/data'
# Unicode version taken from the last "x.y.z" path component of InputDataDir
# that follows a "/"; nil when the path contains no such component.
unicode_version = InputDataDir[/.*\/(\d+\.\d+\.\d+)(?=\/|\z)/, 1]
# convenience methods
class Integer
  # Renders this code point as text suitable for pasting between the double
  # quotes of a Ruby string literal, taking legibility into account:
  # * above U+FFFF  -> \u{X...} with uppercase hex digits
  # * above U+007F  -> \uXXXX (uppercase hex, zero-padded to four digits)
  # * otherwise     -> the character itself, with a backslash put in front
  #                    of a backslash or a double quote
  def to_UTF8()
    return chr.sub(/[\\\"]/, "\\\\\\\&") unless self > 0x7f

    digits = to_s(16).upcase
    self > 0xFFFF ? "\\u{#{digits}}" : "\\u#{digits.rjust(4, '0')}"
  end
end
module Enumerable
  # Fallback for Ruby versions whose Enumerable lacks each_slice; skipped
  # entirely when the method already exists.
  unless method_defined?(:each_slice)
    # Yields the elements in consecutive arrays of n items; the final array
    # holds whatever is left over. Returns the receiver.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk << item
        next if chunk.size < n
        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end
class Array
  # Concatenates the to_UTF8 rendering of every element.
  def to_UTF8()
    map { |code| code.to_UTF8 }.join('')
  end

  # Converts an array of Integers to character ranges.
  # The receiver is sorted, and runs of equal or consecutive values are merged.
  # Each run is rendered as "a" (one value), "ab" (two values) or "a-c"
  # (three or more), using Integer#to_UTF8. The rendered runs are then yielded
  # to the block n at a time, joined into one string.
  def each_regexp_chars(n = 1)
    runs = []
    sort.each do |code|
      tail = runs.last
      if tail and tail[1] + 1 >= code
        tail[1] = code # extend the current run (also swallows duplicates)
      else
        runs << [code, code]
      end
    end

    rendered = runs.collect do |lo, hi|
      case hi - lo
      when 0 then lo.to_UTF8
      when 1 then lo.to_UTF8 + hi.to_UTF8
      else lo.to_UTF8 + '-' + hi.to_UTF8
      end
    end

    rendered.each_slice(n) { |group| yield group.join('') }
  end
end
# read the file 'CompositionExclusions.txt'
# The first line must name the file together with its version
# ("# <basename>-<version><ext>"); that version has to agree with the version
# found in the directory name, if there was one. The result is the array of
# code points (Integers) parsed from every line that starts with 4-5
# uppercase hex digits.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') {|f|
  base = Regexp.quote(File.basename(f.path, '.*'))
  ext = Regexp.quote(File.extname(f.path))
  # gets returns nil for an empty file; to_s lets that case reach the abort
  # below instead of failing with NoMethodError on nil.
  line = f.gets
  version = line.to_s[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1] or
    abort "No file version in #{f.path}: #{line}"
  # adopt the file's version when the directory name did not supply one
  (unicode_version ||= version) == version or
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  # String#hex stops at the first non-hex character, so trailing comments are ignored
  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}
}
# Tables filled from 'UnicodeData.txt', all keyed by code point (Integer):
decomposition_table = {} # canonical decomposition, as an array of code points
kompatible_table = {}    # tagged ("<...>") decomposition, tag removed
combining_class = {}     # combining class, stored only when it is not 0
# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |record|
  code_field, char_name, _, ccc, _, mapping, *_rest = record.split(";")
  code = code_field.hex
  case mapping
  when /^[0-9A-F]/ # starts with a hex digit: untagged decomposition
    decomposition_table[code] = mapping.split(' ').collect {|w| w.hex}
  when /^</ # starts with a tag: drop the tag, keep the code points
    kompatible_table[code] = mapping.split(' ')[1..-1].collect {|w| w.hex}
  end
  combining_class[code] = ccc.to_i unless ccc == "0"
  # First/Last range markers are not expected to carry normalization data
  if char_name =~ /(First|Last)>$/ and (ccc != "0" or mapping != "")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end
# calculate compositions from decompositions
# Entries matching any of the conditions below are dropped; the remaining
# character => decomposition pairs are inverted, so composition_table maps a
# decomposition (array of code points) to its composed character.
composition_table = decomposition_table.reject do |character, decomposition|
composition_exclusions.member? character or # predefined composition exclusion
decomposition.length<=1 or # Singleton Decomposition
combining_class[character] or # character is not a Starter
combining_class[decomposition.first] # decomposition begins with a character that is not a Starter
end.invert
# recalculate composition_exclusions
# (now: every character that has a decomposition but is not the result of an
# entry in composition_table)
composition_exclusions = decomposition_table.keys - composition_table.values
# characters with a non-zero combining class, plus the last code point of
# every decomposition that can be composed
accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last}
# first code point of every decomposition that can be composed
composition_starters = composition_table.keys.collect {|key| key.first}
# every 28th code point from U+AC00 up to U+D7A3
hangul_no_trailing = []
0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c}
# expand decomposition table values
# Each element that itself has a decomposition is replaced in place by that
# decomposition; position only advances once the element at position has no
# further decomposition, so the expansion is applied repeatedly.
decomposition_table.each do |key, value|
position = 0
while position < value.length
if decomposition = decomposition_table[value[position]]
decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
value[position, 1] = decomposition
else
position += 1
end
end
end
# deal with relationship between canonical and kompatibility decompositions
# For each canonical decomposition, work on a copy and substitute the
# kompatible_table entry for every element that has one. The result is
# recorded in kompatible_table only if at least one substitution happened.
decomposition_table.each do |key, value|
value = value.dup
expanded = false
position = 0
while position < value.length
if decomposition = kompatible_table[value[position]]
value[position, 1] = decomposition
expanded = true
else
position += 1
end
end
kompatible_table[key] = value if expanded
end
# Expand kompatible_table values until nothing changes any more.
# The block yields the newly stored array (truthy) when an entry changed and
# nil otherwise; any? stops at the first truthy result, so each pass of the
# while loop rewrites one entry and then rescans. The loop ends after a pass
# in which no entry changed.
while kompatible_table.any? {|key, value|
expanded = value.map {|v| kompatible_table[v] || v}.flatten
kompatible_table[key] = expanded unless value == expanded
}
end
# generate normalization tables file
%># coding: us-ascii
# frozen_string_literal: true
%# >
# automatically generated by template/unicode_norm_gen.tmpl
module UnicodeNormalize # :nodoc:
accents = "" \
"[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]"
ACCENTS = accents
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
}" \
"[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # characters that can be the result of a composition, except composition starters
}" \
"[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # precomposed Hangul syllables
}" \
"[\u{AC00}-\u{D7A4}]"
REGEXP_C_STRING = "#{'' # composition exclusions
}" \
"[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # composition starters and characters that can be the result of a composition
}" \
"[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # Hangul syllables with separate trailer
}" \
"[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>][\u11A8-\u11C2]" \
"|#{'' # decomposed Hangul syllables
}" \
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
REGEXP_K_STRING = "" \
"[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
"<%end%>]"
class_table = {
% combining_class.each do |key, value|
"<%=key.to_UTF8%>"=><%=value%><%=%>,
% end
}
class_table.default = 0
CLASS_TABLE = class_table.freeze
DECOMPOSITION_TABLE = {
% decomposition_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
KOMPATIBLE_TABLE = {
% kompatible_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
COMPOSITION_TABLE = {
% composition_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
end
|