1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
module RegexpPropertyValues
module Updater
module_function
require 'fileutils'
require 'set'

# Root URL that download paths are appended to. Note the trailing slash.
BASE_URL = 'http://www.unicode.org/Public/'.freeze

# File names requested from the "<unicode version>/ucd/" directory.
# Frozen so the list cannot be altered by accident at runtime.
UCD_FILES = %w[
  Blocks.txt
  DerivedAge.txt
  DerivedCoreProperties.txt
  PropertyAliases.txt
  PropertyValueAliases.txt
  PropList.txt
  Scripts.txt
].freeze

# File names requested from the "emoji/<emoji version>/" directory.
EMOJI_FILES = %w[
  emoji-data.txt
].freeze

# Scratch directory for the downloaded files, located next to this file.
TMP_DIR = File.join(__dir__, 'tmp_ucd').freeze
# Runs the complete update: prepares the scratch directory, downloads the
# data files, regenerates the values and aliases files, prints a summary.
#
# Stops early (without writing anything) when the download step reports that
# it was skipped, because the writers would otherwise fail on the missing
# files. The scratch directory is removed in all cases, including when a
# step raises.
#
# @return [nil]
def call
  prepare_tmp_dir
  # download_ucd_files returns nil when the user declines the prompt.
  return unless download_ucd_files

  write_values
  write_aliases
  print_stats
ensure
  remove_tmp_dir
end
# Makes sure TMP_DIR exists and is empty by deleting whatever is at that path
# and creating a fresh directory.
#
# @raise [SystemCallError] if the directory cannot be created
def prepare_tmp_dir
  # rm_rf silently ignores a missing path, so no existence check is needed.
  # A File.exist? guard would also follow symlinks and therefore skip a
  # dangling symlink, which then makes mkdir fail with EEXIST.
  FileUtils.rm_rf(TMP_DIR)
  FileUtils.mkdir(TMP_DIR)
end
# Asks for confirmation on $stdin and, when confirmed, downloads every file
# listed in UCD_FILES and EMOJI_FILES into TMP_DIR with the `wget` executable.
#
# The Unicode and emoji versions are taken from RbConfig of the running Ruby.
#
# @return [true, nil] true once all files were fetched, nil when the user
#   did not answer with something starting with "y"/"Y"
# @raise [KeyError] if RbConfig has no UNICODE_VERSION / UNICODE_EMOJI_VERSION
# @raise [RuntimeError] if wget cannot be started or exits unsuccessfully
def download_ucd_files
  unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
  emoji_version   = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')
  puts 'This will load ucd and emoji data for the CURRENT RUBY '\
       "(#{unicode_version} / #{emoji_version}). Run this on the "\
       'latest Ruby version you want to support. Continue? [y/n]'
  unless $stdin.gets =~ /^y/i
    puts 'download skipped.'
    return
  end

  # BASE_URL ends with a slash; drop it so the joined URLs contain no "//".
  base = BASE_URL.chomp('/')
  urls = UCD_FILES.map { |f| "#{base}/#{unicode_version}/ucd/#{f}" } +
         EMOJI_FILES.map { |f| "#{base}/emoji/#{emoji_version}/#{f}" }

  Dir.chdir(TMP_DIR) do
    urls.each do |url|
      # The argument-list form of system bypasses the shell. It returns nil
      # when wget cannot be executed and false on a non-zero exit status;
      # fail right here instead of later with a missing-file error.
      system('wget', url) || raise("download failed: #{url}")
    end
  end
  true
end
# Builds the set of known property value names in @values and writes them,
# sorted and newline-separated, to RegexpPropertyValues::VALUES_PATH.
#
# The set starts with three hard-coded groups and is then extended with
# names scanned from the downloaded data files.
def write_values
  # posix properties
  posix_names = %w[
    Alpha Blank Cntrl Digit Graph Lower Print
    Punct Space Upper XDigit Word Alnum ASCII
    XPosixPunct
  ]
  # special properties
  special_names = %w[Any Assigned In_No_Block Unknown]
  # legacy properties
  legacy_names = %w[Newline]
  @values = Set.new(posix_names + special_names + legacy_names)

  # Lines of the form "<hex>[..<hex>] ; <name> # ..."; the name is captured.
  range_entry = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
  range_files = %w[
    DerivedCoreProperties.txt
    PropList.txt
    Scripts.txt
    emoji-data.txt
  ]
  range_files.each do |file|
    scan(file, range_entry) { |match| @values.add(match[:prop_name]) }
  end

  # Third field of every line that starts with "gc ;".
  scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |match|
    @values.add(match[:prop_name])
  end

  # Block names get an "In_" prefix and every non-word character becomes "_".
  scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |match|
    @values.add('In_' + match[:block_name].gsub(/\W/, '_'))
  end

  scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |match|
    @values.add('Age=' + match[:age_num])
  end

  File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n"))
end
# Collects [alias, name] pairs in @aliases and writes them, sorted, as
# "alias;name" lines to RegexpPropertyValues::ALIASES_PATH.
#
# A pair is kept only when the name is a known value (see in_values?) and the
# alias is not itself a known value. Relies on @values being populated.
def write_aliases
  @aliases = Set.new

  property_line = /^(?<alias>\w+) *; (?<name>\w+)/
  scan('PropertyAliases.txt', property_line) do |match|
    short_name = match[:alias]
    long_name = match[:name]
    next unless in_values?(long_name)
    next if in_values?(short_name)

    @aliases.add([short_name, long_name])
  end

  # Lines starting with "gc ;" or "sc ;"; a second alias field is optional.
  value_line = /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/
  scan('PropertyValueAliases.txt', value_line) do |match|
    long_name = match[:name]
    next unless in_values?(long_name)

    [match[:alias1], match[:alias2]].each do |candidate|
      # alias2 is nil when the line has no fourth field
      next if candidate.nil? || in_values?(candidate)

      @aliases.add([candidate, long_name])
    end
  end

  serialized = @aliases.sort.map { |pair| pair.join(';') }.join("\n")
  File.write(RegexpPropertyValues::ALIASES_PATH, serialized)
end
# Tells whether +string+ equals any entry of @values, ignoring case.
#
# @param string [String] name to look up
# @return [Boolean]
def in_values?(string)
  @values.each do |known|
    return true if known.casecmp?(string)
  end
  false
end
# Reads +file+ from TMP_DIR and yields the MatchData of every match of
# +pattern+ in its contents.
#
# The contents are decoded as UTF-8 explicitly. With the default (locale
# dependent) external encoding, e.g. US-ASCII under LANG=C, any non-ASCII
# byte in a data file yields an invalid string and regexp matching raises
# ArgumentError.
#
# @param file [String] file name relative to TMP_DIR
# @param pattern [Regexp] pattern to search for
# @yieldparam match [MatchData] one object per match
# @return [String] the contents that were scanned
# @raise [SystemCallError] if the file cannot be read
def scan(file, pattern)
  path = File.join(TMP_DIR, file)
  File.read(path, encoding: 'UTF-8').scan(pattern) { yield(Regexp.last_match) }
end
# Deletes TMP_DIR together with everything inside it. Does nothing when the
# directory is already gone.
def remove_tmp_dir
  FileUtils.rm_r(TMP_DIR, force: true)
end
# Prints how many values and aliases were collected, surrounded by blank
# lines. Reads the sizes of @values and @aliases.
def print_stats
  summary = "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n"
  $stdout.print(summary)
end
end
end
|