# File: updater.rb
#
# Package: ruby-regexp-property-values 1.0.0-2
#   - links: PTS, VCS
#   - area: main
#   - in suites: bookworm, forky, sid, trixie
#   - size: 164 kB
#   - sloc: ruby: 243; ansic: 51; makefile: 6; sh: 4
# File content: 133 lines, mode -rw-r--r--, 3,697 bytes
module RegexpPropertyValues
  module Updater
    module_function

    require 'fileutils'
    require 'set'

    # Base URL that all download URLs are built from.
    BASE_URL = 'http://www.unicode.org/Public/'.freeze

    # Data files fetched from the "<unicode version>/ucd/" directory.
    UCD_FILES = %w[
      Blocks.txt
      DerivedAge.txt
      DerivedCoreProperties.txt
      PropertyAliases.txt
      PropertyValueAliases.txt
      PropList.txt
      Scripts.txt
    ].freeze

    # Data files fetched from the "emoji/<emoji version>/" directory.
    EMOJI_FILES = %w[
      emoji-data.txt
    ].freeze

    # Scratch directory (next to this file) that holds the downloaded files
    # while the value and alias lists are generated.
    TMP_DIR = File.join(__dir__, 'tmp_ucd').freeze

    # Entry point: downloads the Unicode data files into a fresh temp dir,
    # regenerates the value and alias lists from them, and prints a summary.
    #
    # When the user declines the download (download_ucd_files returns nil),
    # nothing is written and no summary is printed. The temp dir is removed
    # in every case, including when a step raises.
    def call
      prepare_tmp_dir
      begin
        # Without downloaded files there is nothing to scan; bail out instead
        # of failing later on a missing file.
        return unless download_ucd_files

        write_values
        write_aliases
      ensure
        remove_tmp_dir
      end
      print_stats
    end

    # Ensures TMP_DIR exists and is empty: a leftover directory from an
    # earlier run is deleted before a new one is created.
    def prepare_tmp_dir
      leftover = File.exist?(TMP_DIR)
      FileUtils.rm_rf(TMP_DIR) if leftover
      FileUtils.mkdir(TMP_DIR)
    end

    # Asks for confirmation, then downloads UCD_FILES and EMOJI_FILES for the
    # running Ruby's Unicode / emoji versions into TMP_DIR using `wget`.
    #
    # Returns nil (after printing a notice) when the user does not answer with
    # "y", or when stdin is at EOF; returns a truthy value after all files
    # were downloaded.
    # Raises RuntimeError when a download fails or `wget` cannot be run, so
    # that a broken download is reported here rather than as a missing-file
    # error further down the line.
    def download_ucd_files
      unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
      emoji_version   = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')
      puts 'This will load ucd and emoji data for the CURRENT RUBY '\
           "(#{unicode_version} / #{emoji_version}). Run this on the "\
           'latest Ruby version you want to support. Continue? [y/n]'
      # gets returns nil at EOF; to_s turns that into a plain "no".
      return puts 'download skipped.' unless $stdin.gets.to_s.match?(/^y/i)

      # BASE_URL may end in a slash; strip it so the URLs contain no "//".
      base = BASE_URL.chomp('/')
      urls = UCD_FILES.map { |f| "#{base}/#{unicode_version}/ucd/#{f}" } +
             EMOJI_FILES.map { |f| "#{base}/emoji/#{emoji_version}/#{f}" }

      Dir.chdir(TMP_DIR) do
        urls.each do |url|
          # Argument-list form of system: no shell involved, and the exit
          # status is checked (false on failure, nil if wget is not runnable).
          system('wget', url) || raise("Download failed: #{url}")
        end
      end
    end

    # Builds @values (a Set of property value names) from a fixed list of
    # names plus the names found in the downloaded data files, then writes
    # them sorted, one per line, to RegexpPropertyValues::VALUES_PATH.
    def write_values
      posix_names = %w[
        Alpha Blank Cntrl Digit Graph Lower Print
        Punct Space Upper XDigit Word Alnum ASCII
        XPosixPunct
      ]
      special_names = %w[Any Assigned In_No_Block Unknown]
      legacy_names  = %w[Newline]
      @values = Set.new(posix_names + special_names + legacy_names)

      # "<codepoint or range> ; <name> # ..." lines
      property_line = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
      property_files = %w[
        DerivedCoreProperties.txt
        PropList.txt
        Scripts.txt
        emoji-data.txt
      ]
      property_files.each do |file|
        scan(file, property_line) { |match| @values.add(match[:prop_name]) }
      end

      # third field of the "gc" lines
      scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |match|
        @values.add(match[:prop_name])
      end

      # block names get an "In_" prefix and underscores for non-word chars
      scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |match|
        @values.add('In_' + match[:block_name].gsub(/\W/, '_'))
      end

      scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |match|
        @values.add('Age=' + match[:age_num])
      end

      sorted_names = @values.sort
      File.write(RegexpPropertyValues::VALUES_PATH, sorted_names.join("\n"))
    end

    # Builds @aliases (a Set of [alias, name] pairs) from the alias data files
    # and writes them sorted, as "alias;name" lines, to
    # RegexpPropertyValues::ALIASES_PATH.
    #
    # A pair is kept only when the name is a known value and the alias itself
    # is not already one (both checked via in_values?).
    def write_aliases
      @aliases = Set.new

      scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |match|
        short = match[:alias]
        long  = match[:name]
        next unless in_values?(long)
        next if in_values?(short)

        @aliases.add([short, long])
      end

      scan('PropertyValueAliases.txt',
        /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |match|
        long = match[:name]
        next unless in_values?(long)

        # alias2 is optional and therefore nil on most lines
        [match[:alias1], match[:alias2]].each do |short|
          @aliases.add([short, long]) if short && !in_values?(short)
        end
      end

      rows = @aliases.sort.map { |entry| entry.join(';') }
      File.write(RegexpPropertyValues::ALIASES_PATH, rows.join("\n"))
    end

    # Returns true when +string+ equals any entry of @values, ignoring case;
    # false otherwise.
    def in_values?(string)
      @values.each { |candidate| return true if candidate.casecmp?(string) }
      false
    end

    # Reads +file+ from TMP_DIR and yields the MatchData of every match of
    # +pattern+ in its content to the given block.
    #
    # The content is decoded as UTF-8 explicitly instead of relying on the
    # default external encoding: under e.g. a C/POSIX locale, non-ASCII bytes
    # in a data file would otherwise make the regexp scan raise an
    # "invalid byte sequence" ArgumentError.
    # NOTE(review): assumes the downloaded data files are UTF-8 encoded.
    def scan(file, pattern)
      path = File.join(TMP_DIR, file)
      File.read(path, encoding: Encoding::UTF_8).scan(pattern) { yield(Regexp.last_match) }
    end

    # Deletes TMP_DIR and everything in it; a missing directory is ignored.
    def remove_tmp_dir
      FileUtils.rm_r(TMP_DIR, force: true)
    end

    # Prints how many values and aliases were collected in @values / @aliases.
    def print_stats
      value_count = @values.size
      alias_count = @aliases.size
      print "\nFetched #{value_count} values and #{alias_count} aliases.\n\n"
    end
  end
end