| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 
 | #!/usr/bin/env ruby
# from https://github.com/rails/rails/blob/master/activesupport/bin/generate_tables
# under MIT license
# Usage:
#   ruby tasks/generate_tables
# The Unicode version downloaded is determined by the UNICODE_VERSION lib/mail/multibye/unicode.rb
require './lib/mail/multibyte/unicode'
require 'open-uri'
require 'tmpdir'
module Mail
  module Multibyte
    module Unicode
      class UnicodeDatabase
        def load; end
      end
      class DatabaseGenerator
        BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
        SOURCES = {
          :codepoints => BASE_URI + 'UnicodeData.txt',
          :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
          :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
          :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
        }
        def initialize
          @ucd = Unicode::UnicodeDatabase.new
        end
        def parse_codepoints(line)
          codepoint = Codepoint.new
          raise "Could not parse input." unless line =~ /^
            ([0-9A-F]+);        # code
            ([^;]+);            # name
            ([A-Z]+);           # general category
            ([0-9]+);           # canonical combining class
            ([A-Z]+);           # bidi class
            (<([A-Z]*)>)?       # decomposition type
            ((\ ?[0-9A-F]+)*);  # decomposition mapping
            ([0-9]*);           # decimal digit
            ([0-9]*);           # digit
            ([^;]*);            # numeric
            ([YN]*);            # bidi mirrored
            ([^;]*);            # unicode 1.0 name
            ([^;]*);            # iso comment
            ([0-9A-F]*);        # simple uppercase mapping
            ([0-9A-F]*);        # simple lowercase mapping
            ([0-9A-F]*)$/ix     # simple titlecase mapping
          codepoint.code              = $1.hex
          #codepoint.name              = $2
          #codepoint.category          = $3
          codepoint.combining_class   = Integer($4)
          #codepoint.bidi_class        = $5
          codepoint.decomp_type       = $7
          codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
          #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
          codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
          codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
          #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
          @ucd.codepoints[codepoint.code] = codepoint
        end
        def parse_grapheme_break_property(line)
          if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/
            type = $2.downcase.intern
            @ucd.boundary[type] ||= []
            if $1.include? '..'
              parts = $1.split '..'
              @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
            else
              @ucd.boundary[type] << $1.hex
            end
          end
        end
        def parse_composition_exclusion(line)
          if line =~ /^([0-9A-F]+)/i
            @ucd.composition_exclusion << $1.hex
          end
        end
        def parse_cp1252(line)
          if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
            @ucd.cp1252[$1.hex] = $2.hex
          end
        end
        def create_composition_map
          @ucd.codepoints.each do |_, cp|
            if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
              @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
              @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
            end
          end
        end
        def normalize_boundary_map
          @ucd.boundary.each do |k,v|
            if [:lf, :cr].include? k
              @ucd.boundary[k] = v[0]
            end
          end
        end
        def parse
          SOURCES.each do |type, url|
            filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
            unless File.exist?(filename)
              $stderr.puts "Downloading #{url.split('/').last}"
              File.open(filename, 'wb') do |target|
                open(url) do |source|
                  source.each_line { |line| target.write line }
                end
              end
            end
            File.open(filename) do |file|
              file.each_line { |line| send "parse_#{type}".intern, line }
            end
          end
          create_composition_map
          normalize_boundary_map
        end
        def dump_to(filename)
          File.open(filename, 'wb') do |f|
            f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
          end
        end
      end
    end
  end
end
if __FILE__ == $0
  filename = Mail::Multibyte::Unicode::UnicodeDatabase.filename
  generator = Mail::Multibyte::Unicode::DatabaseGenerator.new
  generator.parse
  print "Writing to: #{filename}"
  generator.dump_to filename
  puts " (#{File.size(filename)} bytes)"
end
 |