File: unidecoder.rb

package info (click to toggle)
ruby-stringex 2.8.5-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,232 kB
  • sloc: ruby: 3,745; makefile: 5
file content (82 lines) | stat: -rw-r--r-- 2,433 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# encoding: UTF-8

require 'yaml'
require 'stringex/localization'

module Stringex
  module Unidecoder
    # Contains Unicode codepoints, loading as needed from YAML files
    CODEPOINTS = Hash.new{|h, k|
      h[k] = ::YAML.load_file(File.join(File.expand_path(File.dirname(__FILE__)), "unidecoder_data", "#{k}.yml"))
    } unless defined?(CODEPOINTS)

    class << self
      # Returns string with its UTF-8 characters transliterated to ASCII ones
      #
      # You're probably better off just using the added String#to_ascii
      def decode(string)
        string.chars.map{|char| decoded(char)}.join
      end

      # Returns character for the given Unicode codepoint
      def encode(codepoint)
        ["0x#{codepoint}".to_i(16)].pack("U")
      end

      # Returns Unicode codepoint for the given character
      def get_codepoint(character)
        "%04x" % character.unpack("U")[0]
      end

      # Returns string indicating which file (and line) contains the
      # transliteration value for the character
      def in_yaml_file(character)
        unpacked = character.unpack("U")[0]
        "#{code_group(unpacked)}.yml (line #{grouped_point(unpacked) + 2})"
      end

    private

      def decoded(character)
        localized(character) || from_yaml(character)
      end

      def localized(character)
        Localization.translate(:transliterations, character)
      end

      def from_yaml(character)
        return character unless character.ord > 128
        unpacked = character.unpack("U")[0]
        CODEPOINTS[code_group(unpacked)][grouped_point(unpacked)]
      rescue
        # Hopefully this won't come up much
        # TODO: Make this note something to the user that is reportable to me perhaps
        "?"
      end

      # Returns the Unicode codepoint grouping for the given character
      def code_group(unpacked_character)
        "x%02x" % (unpacked_character >> 8)
      end

      # Returns the index of the given character in the YAML file for its codepoint group
      def grouped_point(unpacked_character)
        unpacked_character & 255
      end
    end
  end
end

module Stringex
  module StringExtensions
    module PublicInstanceMethods
      # Returns string with its UTF-8 characters transliterated to ASCII ones. Example:
      #
      #   "⠋⠗⠁⠝⠉⠑".to_ascii #=> "france"
      def to_ascii
        Stringex::Unidecoder.decode(self)
      end
    end
  end
end