File: compatibility_decomposition.rb

package info (click to toggle)
ruby-unicode-utils 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 1,988 kB
  • sloc: ruby: 1,877; makefile: 4
file content (55 lines) | stat: -rw-r--r-- 1,683 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- encoding: utf-8 -*-

require "unicode_utils/read_cdata"
require "unicode_utils/canonical_decomposition"
require "unicode_utils/hangul_syllable_decomposition"

module UnicodeUtils

  # Table loaded through Impl.read_multivalued_map from the data set
  # named "compatibility_decomposition_map". The code in this file
  # looks it up by integer code point and iterates the value it finds
  # with +each+; a code point without an entry is treated as having
  # no compatibility mapping.
  COMPATIBILITY_DECOMPOSITION_MAP =
    Impl.read_multivalued_map("compatibility_decomposition_map") # :nodoc:

  # Get the compatibility decomposition of the given string, also
  # called Normalization Form KD or short NFKD.
  #
  # Compatibility decomposition decomposes more code points than
  # canonical decomposition and contrary to Normalization Form D and
  # C, this normalization can alter how a string is displayed.
  #
  # Example:
  #
  #   require "unicode_utils/compatibility_decomposition"
  #   # LATIN SMALL LIGATURE FI => LATIN SMALL LETTER F, LATIN SMALL LETTER I
  #   UnicodeUtils.compatibility_decomposition("ﬁ") => "fi"
  #
  # See also: UnicodeUtils.nfkd
  def compatibility_decomposition(str)
    # The output buffer starts empty and carries the encoding of the input.
    out = String.new.force_encoding(str.encoding)
    str.each_codepoint do |point|
      case point
      when 0xAC00..0xD7A3
        # Hangul syllable: handled by the dedicated Hangul helper
        # instead of the mapping tables.
        Impl.append_hangul_syllable_decomposition(out, point)
      else
        Impl.append_recursive_compatibility_decomposition_mapping(out, point)
      end
    end
    # Whatever the reordering helper returns is the result of this method.
    Impl.put_into_canonical_order(out)
  end
  module_function :compatibility_decomposition

  module Impl # :nodoc:

    # Append the fully expanded decomposition of the code point +cp+
    # to +str+.
    #
    # COMPATIBILITY_DECOMPOSITION_MAP is consulted first and
    # CANONICAL_DECOMPOSITION_MAP only when the former has no entry
    # for +cp+. Each code point of a found mapping is expanded again
    # by this same method, so nested mappings are resolved. A code
    # point with no entry in either map is appended to +str+ as is.
    #
    # Returns +str+ when +cp+ was appended directly, otherwise the
    # mapping that was iterated.
    def self.append_recursive_compatibility_decomposition_mapping(str, cp)
      expansion = COMPATIBILITY_DECOMPOSITION_MAP[cp] || CANONICAL_DECOMPOSITION_MAP[cp]
      return str << cp unless expansion

      expansion.each do |part|
        append_recursive_compatibility_decomposition_mapping(str, part)
      end
    end

  end

end