File: sid.rb

package info (click to toggle)
ruby-unicode-utils 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 1,988 kB
  • sloc: ruby: 1,877; makefile: 4
file content (63 lines) | stat: -rw-r--r-- 2,272 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- encoding: utf-8 -*-

require "unicode_utils/name_aliases"
require "unicode_utils/code_point_type"

module UnicodeUtils

  # Lookup table from code point to the name of its preferred alias.
  # For each entry of NAME_ALIASES_MAP the first alias whose type is
  # :correction, :control, :figment or :alternate (tried in that
  # order) is selected; entries without such an alias are left out.
  CP_PREFERRED_ALIAS_STRING_MAP = {}.tap do |preferred_names|
    # Alias types listed from highest to lowest priority.
    alias_types_by_priority = [:correction, :control, :figment, :alternate]
    NAME_ALIASES_MAP.each do |code_point, aliases|
      chosen = nil
      alias_types_by_priority.each do |wanted_type|
        chosen = aliases.find { |candidate| candidate.type == wanted_type }
        # Stop at the first type that has an alias so that higher
        # priority types always win.
        break if chosen
      end
      preferred_names[code_point] = chosen.name if chosen
    end
  end #:nodoc:

  # Get a unique string identifier for +code_point+, which must be an
  # Integer. Every code point has such an identifier; nil is returned
  # if +code_point+ lies outside of the Unicode codespace.
  #
  # The identifier is one of:
  #
  # * the non-empty Name property value of +code_point+,
  # * a non-empty Name_Alias string property value of +code_point+,
  # * the code point label as described by section "Code Point
  #   Labels" in chapter 4.8 "Name" of the Unicode standard.
  #
  # An identifier that starts with "<" is a code point label and also
  # ends with ">". Any other identifier is the normative name or a
  # formal alias string.
  #
  # The exact name/alias/label selection algorithm may change even in
  # minor UnicodeUtils releases, but overall behaviour will stay the
  # same in spirit.
  #
  # This version of UnicodeUtils selects the identifier as follows:
  # 1. Use an alias of type :correction, :control, :figment or
  #    :alternate (with listed precedence) if available
  # 2. Use the Unicode Name property value if it is not empty
  # 3. Construct a code point label in angle brackets.
  #
  # Examples:
  #
  #     require "unicode_utils/sid"
  #
  #     U.sid 0xa     # => "LINE FEED"
  #     U.sid 0x0     # => "NULL"
  #     U.sid 0xfeff  # => "BYTE ORDER MARK"
  #     U.sid 0xe000  # => "<private-use-E000>"
  #     U.sid 0x61    # => "LATIN SMALL LETTER A"
  #     U.sid -1      # => nil
  def sid(code_point)
    # Step 1: a preferred alias wins over everything else.
    preferred_alias = CP_PREFERRED_ALIAS_STRING_MAP[code_point]
    return preferred_alias if preferred_alias

    # Step 2: the Name property value, unless it is empty or starts
    # with "<" (such values are not usable as identifiers here).
    name = UnicodeUtils.char_name(code_point)
    if name && !(name.empty? || name.start_with?("<"))
      return name
    end

    # Step 3: build a label from the code point type; no type means
    # no identifier can be produced.
    type = UnicodeUtils.code_point_type(code_point)
    return nil unless type

    label_prefix = type.to_s.downcase.tr("_", "-")
    # Upper case hex, zero padded to at least four digits.
    hex_digits = code_point.to_s(16).upcase.rjust(4, "0")
    "<#{label_prefix}-#{hex_digits}>"
  end
  module_function :sid

end