File: identifier.rb

package info (click to toggle)
ruby-babosa 2.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 440 kB
  • sloc: ruby: 1,586; makefile: 4
file content (268 lines) | stat: -rw-r--r-- 8,384 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# frozen_string_literal: true

module Babosa
  # This class provides some string-manipulation methods specific to slugs.
  #
  # Note that this class includes many "bang methods" such as {#clean!} and
  # {#normalize!} that perform actions on the string in-place. Each of these
  # methods has a corresponding "bangless" method (i.e., +Identifier#clean!+
  # and +Identifier#clean+) which does not appear in the documentation because
  # it is generated dynamically.
  #
  # All of the bang methods return an instance of String, while the bangless
  # versions return a new instance of {Babosa::Identifier} (leaving the
  # receiver untouched), so that calls to methods specific to this class can
  # be chained:
  #
  #   string = Identifier.new("hello world")
  #   string.with_separators  # => <Babosa::Identifier:0x000001013e1590 @wrapped_string="hello-world">
  #   string.with_separators! # => "hello-world"
  #
  # Any method not defined here is forwarded to the wrapped String.
  #
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
  class Identifier
    Error = Class.new(StandardError)

    attr_reader :wrapped_string
    alias to_s wrapped_string

    # Forward every unknown message to the wrapped String.
    def method_missing(symbol, *args, &block)
      @wrapped_string.__send__(symbol, *args, &block)
    end

    # Report the wrapped String's methods as our own, matching {#method_missing}.
    def respond_to_missing?(name, include_all = false)
      @wrapped_string.respond_to?(name, include_all)
    end

    # @param string [#to_s] The string to use as the basis of the Identifier.
    def initialize(string)
      # Work on a private copy so the caller's string is never mutated.
      @wrapped_string = string.to_s.dup
      tidy_bytes!
      normalize_utf8!
    end

    # Two identifiers (or an identifier and anything else) are equal when
    # their string representations are equal.
    def ==(other)
      to_s == other.to_s
    end

    def eql?(other)
      self == other
    end

    # Hash on the wrapped string so that identifiers which are +eql?+ also
    # land in the same Hash bucket.
    #
    # @return [Integer]
    def hash
      to_s.hash
    end

    # Approximate an ASCII string. This works only for strings using characters
    # that are Roman-alphabet characters + diacritics. Non-letter characters
    # are left unmodified.
    #
    #   string = Identifier.new "Łódź, Poland"
    #   string.transliterate                 # => "Lodz, Poland"
    #   string = Identifier.new "日本"
    #   string.transliterate                 # => "日本"
    #
    # You can pass the names of any transliterator class as arguments. This
    # allows for contextual approximations. Various languages are supported,
    # you can see which ones by looking at the source of
    # {Babosa::Transliterator::Base}.
    #
    #   string = Identifier.new "Jürgen Müller"
    #   string.transliterate                 # => "Jurgen Muller"
    #   string.transliterate :german         # => "Juergen Mueller"
    #   string = Identifier.new "¡Feliz año!"
    #   string.transliterate                 # => "¡Feliz ano!"
    #   string.transliterate :spanish        # => "¡Feliz anio!"
    #
    # The approximations are a mapping keyed by character, which you can
    # modify if you choose:
    #
    #   # Make Spanish use "nh" rather than "ni"
    #   Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
    #
    # Notice that this method does not simply convert to ASCII; if you want
    # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
    #
    #   string.transliterate!(:spanish)       # => "¡Feliz anio!"
    #   string.to_ascii!                      # => "Feliz anio!"
    #
    # @param kinds [Array<Symbol, nil>] transliterator names, applied in order;
    #   +nil+ entries are ignored and +:latin+ is used when none remain.
    # @return String
    def transliterate!(*kinds)
      kinds = kinds.compact
      kinds = [:latin] if kinds.empty?
      kinds.each do |kind|
        transliterator = Transliterator.get(kind).instance
        @wrapped_string = transliterator.transliterate(@wrapped_string)
      end
      to_s
    end

    # Collapses every run of dashes and/or spaces into a single space, then
    # removes leading and trailing whitespace.
    #
    # @return String
    def clean!
      gsub!(/[- ]+/, " ")
      strip!
      to_s
    end

    # Remove any non-word characters. For this library's purposes, this means
    # anything other than letters, numbers, spaces, underscores, dashes,
    # newlines, and carriage returns.
    #
    # @return String
    def word_chars!
      # `[^\p{letter}]` = anything that is not a Unicode letter
      # `&&` = intersect with the following character class
      # `[^ \d_\-\n\r]` = anything other than space, digit, underscore, dash,
      #                   newline or carriage return
      gsub!(/[[^\p{letter}]&&[^ \d_\-\n\r]]/, "")
      to_s
    end

    # Normalize the string for use as a URL slug. Note that in this context,
    # +normalize+ means, strip, remove non-letters/numbers, downcasing,
    # truncating to 255 bytes and converting whitespace to dashes.
    #
    # Recognized options (merged over {#default_normalize_options}):
    # * +:transliterate+ - +true+ to transliterate using the kinds listed in
    #   +:transliterations+, a Symbol or Array of Symbols naming the kinds
    #   directly, or a falsy value to skip transliteration.
    # * +:transliterations+ - kinds used when +:transliterate+ is +true+.
    # * +:to_ascii+ - when truthy, delete all non-ASCII characters.
    # * +:max_length+ - maximum length of the result, in bytes.
    # * +:separator+ - string substituted for each whitespace character.
    #
    # @param options [Hash]
    # @return String
    def normalize!(options = {})
      options = default_normalize_options.merge(options)

      if options[:transliterate]
        option = options[:transliterate]
        # Only a literal `true` defers to :transliterations; any other truthy
        # value is itself the list of transliterator kinds.
        if option == true
          transliterate!(*options[:transliterations])
        else
          transliterate!(*option)
        end
      end
      to_ascii! if options[:to_ascii]
      word_chars!
      clean!
      downcase!
      truncate_bytes!(options[:max_length])
      with_separators!(options[:separator])
    end

    # Normalize a string so that it can safely be used as a Ruby method name.
    #
    # @param allow_bangs [Boolean] keep a trailing "!" or "?" from the input.
    # @return String
    # @raise [Error] if nothing is left of the string after normalization.
    def to_ruby_method!(allow_bangs: true)
      # Remember the final character now; word_chars! will strip "!" and "?".
      last_char = self[-1]
      transliterate!
      to_ascii!
      word_chars!
      strip_leading_digits!
      clean!
      @wrapped_string += last_char if allow_bangs && ["!", "?"].include?(last_char)
      raise Error, "Input generates impossible Ruby method name" if self == ""

      with_separators!("_")
    end

    # Delete any non-ascii characters.
    #
    # @return String
    def to_ascii!
      gsub!(/[^\x00-\x7f]/u, "")
      to_s
    end

    # Truncate the string to +max+ characters.
    #
    # @example
    #   "üéøá".to_identifier.truncate(3) #=> "üéø"
    #
    # @param max [Integer] The maximum number of characters.
    # @return String
    def truncate!(max)
      @wrapped_string = slice(0, max)
    end

    # Truncate the string to +max+ bytes. This can be useful for ensuring that
    # a UTF-8 string will always fit into a database column with a certain max
    # byte length. The resulting string may be less than +max+ if the string must
    # be truncated at a multibyte character boundary.
    #
    # @example
    #   "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
    #
    # @param max [Integer] The maximum number of bytes.
    # @return String
    def truncate_bytes!(max)
      # A string of at most +max+ characters is a cheap upper bound; whole
      # characters are then dropped from the end until the byte budget is met.
      truncate!(max)
      chop! until bytesize <= max
      to_s
    end

    # Replaces each whitespace character with the separator (a dash by default).
    #
    # @param char [String] the separator character to use.
    # @return String
    def with_separators!(char = "-")
      gsub!(/\s/u, char)
      to_s
    end

    # Perform Unicode composition (NFC) on the wrapped string.
    #
    # @return String
    def normalize_utf8!
      unicode_normalize!(:nfc)
      to_s
    end

    # Strip any digits at the very beginning of the string. Digits that start
    # a later line of a multi-line string are left alone.
    #
    # @return String
    def strip_leading_digits!
      sub!(/\A\d+/, "")
      to_s
    end

    # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
    # UTF-8. Each invalid byte sequence is re-read as Windows-1252; bytes that
    # still cannot be converted are replaced.
    #
    # @return String
    def tidy_bytes!
      scrub! do |bad|
        bad.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
      end
      to_s
    end

    # Generate the non-mutating counterpart of each bang method. Every
    # generated method runs the bang method on a copy and returns that copy.
    %w[clean downcase normalize normalize_utf8 strip_leading_digits
       tidy_bytes to_ascii transliterate truncate truncate_bytes upcase
       with_separators word_chars].each do |method|
      define_method(method) do |*args|
        with_new_instance { |id| id.send(:"#{method}!", *args) }
      end
    end

    # Non-mutating version of {#to_ruby_method!}.
    #
    # @param allow_bangs [Boolean]
    # @return [Identifier]
    def to_ruby_method(allow_bangs: true)
      with_new_instance { |id| id.to_ruby_method!(allow_bangs: allow_bangs) }
    end

    # @return [Identifier] self
    def to_identifier
      self
    end

    # The default options for {#normalize!}. Override to set your own defaults.
    def default_normalize_options
      {transliterate: :latin, max_length: 255, separator: "-"}
    end

    alias approximate_ascii transliterate
    alias approximate_ascii! transliterate!
    alias with_dashes with_separators
    alias with_dashes! with_separators!
    alias to_slug to_identifier

    private

    # Used as the basis of the non-mutating (bangless) methods.
    #
    # Builds an instance of the receiver's own class (so subclass overrides
    # stay in effect) without re-running #initialize, gives it a private copy
    # of the wrapped string, yields it, and returns it.
    def with_new_instance
      self.class.allocate.tap do |id|
        # Duplicate: the bang methods mutate the string in place, and the
        # receiver must not observe those changes.
        id.instance_variable_set(:@wrapped_string, to_s.dup)

        yield id
      end
    end
  end
end