File: conversion_expressions.rb

package info (click to toggle)
ruby-stringex 2.8.5-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,232 kB
  • sloc: ruby: 3,745; makefile: 5
file content (154 lines) | stat: -rw-r--r-- 4,930 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# encoding: UTF-8

module Stringex
  module Localization
    # Regular expressions (and tables of them) describing the pieces of text
    # that the string conversions look for. Every constant is also exposed
    # through a same-named lower-case module method, e.g.
    # ConversionExpressions.whitespace returns WHITESPACE.
    module ConversionExpressions
      # A run of single letters separated by dots (e.g. "a.b.c."), optionally
      # followed by more letters, delimited by whitespace, parentheses or a
      # line boundary.
      ABBREVIATION = /(\s|\(|^)([[:alpha:]](\.[[:alpha:]])+(\.?)[[:alpha:]]*(\s|\)|$))/

      # Named entity made of a base letter (capture 1) and an accent name
      # (capture 2), e.g. "&eacute;".
      ACCENTED_HTML_ENTITY = /&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/

      # Either a straight apostrophe preceded by a line start or a letter, or
      # a backtick followed by a letter or a line end.
      APOSTROPHE = /(^|[[:alpha:]])'|`([[:alpha:]]|$)/

      # Special characters keyed by name; most patterns also swallow the
      # whitespace around the character.
      CHARACTERS = {
        and:      /\s*&\s*/,
        at:       /\s*@\s*/,
        degrees:  /\s*°\s*/,
        divide:   /\s*÷\s*/,
        dot:      /(\S|^)\.(\S)/,
        ellipsis: /\s*\.{3,}\s*/,
        equals:   /\s*=\s*/,
        number:   /\s*#/,
        percent:  /\s*%\s*/,
        plus:     /\s*\+\s*/,
        # Backslash, slash or FULLWIDTH SOLIDUS (U+FF0F). The last one is
        # written as an escape: a bare "/" here would terminate the regexp
        # literal and leave the group unclosed.
        slash:    /\s*(\\|\/|\uFF0F)\s*/,
        star:     /\s*\*\s*/,
      }

      # Things that just get converted to spaces
      CLEANUP_CHARACTERS = /[\.,:;(){}\[\]\?!\^'ʼ"`~_\|<>]/
      CLEANUP_HTML_ENTITIES = /&[^;]+;/

      # Currency symbols keyed by currency name.
      CURRENCIES_SUPPORTED_SIMPLE = {
        generic: /¤/,
        dollars: /\$/,
        euros:   /€/,
        pounds:  /£/,
        yen:     /¥/,
        reais:   /R\$/
      }
      # Currencies that have a "major.minor" form, mapped to the key used for
      # that form in CURRENCIES_COMPLEX.
      CURRENCIES_SUPPORTED_COMPLEX = {
        dollars: :dollars_cents,
        euros:   :euros_cents,
        pounds:  :pounds_pence,
        reais:   :reais_cents
      }
      # Alternation of every supported currency symbol.
      CURRENCIES_SUPPORTED = Regexp.new(CURRENCIES_SUPPORTED_SIMPLE.values.join('|'))
      # Symbol followed by optional digits (capture 1), delimited by
      # whitespace or a line boundary.
      CURRENCIES_SIMPLE = CURRENCIES_SUPPORTED_SIMPLE.each_with_object({}) do |(key, expression), hash|
        hash[key] = /(?:\s|^)#{expression}(\d*)(?:\s|$)/
      end
      # Symbol followed by "digits.digits" (captures 1 and 2), only for the
      # currencies listed in CURRENCIES_SUPPORTED_COMPLEX.
      CURRENCIES_COMPLEX = CURRENCIES_SUPPORTED_SIMPLE.each_with_object({}) do |(key, expression), hash|
        # Do we really need to not worry about complex currencies if there are none for the currency?
        complex_key = CURRENCIES_SUPPORTED_COMPLEX[key]
        next unless complex_key

        hash[complex_key] = /(?:\s|^)#{expression}(\d+)\.(\d+)(?:\s|$)/
      end
      CURRENCIES = CURRENCIES_SIMPLE.merge(CURRENCIES_COMPLEX)

      # Entity name => /&(alt|alt|...);/. The listed alternatives are regexp
      # fragments, not plain strings: "#822[012]" covers #8220, #8221, #8222.
      HTML_ENTITIES = {
        amp:          %w{#38 amp},
        cent:         %w{#162 cent},
        copy:         %w{#169 copy},
        deg:          %w{#176 deg},
        divide:       %w{#247 divide},
        double_quote: %w{#34 #822[012] quot ldquo rdquo dbquo},
        ellipsis:     %w{#8230 hellip},
        en_dash:      %w{#8211 ndash},
        em_dash:      %w{#8212 mdash},
        frac14:       %w{#188 frac14},
        frac12:       %w{#189 frac12},
        frac34:       %w{#190 frac34},
        gt:           %w{#62 gt},
        lt:           %w{#60 lt},
        nbsp:         %w{#160 nbsp},
        pound:        %w{#163 pound},
        reg:          %w{#174 reg},
        single_quote: %w{#39 #821[678] apos lsquo rsquo sbquo},
        times:        %w{#215 times},
        trade:        %w{#8482 trade},
        yen:          %w{#165 yen},
      }.each_with_object({}) do |(key, alternatives), hash|
        hash[key] = /&(#{alternatives.join('|')});/
      end

      # An opening, closing, self-closing or comment-like tag with optional
      # attributes. Built inside a lambda so the helper sub-patterns do not
      # leak into the module body as local variables.
      HTML_TAG = lambda {
        name = /[\w:-]+/
        value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
        attr = /(#{name}(\s*=\s*#{value})?)/
        /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/
      }.call

      # Typographic punctuation pattern => plain ASCII replacement.
      SMART_PUNCTUATION = {
        /(“|”|\302\223|\302\224|\303\222|\303\223)/ => '"',
        /(‘|’|\302\221|\302\222|\303\225)/ => "'",
        /…/ => "...",
      }

      UNREADABLE_CONTROL_CHARACTERS = /[[:cntrl:]]/

      # Ordered by denominator then numerator of the value
      VULGAR_FRACTIONS = {
        half:          /(&#189;|&frac12;|½)/,
        one_third:     /(&#8531;|⅓)/,
        two_thirds:    /(&#8532;|⅔)/,
        one_fourth:    /(&#188;|&frac14;|¼)/,
        three_fourths: /(&#190;|&frac34;|¾)/,
        one_fifth:     /(&#8533;|⅕)/,
        two_fifths:    /(&#8534;|⅖)/,
        three_fifths:  /(&#8535;|⅗)/,
        four_fifths:   /(&#8536;|⅘)/,
        one_sixth:     /(&#8537;|⅙)/,
        five_sixths:   /(&#8538;|⅚)/,
        one_eighth:    /(&#8539;|⅛)/,
        three_eighths: /(&#8540;|⅜)/,
        five_eighths:  /(&#8541;|⅝)/,
        seven_eighths: /(&#8542;|⅞)/,
      }

      WHITESPACE = /\s+/

      class << self
        # Define one reader per constant: the method name is the constant
        # name in lower case, and it simply returns that constant.
        %w{
          abbreviation
          accented_html_entity
          apostrophe
          characters
          cleanup_characters
          cleanup_html_entities
          currencies
          currencies_simple
          currencies_complex
          html_entities
          html_tag
          smart_punctuation
          unreadable_control_characters
          vulgar_fractions
          whitespace
        }.each do |conversion_type|
          define_method(conversion_type) do
            const_get(conversion_type.upcase)
          end
        end
      end
    end
  end
end