1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
#!/usr/bin/env ruby -w
# encoding: UTF-8
#
# = UTF8String.rb -- The TaskJuggler III Project Management Software
#
# Copyright (c) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014
# by Chris Schlaeger <cs@taskjuggler.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# NOTE(review): this assigns an ordinary constant named KCODE. Ruby 1.8 took
# its character-set setting from the global variable $KCODE, so this line
# presumably has no effect on string handling -- confirm before relying on it.
KCODE='u'
require 'base64'
# This is an extension and modification of the standard String class. We do a
# lot of UTF-8 character processing in the parser. Ruby 1.8 does not have good
# enough UTF-8 support and Ruby 1.9 only handles UTF-8 characters as Strings.
# This is very inefficient compared to representing them as Integer objects.
# Some of these hacks can be removed once we have switched to 1.9 support
# only.
class String
  if RUBY_VERSION < '1.9.0'
    # Ruby 1.8 only: iterate over the String and call the block once for each
    # UTF-8 character, passing the character as a String. The bytes are
    # decoded by hand, which looks awkward but is noticeably faster than the
    # commonly suggested regexp based implementations.
    def each_utf8_char
      c = ''
      # Number of continuation bytes still expected for the current character.
      pending = 0
      each_byte do |b|
        c << b
        if pending > 0
          # Continuation byte of a multi-byte character.
          if (pending -= 1) == 0
            # Last byte of the character reached.
            yield c
            c = ''
          end
        elsif (b & 0xC0) == 0xC0
          # Lead byte of a multi-byte character. The number of leading 1 bits
          # minus one is the number of continuation bytes that follow.
          pending = -1
          while (b & 0x80) != 0
            pending += 1
            b = b << 1
          end
        else
          # Single byte (ASCII) character.
          yield c
          c = ''
        end
      end
    end

    alias old_double_left_angle <<

    # Ruby 1.8 only: replacement for the built-in << operator that also
    # accepts Integer values above 255. Such a value is treated as up to four
    # packed UTF-8 bytes, most significant byte first. Returns self in every
    # case so that calls can be chained like the built-in operator.
    def <<(obj)
      # Strings and single byte values are handled by the built-in concat.
      return concat(obj) if obj.is_a?(String) || (obj < 256)

      # UTF-8 characters have a maximum length of 4 bytes and no byte is 0,
      # so zero bytes in the packed value are simply skipped.
      mask = 0xFF000000
      pos = 3
      while pos >= 0
        # Append the byte selected by the mask with the built-in concat.
        concat((obj & mask) >> (8 * pos)) if (obj & mask) != 0
        # Move mask and position to the next lower byte.
        mask = mask >> 8
        pos -= 1
      end
      self
    end

    # Ruby 1.8 only: return the number of UTF-8 characters in the String. The
    # built-in length() is deliberately not overridden as we don't know who
    # else uses it for what purpose.
    def length_utf8
      len = 0
      each_utf8_char { |c| len += 1 }
      len
    end

    # Ruby 1.8 only: pad the String on the right with _pad_ until it is _len_
    # UTF-8 characters long. Strings that are already long enough are
    # returned unchanged.
    def ljust(len, pad = ' ')
      # Count the characters only once; every count is a full scan.
      missing = len - length_utf8
      return self + pad * missing if missing > 0
      self
    end

    alias old_reverse reverse

    # Ruby 1.8 only: UTF-8 aware version of reverse that replaces the
    # built-in one. Characters, not bytes, are reversed.
    def reverse
      a = []
      each_utf8_char { |c| a << c }
      a.reverse.join
    end
  else
    # From Ruby 1.9 on the built-in methods are reused under the names the
    # Ruby 1.8 branch defines.
    alias each_utf8_char each_char
    alias length_utf8 length
  end

  # Return the String in quoted-printable encoding with CRLF line ends.
  def to_quoted_printable
    [self].pack('M').gsub(/\n/, "\r\n")
  end

  # Return the String in Base64 encoding as produced by Base64.encode64.
  def to_base64
    Base64.encode64(self)
  end

  # Return a copy of the String with all line ends converted to CRLF. Line
  # ends that already are CRLF are left alone, so the conversion can safely be
  # applied more than once.
  def unix2dos
    gsub(/\r?\n/, "\r\n")
  end

  # Ensure the String is really UTF-8 encoded and newlines are only \n. If
  # that's not possible, an Encoding::UndefinedConversionError is raised. The
  # error message names the first line that could not be converted.
  def forceUTF8Encoding
    # On Ruby 1.8 only the CRLF line ends are cleaned up; no encoding
    # conversion is attempted.
    return gsub(/\r\n/, "\n") if RUBY_VERSION < '1.9.0'

    begin
      # Ensure that the text has LF line ends and is UTF-8 encoded.
      encode('UTF-8', :universal_newline => true)
    rescue => e
      # The encoding of the String is broken. Find the first broken line and
      # report it.
      each_line.with_index(1) do |line, line_no|
        begin
          line.encode('UTF-8')
        rescue
          # Replace the offending bytes so the line can be shown in the
          # error message.
          printable = line.encode('UTF-8', :invalid => :replace,
                                  :undef => :replace, :replace => '<?>')
          raise Encoding::UndefinedConversionError,
                "UTF-8 encoding error in line #{line_no}: #{printable}"
        end
      end
      # No single line could be blamed. Still report the failure instead of
      # silently handing back the unconverted String.
      raise Encoding::UndefinedConversionError,
            "UTF-8 encoding error: #{e.message}"
    end
  end
end
|