
|
/* Copyright 2016 Software Freedom Conservancy Inc.
*
* This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution.
*/
// Default length limit for user-entered text.
// NOTE(review): presumably passed as the dest_length argument of prepare_input_text() by
// callers outside this file, in which case it is a limit in bytes -- confirm against callers.
public const int DEFAULT_USER_TEXT_INPUT_LENGTH = 1024;
// Returns true when the string is null or has no content (its first byte is the NUL
// terminator), false otherwise.
public inline bool is_string_empty(string? s) {
    if (s == null)
        return true;

    return s[0] == '\0';
}
// Case-sensitive ordering of two UTF-8 strings. The result is whatever string.collate()
// reports for (a, b): zero when they collate as equal, otherwise the sign gives the order.
public int utf8_cs_compare(string a, string b) {
    int ordering = a.collate(b);

    return ordering;
}
// Case-insensitive ordering of two UTF-8 strings: both operands are lower-cased with
// string.down() and the lower-cased copies are then compared with string.collate().
public int utf8_ci_compare(string a, string b) {
    string a_lower = a.down();
    string b_lower = b.down();

    return a_lower.collate(b_lower);
}
// Builds a string from an array of bytes.
//
// data:   the bytes to copy; copying stops early at the first NUL byte.
// length: the maximum number of bytes to read from data. A negative value (the default) or a
//         value larger than the array means "the whole array".
//
// Returns a newly-built string holding the copied bytes. The bytes are appended as-is; no
// UTF-8 validation is performed here.
public string uchar_array_to_string(uchar[] data, int length = -1) {
    // Never read past the end of the array, whatever length the caller asked for.
    if (length < 0 || length > data.length)
        length = data.length;

    StringBuilder builder = new StringBuilder();
    for (int ctr = 0; ctr < length && data[ctr] != '\0'; ctr++)
        builder.append_c((char) data[ctr]);

    return builder.str;
}
// Copies the bytes of a string (excluding the terminating NUL) into a new uchar array.
//
// str: the string to copy; an empty string yields a zero-length array.
//
// Returns a new array whose length equals the byte length of str.
public uchar[] string_to_uchar_array(string str) {
    // string.length is a strlen() call, so evaluate it once instead of on every iteration,
    // and size the array up front rather than growing it one byte at a time.
    int byte_count = str.length;
    uchar[] data = new uchar[byte_count];
    for (int ctr = 0; ctr < byte_count; ctr++)
        data[ctr] = (uchar) str[ctr];

    return data;
}
// Safe wrapper around Markup.escape_text().
//
// Markup.escape_text() walks its input with g_utf8_next_char(), which requires already-validated
// UTF-8, yet escape_text() performs no validation itself and will crash on invalid text. This
// wrapper validates first and hands back an empty string when the text is not valid UTF-8.
// Text should still be validated when it enters the system to guard against this problem.
//
// A null or empty string is accepted and also produces an empty string.
public inline string guarded_markup_escape_text(string? plain) {
    if (is_string_empty(plain) || !plain.validate())
        return "";

    return Markup.escape_text(plain);
}
// Scans str backwards, byte by byte, for the character c.
//
// Returns the byte offset of the last occurrence of c, or -1 when c does not occur (which
// includes the case of an empty string).
public long find_last_offset(string str, char c) {
    for (long pos = str.length - 1; pos >= 0; pos--) {
        if (str[pos] == c)
            return pos;
    }

    return -1;
}
// Case-insensitive membership test: lower-cases str and looks for an exact match among the
// entries of strings. The entries themselves are compared untouched, so the array is expected
// to contain lowercase strings only.
//
// Returns true when a matching entry exists, false otherwise (including for an empty array).
public bool is_in_ci_array(string str, string[] strings) {
    string needle = str.down();
    for (int i = 0; i < strings.length; i++) {
        if (strings[i] == needle)
            return true;
    }

    return false;
}
// Bit flags selecting which clean-up steps prepare_input_text() applies to its input.
[Flags]
public enum PrepareInputTextOptions {
// Return null instead of an empty string when the prepared text ends up empty.
EMPTY_IS_NULL,
// Check that the text is valid UTF-8; if it is not, attempt a guessed charset conversion.
VALIDATE,
// When VALIDATE is set and the text can be neither validated nor converted, return null
// rather than an empty string.
INVALID_IS_NULL,
// Apply string.strip() to the text.
STRIP,
// Replace every newline and carriage return character with a space.
STRIP_CRLF,
// Normalize the text to Unicode NFC (composed) form.
NORMALIZE,
// All of the above steps combined.
DEFAULT = EMPTY_IS_NULL | VALIDATE | INVALID_IS_NULL | STRIP_CRLF | STRIP | NORMALIZE;
}
// Attempts to turn text that failed UTF-8 validation into UTF-8 by guessing its source
// encoding. The candidates are tried in this order: the current locale's charset (only when
// the locale is not UTF-8), then WINDOWS-1252, and -- only when the WINDOWS-1252 converter is
// unavailable -- ISO-8859-1.
//
// An attempt counts as successful when it consumed every byte of the input
// (bytes_read == text.length).
//
// Returns the converted string, or null when no attempt consumed the whole input.
private string? guess_convert(string text) {
string? output = null;
size_t bytes_read = 0;
unowned string charset = null;
debug ("CONVERT: Text did not validate as UTF-8, trying conversion");
// Try with locale. get_charset() returns true when the locale charset already is UTF-8, in
// which case a locale conversion is pointless and this step is skipped.
if (!GLib.get_charset(out charset)) {
output = text.locale_to_utf8(text.length, out bytes_read, null, null);
// Accept the locale conversion only if it read the entire input.
if (bytes_read == text.length) {
debug ("CONVERT: Locale is not UTF-8, convert from %s", charset);
return output;
}
}
try {
output = GLib.convert (text, text.length, "UTF-8", "WINDOWS-1252", out bytes_read);
charset = "WINDOWS-1252";
} catch (ConvertError error) {
// Fall back to ISO-8859-1 only when the WINDOWS-1252 conversion is not supported at all;
// any other conversion error is left to the bytes_read check below.
if (error is ConvertError.NO_CONVERSION) {
try {
output = GLib.convert (text, text.length, "UTF-8", "ISO-8859-1", out bytes_read);
charset = "ISO-8859-1";
} catch (Error error) { /* do nothing */ }
}
}
// Only a conversion that consumed the whole input is considered a usable guess.
if (bytes_read == text.length) {
debug ("CONVERT: Guessed conversion from %s", charset);
return output;
}
return null;
}
// Cleans up text before it is used elsewhere, applying the steps selected by options in
// this fixed order: validate (with a guessed charset conversion as fallback), normalize to
// NFC, strip, replace CR/LF with spaces, map empty to null, and finally limit the length.
//
// text:        the text to prepare; null is passed straight through as null.
// options:     the PrepareInputTextOptions flags selecting which steps run.
// dest_length: maximum length of the result in bytes; any negative value means
//              "do not truncate". Truncation never splits a multi-byte UTF-8 character, so
//              the result may be up to three bytes shorter than dest_length.
//
// Returns the prepared text. Returns null when text is null, when the result is empty and
// EMPTY_IS_NULL is set, or when the text is invalid and INVALID_IS_NULL is set (invalid text
// yields "" when INVALID_IS_NULL is not set).
public string? prepare_input_text(string? text, PrepareInputTextOptions options, int dest_length) {
    if (text == null)
        return null;

    string? prepped = text;

    if ((PrepareInputTextOptions.VALIDATE in options) && !text.validate()) {
        prepped = guess_convert(text);
        if (prepped == null)
            return (PrepareInputTextOptions.INVALID_IS_NULL in options) ? null : "";
    }

    // Using composed form rather than GLib's default (decomposed) as NFC is the preferred form in
    // Linux and WWW. More importantly, Pango seems to have serious problems displaying decomposed
    // forms of Korean language glyphs (and perhaps others). See:
    // https://bugzilla.gnome.org/show_bug.cgi?id=716914
    if (PrepareInputTextOptions.NORMALIZE in options) {
        // g_utf8_normalize() yields NULL for invalid UTF-8, which can reach this point when
        // VALIDATE is not requested; treat that like any other invalid input instead of
        // dereferencing a null string below.
        string? normalized = prepped.normalize(-1, NormalizeMode.NFC);
        if (normalized == null)
            return (PrepareInputTextOptions.INVALID_IS_NULL in options) ? null : "";

        prepped = normalized;
    }

    if (PrepareInputTextOptions.STRIP in options)
        prepped = prepped.strip();

    // Ticket #3245 - Prevent carriage return mayhem
    // in image titles, tag names, etc.
    if (PrepareInputTextOptions.STRIP_CRLF in options)
        prepped = prepped.delimit("\n\r", ' ');

    if ((PrepareInputTextOptions.EMPTY_IS_NULL in options) && is_string_empty(prepped))
        return null;

    // Ticket #3196 - Allow calling functions to limit the length of the
    // string we return to them. Passing any negative value is interpreted
    // as 'do not truncate'.
    if (dest_length >= 0 && dest_length < prepped.length) {
        // The limit is in bytes. If the byte at the cut position is a UTF-8 continuation byte
        // (10xxxxxx), cutting there would split a character and produce invalid UTF-8, so back
        // up to the start of that character.
        int cut = dest_length;
        while (cut > 0 && (((uint8) prepped[cut]) & 0xC0) == 0x80)
            cut--;

        return prepped.substring(0, cut);
    }

    // otherwise, return normally.
    return prepped;
}
// Miscellaneous string helpers: character search, zero/diacritic stripping, hex dumping, and
// collation-aware hash/compare/equality functions.
namespace String {
    // Returns true when the Unicode character needle occurs anywhere in haystack.
    public inline bool contains_char(string haystack, unichar needle) {
        return haystack.index_of_char(needle) >= 0;
    }

    // Removes the run of '0' characters at the start of the string and after every whitespace
    // character; all other characters are copied unchanged.
    //
    // Note that this method currently turns a word of all zeros into empty space ("000" -> "")
    public string strip_leading_zeroes(string str) {
        StringBuilder stripped = new StringBuilder();
        // True at the start of the string and right after whitespace, i.e. wherever a zero
        // would be a *leading* zero. A skipped zero leaves the flag untouched.
        bool prev_is_space = true;
        for (unowned string iter = str; iter.get_char() != 0; iter = iter.next_char()) {
            unichar ch = iter.get_char();
            if (prev_is_space && ch == '0')
                continue;

            stripped.append_unichar(ch);
            prev_is_space = ch.isspace();
        }

        return stripped.str;
    }

    // Returns a copy of istring with diacritics removed. The string is first normalized with
    // GLib's default (decomposed) mode so that accents become separate mark characters, then
    // every control, format, unassigned and mark character is dropped.
    //
    // Returns an empty string when istring cannot be normalized (i.e. is not valid UTF-8).
    public string remove_diacritics(string istring) {
        var builder = new StringBuilder();

        // Normalize once, up front: doing it in the loop condition re-normalized (and
        // re-allocated) the entire string for every character.
        string? decomposed = istring.normalize();
        if (decomposed == null)
            return builder.str;

        unichar ch;
        int i = 0;
        while (decomposed.get_next_char(ref i, out ch)) {
            switch (ch.type()) {
                case UnicodeType.CONTROL:
                case UnicodeType.FORMAT:
                case UnicodeType.UNASSIGNED:
                case UnicodeType.NON_SPACING_MARK:
                case UnicodeType.COMBINING_MARK:
                case UnicodeType.ENCLOSING_MARK:
                    // Ignore those
                    continue;

                default:
                    break;
            }

            builder.append_unichar(ch);
        }

        return builder.str;
    }

    // Renders every byte of str as two upper-case hex digits followed by 'h', with single
    // spaces between bytes (e.g. "AB" -> "41h 42h"). An empty string yields an empty string.
    public string to_hex_string(string str) {
        StringBuilder builder = new StringBuilder();

        uint8 *data = (uint8 *) str;
        while (*data != 0) {
            // Read the current byte and advance in separate statements so that the separator
            // test below unambiguously looks at the *next* byte.
            uint8 current = *data;
            data++;
            builder.append_printf("%02Xh%s", current, (*data != 0) ? " " : "");
        }

        return builder.str;
    }

    // A note on the collated_* and precollated_* methods:
    //
    // A bug report (https://bugzilla.gnome.org/show_bug.cgi?id=717135) indicated that two different
    // Hiragana characters as Tag names would trigger an assertion. Investigation showed that the
    // characters' collation keys computed as equal when the locale was set to anything but the
    // default locale (C) or Japanese. A related bug was that another hash table was using
    // str_equal, which does not use collation, meaning that in one table the strings were seen as
    // the same and in another as different.
    //
    // The solution we arrived at is to use collation whenever possible, but if two strings have the
    // same collation, then fall back on strcmp(), which looks for byte-for-byte comparisons. Note
    // that this technique requires that both strings have been properly composed (use
    // prepare_input_text() for that task) so that equal UTF-8 strings are byte-for-byte equal as
    // well.

    // Hashes the collation key of the string that ptr points to. See note above.
    public uint collated_hash(void *ptr) {
        // unowned: the string is only read, so there is no need to copy it.
        unowned string str = (string) ptr;

        return str_hash(str.collate_key());
    }

    // Hashes the string that ptr points to as-is; ptr is expected to already be a collation
    // key. See note above.
    public uint precollated_hash(void *ptr) {
        return str_hash((string) ptr);
    }

    // Orders two strings by collation, falling back on strcmp() when they collate as equal.
    // See note above.
    public int collated_compare(void *a, void *b) {
        // unowned: the strings are only read, so there is no need to copy them.
        unowned string astr = (string) a;
        unowned string bstr = (string) b;

        int result = astr.collate(bstr);

        return (result != 0) ? result : strcmp(astr, bstr);
    }

    // Like collated_compare(), but compares the supplied precomputed collation keys (akey,
    // bkey) with strcmp() instead of collating astr and bstr. See note above.
    public int precollated_compare(string astr, string akey, string bstr, string bkey) {
        int result = strcmp(akey, bkey);

        return (result != 0) ? result : strcmp(astr, bstr);
    }

    // True when collated_compare() considers the two strings equal. See note above.
    public bool collated_equals(void *a, void *b) {
        return collated_compare(a, b) == 0;
    }

    // True when precollated_compare() considers the two strings equal. See note above.
    public bool precollated_equals(string astr, string akey, string bstr, string bkey) {
        return precollated_compare(astr, akey, bstr, bkey) == 0;
    }
}
|