File: string.vala

package info (click to toggle)
shotwell 0.32.13-2
links: PTS, VCS
area: main
in suites: forky, sid
size: 57,512 kB
sloc: xml: 55,555; cpp: 354; ansic: 157; python: 130; sh: 46; makefile: 12
file content (282 lines) | stat: -rw-r--r-- 8,905 bytes
parent folder | download | duplicates (2)
/* Copyright 2016 Software Freedom Conservancy Inc.
 *
 * This software is licensed under the GNU Lesser General Public License
 * (version 2.1 or later).  See the COPYING file in this distribution.
 */

// Default length limit for user-entered text.
// NOTE(review): presumably passed as dest_length to prepare_input_text(); confirm against callers.
public const int DEFAULT_USER_TEXT_INPUT_LENGTH = 1024;

// Returns true when s is null or contains no bytes before its terminator.
public inline bool is_string_empty(string? s) {
    if (s == null)
        return true;
    
    return s.length == 0;
}

// Case-sensitive comparison of two UTF-8 strings via string.collate().
// Returns a negative value, zero or a positive value when a sorts before,
// equal to or after b.
public int utf8_cs_compare(string a, string b) {
    int ordering = a.collate(b);
    
    return ordering;
}

// Case-insensitive comparison of two UTF-8 strings: both operands are
// lower-cased with string.down() and the results are collated.
public int utf8_ci_compare(string a, string b) {
    string lowered_a = a.down();
    string lowered_b = b.down();
    
    return lowered_a.collate(lowered_b);
}

// Builds a string from a byte array.
//
// data:   the bytes to copy; copying stops at the first NUL byte.
// length: maximum number of bytes to consider.  A negative value, or a value
//         larger than the array itself, means "the whole array" so the loop
//         can never read past the end of data.
public string uchar_array_to_string(uchar[] data, int length = -1) {
    if (length < 0 || length > data.length)
        length = data.length;
    
    StringBuilder builder = new StringBuilder();
    for (int ctr = 0; ctr < length && data[ctr] != '\0'; ctr++)
        builder.append_c((char) data[ctr]);
    
    return builder.str;
}

// Copies the bytes of str (without the terminating NUL) into a new uchar array.
public uchar[] string_to_uchar_array(string str) {
    // string.length is a strlen() call, so evaluate it once instead of on every
    // iteration, and size the array up front rather than growing it per byte.
    int byte_count = str.length;
    uchar[] data = new uchar[byte_count];
    for (int ctr = 0; ctr < byte_count; ctr++)
        data[ctr] = (uchar) str[ctr];
    
    return data;
}

// Safe wrapper around Markup.escape_text().
//
// Markup.escape_text() will crash if handed invalid UTF-8: it relies on
// g_utf8_next_char(), which requires validated input, and escape_text() does
// not validate for itself.  This wrapper returns an empty string instead when
// the text does not validate.  Text should still be validated on entry to the
// system to guard against this problem.
//
// Null and empty strings are accepted; both yield an empty string.
public inline string guarded_markup_escape_text(string? plain) {
    if (is_string_empty(plain))
        return "";
    
    if (!plain.validate())
        return "";
    
    return Markup.escape_text(plain);
}

// Returns the byte offset of the last occurrence of c in str, or -1 when c
// does not occur.  The scan runs backwards from the final byte.
public long find_last_offset(string str, char c) {
    for (long pos = str.length - 1; pos >= 0; pos--) {
        if (str[pos] == c)
            return pos;
    }
    
    return -1;
}

// Case-insensitive membership test: lower-cases str and reports whether the
// result matches any element of strings exactly.  The caller must supply an
// array whose entries are already all lowercase.
public bool is_in_ci_array(string str, string[] strings) {
    return str.down() in strings;
}

// Bit flags selecting which clean-up steps prepare_input_text() performs.
[Flags]
public enum PrepareInputTextOptions {
    // Return null instead of an empty string when the prepared text is empty.
    EMPTY_IS_NULL,
    // Check that the text is valid UTF-8; if it is not, try to convert it from
    // a guessed legacy charset.
    VALIDATE,
    // When validation and conversion both fail, return null rather than "".
    INVALID_IS_NULL,
    // Strip the text with string.strip().
    STRIP,
    // Replace carriage returns and line feeds with spaces.
    STRIP_CRLF,
    // Normalize the text to Unicode NFC (composed) form.
    NORMALIZE,
    // All of the above.
    DEFAULT = EMPTY_IS_NULL | VALIDATE | INVALID_IS_NULL | STRIP_CRLF | STRIP | NORMALIZE;
}

// Tries to turn text that failed UTF-8 validation into UTF-8 by guessing its
// original encoding.  The attempts, in order, are:
//   1. the locale charset, when the locale is not itself UTF-8;
//   2. WINDOWS-1252;
//   3. ISO-8859-1, only when no WINDOWS-1252 converter is available.
// A guess is accepted only if the conversion consumed every input byte.
// Returns the converted text, or null when no guess succeeded.
private string? guess_convert(string text) {
    string? converted = null;
    size_t consumed = 0;
    unowned string source_charset = null;
    debug ("CONVERT: Text did not validate as UTF-8, trying conversion");

    // get_charset() reports whether the locale charset is UTF-8; only a
    // non-UTF-8 locale is worth trying as the source encoding.
    bool locale_is_utf8 = GLib.get_charset(out source_charset);
    if (!locale_is_utf8) {
        converted = text.locale_to_utf8(text.length, out consumed, null, null);
        if (consumed == text.length) {
            debug ("CONVERT: Locale is not UTF-8, convert from %s", source_charset);
            return converted;
        }
    }

    try {
        converted = GLib.convert (text, text.length, "UTF-8", "WINDOWS-1252", out consumed);
        source_charset = "WINDOWS-1252";
    } catch (ConvertError primary_error) {
        // Fall back to ISO-8859-1 only when the WINDOWS-1252 converter itself
        // is missing; any other failure means the guess was wrong.
        if (primary_error is ConvertError.NO_CONVERSION) {
            try {
                converted = GLib.convert (text, text.length, "UTF-8", "ISO-8859-1", out consumed);
                source_charset = "ISO-8859-1";
            } catch (Error fallback_error) { /* do nothing */ }
        }
    }

    // Anything short of a full read is treated as a failed guess.
    if (consumed != text.length)
        return null;

    debug ("CONVERT: Guessed conversion from %s", source_charset);

    return converted;
}

// Cleans up externally supplied text according to options.
//
// text:        the raw text; null is passed straight through as null.
// options:     which steps to apply (see PrepareInputTextOptions).
// dest_length: maximum length of the result in bytes.  Any negative value is
//              interpreted as 'do not truncate' (Ticket #3196).
//
// Returns the prepared text.  Null is returned when text is null, when the
// text is invalid and INVALID_IS_NULL is set, or when the final result is
// empty and EMPTY_IS_NULL is set.
public string? prepare_input_text(string? text, PrepareInputTextOptions options, int dest_length) {
    if (text == null)
        return null;
    
    string? prepped = text;
    if ((PrepareInputTextOptions.VALIDATE in options) && !text.validate()) {
        prepped = guess_convert(text);
        
        if (prepped == null)
            return (PrepareInputTextOptions.INVALID_IS_NULL in options) ? null : "";
    }
    
    // Using composed form rather than GLib's default (decomposed) as NFC is the preferred form in
    // Linux and WWW.  More importantly, Pango seems to have serious problems displaying decomposed
    // forms of Korean language glyphs (and perhaps others).  See:
    // https://bugzilla.gnome.org/show_bug.cgi?id=716914
    if (PrepareInputTextOptions.NORMALIZE in options)
        prepped = prepped.normalize(-1, NormalizeMode.NFC);
    
    if (PrepareInputTextOptions.STRIP in options)
        prepped = prepped.strip();
    
    // Ticket #3245 - Prevent carriage return mayhem
    // in image titles, tag names, etc.
    if (PrepareInputTextOptions.STRIP_CRLF in options)
        prepped = prepped.delimit("\n\r", ' ');
    
    // Ticket #3196 - Allow calling functions to limit the length of the
    // string we return to them.  The limit is in bytes, but cutting at an
    // arbitrary byte can split a multi-byte UTF-8 character and leave invalid
    // text behind, so back up to the start of the character the limit falls in.
    if (dest_length >= 0 && prepped.length > dest_length) {
        int cut = dest_length;
        // UTF-8 continuation bytes have the form 10xxxxxx.
        while (cut > 0 && (((uchar) prepped[cut]) & 0xC0) == 0x80)
            cut--;
        
        prepped = prepped.substring(0, cut);
    }
    
    // Checked last so that text which only becomes empty through truncation
    // is also reported as null.
    if ((PrepareInputTextOptions.EMPTY_IS_NULL in options) && is_string_empty(prepped))
        return null;
    
    return prepped;
}

namespace String {

// Returns true when the Unicode character needle occurs anywhere in haystack.
public inline bool contains_char(string haystack, unichar needle) {
    return haystack.index_of_char(needle) >= 0;
}

// Removes the leading '0' characters from every whitespace-separated word in
// str ("007 0042" -> "7 42").
//
// Note that this method currently turns a word of all zeros into empty space ("000" -> "")
public string strip_leading_zeroes(string str) {
    StringBuilder stripped = new StringBuilder();
    // True while positioned at the start of a word, i.e. at the beginning of
    // the string or right after whitespace; zeroes seen in that state are dropped.
    bool prev_is_space = true;
    for (unowned string iter = str; iter.get_char() != 0; iter = iter.next_char()) {
        unichar ch = iter.get_char();
        
        if (!prev_is_space || ch != '0') {
            stripped.append_unichar(ch);
            prev_is_space = ch.isspace();
        }
    }
    
    return stripped.str;
}

// Returns istring with diacritics removed.  The text is normalized with
// string.normalize()'s default mode and every control, format, unassigned and
// mark character is then dropped from the result.
public string remove_diacritics(string istring) {
    var builder = new StringBuilder ();
    // Normalize once up front; doing it in the loop condition re-normalized
    // the whole string for every character examined.
    string normalized = istring.normalize();
    unichar ch;
    int i = 0;
    while (normalized.get_next_char(ref i, out ch)) {
        switch(ch.type()) {
            case UnicodeType.CONTROL:
            case UnicodeType.FORMAT:
            case UnicodeType.UNASSIGNED:
            case UnicodeType.NON_SPACING_MARK:
            case UnicodeType.COMBINING_MARK:
            case UnicodeType.ENCLOSING_MARK:
            // Ignore those
                continue;
            default:
                break;
        }
        builder.append_unichar(ch);
    }
    return builder.str;
}

// Renders the bytes of str as space-separated, upper-case hexadecimal values,
// each followed by 'h' ("AB" -> "41h 42h").  An empty string yields "".
public string to_hex_string(string str) {
    StringBuilder builder = new StringBuilder();
    
    // Indexed loop instead of reading and incrementing one pointer inside a
    // single argument list, which depended on argument evaluation order.
    int byte_count = str.length;
    for (int ctr = 0; ctr < byte_count; ctr++) {
        if (ctr > 0)
            builder.append_c(' ');
        
        builder.append_printf("%02Xh", (uint8) str[ctr]);
    }
    
    return builder.str;
}

// A note on the collated_* and precollated_* methods:
//
// A bug report (https://bugzilla.gnome.org/show_bug.cgi?id=717135) indicated that two different Hiragana characters
// as Tag names would trigger an assertion.  Investigation showed that the characters' collation
// keys computed as equal when the locale was set to anything but the default locale (C) or
// Japanese.  A related bug was that another hash table was using str_equal, which does not use
// collation, meaning that in one table the strings were seen as the same and in another as
// different.
//
// The solution we arrived at is to use collation whenever possible, but if two strings have the
// same collation, then fall back on strcmp(), which looks for byte-for-byte comparisons.  Note
// that this technique requires that both strings have been properly composed (use
// prepare_input_text() for that task) so that equal UTF-8 strings are byte-for-byte equal as
// well.

// Hashes the collation key of the string pointed to by ptr.  See note above.
public uint collated_hash(void *ptr) {
    // unowned: only borrow the caller's string rather than copying it.
    unowned string str = (string) ptr;
    
    return str_hash(str.collate_key());
}

// Hashes the string pointed to by ptr as-is.  See note above.
public uint precollated_hash(void *ptr) {
    return str_hash((string) ptr);
}

// Orders two strings by collation, falling back on strcmp() when they collate
// as equal.  See note above.
public int collated_compare(void *a, void *b) {
    // unowned: only borrow the callers' strings rather than copying them.
    unowned string astr = (string) a;
    unowned string bstr = (string) b;
    
    int result = astr.collate(bstr);
    
    return (result != 0) ? result : strcmp(astr, bstr);
}

// Orders two strings by their supplied keys (akey for astr, bkey for bstr),
// falling back on strcmp() of the strings themselves when the keys are equal.
// See note above.
public int precollated_compare(string astr, string akey, string bstr, string bkey) {
    int result = strcmp(akey, bkey);
    
    return (result != 0) ? result : strcmp(astr, bstr);
}

// True when collated_compare() considers the two strings equal.  See note above.
public bool collated_equals(void *a, void *b) {
    return collated_compare(a, b) == 0;
}

// True when precollated_compare() considers the two strings equal.  See note above.
public bool precollated_equals(string astr, string akey, string bstr, string bkey) {
    return precollated_compare(astr, akey, bstr, bkey) == 0;
}

}