File: utf8itor.cc

package info (click to toggle)
xapian-core 1.4.3-2%2Bdeb9u3
links: PTS, VCS
area: main
in suites: stretch
size: 21,412 kB
sloc: cpp: 113,868; ansic: 8,723; sh: 4,433; perl: 836; makefile: 566; tcl: 317; python: 40
file content (141 lines) | stat: -rw-r--r-- 4,131 bytes
parent folder | download | duplicates (2)
/* utf8itor.cc: iterate over a utf8 string.
 *
 * Copyright (C) 2006,2007,2010,2013,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <config.h>

#include <xapian/unicode.h>

#include <cstring>

using namespace std;

// Report whether ch is unusable as a UTF-8 continuation byte.
//
// Continuation bytes have the bit pattern 10xxxxxx (0x80-0xbf), so the result
// is true for any byte whose top two bits are something other than 10.
inline bool
bad_cont(unsigned char ch)
{
    const unsigned top_two_bits = ch >> 6;
    return top_two_bits != 0x2;
}

namespace Xapian {

namespace Unicode {

// Write the UTF-8 encoding of the code point ch to buf.
//
// buf should be at least 4 bytes.  No terminating NUL is written.
//
// Values below 0x80 are not special-cased here (they would be written as an
// overlong two byte sequence), so ASCII needs to be handled before calling
// this function.
//
// Returns the number of bytes stored in buf (2, 3 or 4), or 0 if ch is above
// 0x10ffff, in which case buf is left untouched.
unsigned
nonascii_to_utf8(unsigned ch, char * buf)
{
    if (ch < 0x800) {
	buf[0] = static_cast<char>(0xc0 | (ch >> 6));
	buf[1] = static_cast<char>(0x80 | (ch & 0x3f));
	return 2;
    }
    if (ch < 0x10000) {
	buf[0] = static_cast<char>(0xe0 | (ch >> 12));
	buf[1] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
	buf[2] = static_cast<char>(0x80 | (ch & 0x3f));
	return 3;
    }
    // The limit here is the top of the Unicode range rather than the largest
    // value four bytes could hold (0x1fffff): anything from 0x110000 upwards
    // would be written with a lead byte of 0xf4 followed by 0x90 or above,
    // which is not valid UTF-8.
    if (ch <= 0x10ffff) {
	buf[0] = static_cast<char>(0xf0 | (ch >> 18));
	buf[1] = static_cast<char>(0x80 | ((ch >> 12) & 0x3f));
	buf[2] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
	buf[3] = static_cast<char>(0x80 | (ch & 0x3f));
	return 4;
    }
    // Unicode doesn't specify any characters above 0x10ffff.
    // Should we be presented with such a numeric character
    // entity or similar, we just replace it with nothing.
    return 0;
}

}

// Set the iterator up from the NUL-terminated C string p_.
//
// p_ is passed straight to strlen(), so it must not be NULL.
Utf8Iterator::Utf8Iterator(const char *p_)
{
    // The terminating NUL isn't counted in the length handed to assign().
    const size_t len = strlen(p_);
    assign(p_, len);
}

// Work out how many bytes the sequence starting at p occupies, store that in
// seqlen, and return true if those bytes are a well-formed UTF-8 sequence.
//
// When false is returned, seqlen has been set to 1.
bool
Utf8Iterator::calculate_sequence_length() const XAPIAN_NOEXCEPT
{
    // We have to do something with input which isn't valid UTF-8 (including
    // overlong and truncated sequences), and the most likely explanation for
    // such input is that it's really ISO-8859-1, so each offending byte is
    // handled as a one byte sequence.

    const unsigned char lead = *p;

    // Start from a length of one byte; this is only changed once a longer
    // sequence has been checked in full.
    seqlen = 1;

    // 0x00-0x7f is a valid single byte encoding.  0x80-0xbf is a continuation
    // byte without a lead byte, so invalid.  0xc0 and 0xc1 would begin two
    // byte sequences for characters which fit in a single byte (i.e. overlong
    // sequences), and those must not be decoded.
    if (lead < 0xc2) return (lead < 0x80);

    if (lead < 0xe0) {
	// Two byte sequence.  The length check comes first so p[1] is only
	// read when it lies before end.
	const bool valid = (p + 1 != end) && !bad_cont(p[1]);
	if (valid) seqlen = 2;
	return valid;
    }

    if (lead < 0xf0) {
	// Three byte sequence.
	const bool valid =
	    (end - p >= 3) && // Enough bytes
	    !bad_cont(p[1]) && !bad_cont(p[2]) && // Continuation bytes OK
	    !(p[0] == 0xe0 && p[1] < 0xa0); // Not an overlong encoding
	if (valid) seqlen = 3;
	return valid;
    }

    // Four byte sequence.
    const bool valid =
	(lead < 0xf5) && // Lead byte within the Unicode range
	(end - p >= 4) && // Enough bytes
	!bad_cont(p[1]) && !bad_cont(p[2]) && !bad_cont(p[3]) && // Cont. OK
	!(p[0] == 0xf0 && p[1] < 0x90) && // Not an overlong encoding
	!(p[0] == 0xf4 && p[1] >= 0x90); // Code value within Unicode
    if (valid) seqlen = 4;
    return valid;
}

// Return the character value at the current position.
//
// Returns unsigned(-1) when p is NULL.  The result of the validity check is
// not consulted here, so a byte which doesn't start a well-formed sequence is
// returned as its own value (seqlen is 1 in that case).
unsigned Utf8Iterator::operator*() const XAPIAN_NOEXCEPT {
    if (p == NULL) return unsigned(-1);

    // Only work out the sequence length if it isn't already known.
    if (seqlen == 0) calculate_sequence_length();

    const unsigned char lead = *p;
    switch (seqlen) {
	case 1:
	    return lead;
	case 2:
	    // 110xxxxx 10xxxxxx
	    return ((lead & 0x1f) << 6) | (p[1] & 0x3f);
	case 3:
	    // 1110xxxx 10xxxxxx 10xxxxxx
	    return ((lead & 0x0f) << 12) |
		   ((p[1] & 0x3f) << 6) |
		   (p[2] & 0x3f);
	default:
	    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	    return ((lead & 0x07) << 18) |
		   ((p[1] & 0x3f) << 12) |
		   ((p[2] & 0x3f) << 6) |
		   (p[3] & 0x3f);
    }
}

// Return the character value at the current position, marking invalid input.
//
// Returns unsigned(-1) when p is NULL.  For a well-formed UTF-8 sequence the
// decoded character value is returned.  For a byte which doesn't start a
// well-formed sequence, the value of that byte is returned with the top bit
// (0x80000000) set so it can be told apart from a valid character.
unsigned
Utf8Iterator::strict_deref() const XAPIAN_NOEXCEPT
{
    if (p == NULL) return unsigned(-1);
    if (seqlen == 0) calculate_sequence_length();
    unsigned char ch = *p;
    if (seqlen == 1) {
	// seqlen is 1 both for ASCII and for invalid input, and it may have
	// been worked out by an earlier call (e.g. to operator*()), in which
	// case there's no validity result to consult here.  So decide from
	// the byte itself: a byte >= 0x80 is never a valid single byte
	// sequence, hence it must be the invalid case.
	if (ch >= 0x80) return unsigned(*p) | 0x80000000;
	return ch;
    }
    if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
    if (seqlen == 3)
	return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
    return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
	    ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
}

}