File: TextConverter.cpp

package info (click to toggle)
pinot 0.85-1
links: PTS, VCS
area: main
in suites: lenny
size: 5,524 kB
ctags: 3,868
sloc: cpp: 33,107; sh: 8,801; ansic: 3,049; makefile: 557; xml: 366; python: 250
file content (163 lines) | stat: -rw-r--r-- 3,699 bytes
/*
 *  Copyright 2008 Fabrice Colin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include <errno.h>
#include <iostream>
#include <glibmm/convert.h>
#include <glibmm/ustring.h>

#include "StringManip.h"
#include "TextConverter.h"

using std::cout;
using std::endl;
using std::string;
using namespace Glib;

// Builds a converter that tolerates up to maxErrors invalid sequences
// per toUTF8() call, and looks up the charset of the current locale.
TextConverter::TextConverter(unsigned int maxErrors) :
	m_utf8Locale(false),
	m_maxErrors(maxErrors),
	m_conversionErrors(0)
{
	// get_charset() fills in the locale charset name and its result
	// is kept as the "locale is UTF-8" flag
	const bool localeIsUTF8 = get_charset(m_localeCharset);

	m_utf8Locale = localeIsUTF8;
}

// Destructor. The body is empty: no explicit cleanup is performed here.
TextConverter::~TextConverter()
{
}

// Converts text, expressed in the given charset, to UTF-8.
// Convenience overload that forwards the string's buffer and length
// to the pointer-based toUTF8().
string TextConverter::toUTF8(const string &text, const string &charset)
{
	return toUTF8(text.c_str(), static_cast<unsigned int>(text.length()), charset);
}

// Converts a buffer of text to UTF-8.
//
// pText    points to the text to convert; may be NULL, in which case
//          an empty string is returned.
// textLen  is the number of bytes available at pText.
// charset  names the charset of the input (compared case-insensitively).
//          "utf-8" means no conversion is needed; an empty name means
//          the locale's charset.
//
// Returns the converted text. Each run of consecutive invalid bytes is
// replaced with a single "?". The input is returned unchanged when no
// conversion is necessary, when the number of invalid sequences reaches
// the maximum passed to the constructor, or when iconv fails with an
// error other than EILSEQ or E2BIG.
//
// The number of invalid sequences found is available afterwards through
// getErrorsCount().
//
// NOTE(review): the IConv constructor is documented by glibmm to throw
// Glib::ConvertError for unsupported conversions; it is not caught here,
// so it propagates to the caller - confirm callers expect this.
string TextConverter::toUTF8(const char *pText, unsigned int textLen, const string &charset)
{
	string textCharset(StringManip::toLowerCase(charset));
	char outputBuffer[8192];
	char *pInput = const_cast<char *>(pText);

	m_conversionErrors = 0;

	if ((pText == NULL) ||
		(textLen == 0))
	{
		// Nothing to convert; never build a string from a NULL pointer
		return string();
	}

	if (textCharset == "utf-8")
	{
		// No conversion necessary
		return string(pText, textLen);
	}

	if (textCharset.empty() == true)
	{
		if (m_utf8Locale == true)
		{
			// The current locale uses UTF-8
			return string(pText, textLen);
		}

		textCharset = m_localeCharset;
	}

	IConv converter("UTF-8", textCharset);
	string outputText;
	gsize inputSize = (gsize)textLen;
	bool invalidSequence = false;

	while (inputSize > 0)
	{
		char *pOutput = outputBuffer;
		gsize outputSize = sizeof(outputBuffer);

		size_t conversions = converter.iconv(&pInput, &inputSize, &pOutput, &outputSize);
		// Save errno right away, before another call can overwrite it
		int errorCode = errno;
		// Number of bytes this call wrote to the output buffer
		gsize convertedSize = sizeof(outputBuffer) - outputSize;

		if (convertedSize > 0)
		{
			// Valid text was converted, so the next invalid sequence
			// starts a new run and gets its own "?" marker
			invalidSequence = false;
		}

		if (conversions == static_cast<size_t>(-1))
		{
			if (errorCode == EILSEQ)
			{
				// Conversion was only partially successful
				++m_conversionErrors;
#ifdef DEBUG
				cout << "TextConverter::toUTF8: invalid sequence at " << pInput - pText << endl;
#endif
				if (m_conversionErrors >= m_maxErrors)
				{
					// Give up
					return string(pText, textLen);
				}
				converter.reset();

				// Keep what was converted ahead of the invalid sequence
				outputText.append(outputBuffer, convertedSize);
				if (invalidSequence == false)
				{
					outputText += "?";
					invalidSequence = true;
				}

				// Skip that
				++pInput;
				--inputSize;
				continue;
			}
			else if ((errorCode != E2BIG) ||
				(convertedSize == 0))
			{
				// Either an unexpected error, or the output buffer is
				// reported full without any progress; bail out rather
				// than loop forever
#ifdef DEBUG
				cout << "TextConverter::toUTF8: unknown error " << errorCode << endl;
#endif
				return string(pText, textLen);
			}
			// E2BIG: the output buffer is full, flush it and carry on
		}
		else
		{
			invalidSequence = false;
		}

		// Append what was successfully converted
		outputText.append(outputBuffer, convertedSize);
	}
#ifdef DEBUG
	cout << "TextConverter::toUTF8: " << m_conversionErrors << " conversion errors" << endl;
#endif

	return outputText;
}

// Returns the number of invalid sequences counted by the most recent
// toUTF8() call; the counter is reset at the start of every call.
unsigned int TextConverter::getErrorsCount() const
{
	return m_conversionErrors;
}

// Converts UTF-8 text to the charset of the current locale.
// Returns an empty string if the conversion raises a ConvertError.
string TextConverter::fromUTF8(const string &text)
{
	string localeText;

	try
	{
		localeText = locale_from_utf8(text);
	}
	catch (const ConvertError &ce)
	{
		// Best effort: the error is only reported in debug builds
#ifdef DEBUG
		cout << "TextConverter::fromUTF8: " << ce.what() << endl;
#endif
		localeText.clear();
	}

	return localeText;
}