File: encoded_char.h

package info (click to toggle)
swish++ 5.15.3-3
links: PTS
area: main
in suites: sarge
size: 1,868 kB
ctags: 1,593
sloc: ansic: 11,384; lisp: 804; sh: 499; perl: 211; makefile: 69
file content (474 lines) | stat: -rw-r--r-- 12,617 bytes
/*
**	SWISH++
**	encoded_char.h
**
**	Copyright (C) 2000  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
**
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
**
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#ifndef encoded_char_H
#define encoded_char_H

// standard
#include <iterator>
#include <set>

// local
#include "fake_ansi.h"			/* for iterator, std */
#include "iso8859-1.h"
#include "util.h"			/* for to_lower() */

//*****************************************************************************
//
// SYNOPSIS
//
	class encoded_char_range
//
// DESCRIPTION
//
//	An encoded_char_range is an abstraction that contains a range of
//	characters in memory that are encoded according to some scheme, either
//	a Content-Transfer-Encoding (such as Quoted-Printable or Base64) or a
//	character set (such as UTF-7 or UTF-8).  A const_iterator is used to
//	iterate over the range and, when dereferenced, decodes a character.
//
//	However, doing this is a serious performance hit since it has to be
//	done for every single character examined.  Hence, the decoding code is
//	compiled only when IMPLEMENT_DECODING is defined (it is needed only
//	for MOD_id3 and MOD_mail): if neither is used, there's no need for any
//	special decoding.
//
//*****************************************************************************
{
public:
	typedef ptrdiff_t difference_type;
	typedef char value_type;
	typedef value_type const* pointer;
	//
	// Decoder callbacks.  They are called with the start of the range, the
	// current position (by reference so the callee can advance it past
	// everything it consumed) and the end of the range; they return the
	// decoded character.
	//
	typedef value_type (*charset_type )( pointer, pointer&, pointer );
	typedef value_type (*encoding_type)( pointer, pointer&, pointer );

	class const_iterator;
	friend class const_iterator;

	encoded_char_range(
		pointer begin, pointer end, charset_type = 0, encoding_type = 0
	);
	encoded_char_range( const_iterator const &pos );
	encoded_char_range(
		const_iterator const &begin, const_iterator const &end
	);

	// default copy constructor is fine
	// default assignment operator is fine

	const_iterator	begin() const;
	pointer		begin_pos() const		{ return begin_; }
	void		begin_pos( pointer p )		{ begin_ = p; }
	void		begin_pos( const_iterator const& );
	const_iterator	end() const;
	pointer		end_pos() const			{ return end_; }
	void		end_pos( pointer p )		{ end_ = p; }
	void		end_pos( const_iterator const& );

#ifdef	IMPLEMENT_DECODING
	class decoder;
#endif
protected:
	//
	// Null everything out so that a default-constructed range (or an
	// object derived from one) never holds indeterminate pointers: such an
	// object is an empty range with no decoders.
	//
	encoded_char_range() :
		begin_( 0 ), end_( 0 )
#ifdef	IMPLEMENT_DECODING
		, charset_( 0 ), encoding_( 0 )
#endif
	{
	}

	pointer		begin_;			// first encoded character
	pointer		end_;			// one past the last character
#ifdef	IMPLEMENT_DECODING
	charset_type	charset_;		// 0 = no charset decoding
	encoding_type	encoding_;		// 0 = no transfer decoding
#endif
};

//*****************************************************************************
//
// SYNOPSIS
//
	class encoded_char_range::const_iterator :
		public encoded_char_range,
		public std::iterator<
			std::forward_iterator_tag, value_type const
		>
//
// DESCRIPTION
//
//	An encoded_char_range::const_iterator is (not surprisingly) an iterator
//	for an encoded_char_range.  It might seem a bit odd to have an iterator
//	derived from the container class it's an iterator for (that's because
//	it is odd), but the iterator needs access to all its data members and
//	going through an extra level of indirection by having a pointer to it
//	would be slower.
//
//	When IMPLEMENT_DECODING is defined, the iterator caches the character
//	decoded at the current position (ch_), whether that cache is valid
//	(decoded_) and how many encoded characters it consumed (delta_).
//
//*****************************************************************************
{
public:
	typedef encoded_char_range::difference_type difference_type;
	typedef encoded_char_range::value_type value_type;
	typedef encoded_char_range::pointer pointer;

	//
	// Give the iterator's own members well-defined values so that a
	// default-constructed iterator can be copied and assigned to without
	// touching indeterminate data.
	//
	const_iterator() :
		pos_( 0 ), prev_( 0 )
#ifdef	IMPLEMENT_DECODING
		, ch_( 0 ), decoded_( false ), delta_( 0 )
#endif
	{
	}
	const_iterator(
		pointer begin, pointer end, charset_type = 0, encoding_type = 0
	);

	// default copy constructor is fine
	// default assignment operator is fine

	value_type	operator*() const;
	const_iterator&	operator++();
	const_iterator	operator++(int);

	// True once the current position has reached the end of the range.
	bool		at_end() const			{ return pos_ == end_; }

	pointer		pos() const			{ return pos_; }
	//
	// Mutable access to the current position.  Note that writing through
	// this reference does not reset decoded_, so a character already
	// cached by operator*() is not re-decoded.
	//
	pointer&	pos()				{ return pos_; }
	// Position the iterator had before the most recent increment.
	pointer		prev_pos() const		{ return prev_; }

	friend bool operator==( const_iterator const&, const_iterator const& );
	friend bool operator==( const_iterator const&, pointer );
private:
	mutable pointer	pos_;			// current position
	mutable pointer	prev_;			// position before last ++
#ifdef	IMPLEMENT_DECODING
	mutable value_type	ch_;		// cached decoded character
	mutable bool		decoded_;	// is ch_/delta_ valid?
	mutable int		delta_;		// encoded chars consumed
#endif
	const_iterator( encoded_char_range const*, pointer start_pos );
	friend class	encoded_char_range;	// for access to c'tor above
#ifdef	IMPLEMENT_DECODING
	void		decode() const;
#endif
};

#ifdef	IMPLEMENT_DECODING
//*****************************************************************************
//
// SYNOPSIS
//
	class encoded_char_range::decoder
//
// DESCRIPTION
//
//	An encoded_char_range::decoder is used to keep decoders' state between
//	calls and reset state between files to their initial states just before
//	starting to index a file.
//
//	Every decoder registers itself in a class-wide set upon construction;
//	derived classes implement reset() to restore their initial state.
//
//*****************************************************************************
{
public:
	typedef encoded_char_range::value_type value_type;
	typedef encoded_char_range::pointer pointer;

	//
	// This is a polymorphic base class, so give it a virtual destructor to
	// make destruction through a decoder* well-defined.
	//
	// NOTE(review): the destructor deliberately does not remove "this"
	// from set_; presumably decoders live for the entire run of the
	// program -- confirm before destroying one early.
	//
	virtual		~decoder() { }

	static void	reset_all();
protected:
	decoder()	{ set_.insert( this ); }

	virtual void	reset() = 0;
private:
	typedef std::set< decoder* > set_type;
	static set_type set_;
};
#endif	/* IMPLEMENT_DECODING */

////////// encoded_char_range inlines /////////////////////////////////////////

// I hate lots of typing: short aliases for the class names used by the inline
// definitions below.  Both are #undef'd again at the end of this header so
// they don't leak into files that include it.
#define ECR	encoded_char_range
#define	ECR_CI	ECR::const_iterator

//
// Construct a range over [begin,end).  The charset and encoding decoders are
// remembered only when decoding support is compiled in.
//
inline ECR::ECR(
	pointer begin, pointer end, charset_type charset, encoding_type encoding
) :
#ifdef	IMPLEMENT_DECODING
	begin_( begin ), end_( end ), charset_( charset ), encoding_( encoding )
#else
	begin_( begin ), end_( end )
#endif
{
#ifndef	IMPLEMENT_DECODING
	// Without decoding support, the decoders are intentionally ignored.
	(void)charset;
	(void)encoding;
#endif
}

//
// Construct a range that starts at an iterator's current position and runs to
// the end of the range that iterator belongs to, using the same decoders.
//
inline ECR::ECR( const_iterator const &i ) :
	begin_( i.pos() ), end_( i.end_pos() )
#ifdef	IMPLEMENT_DECODING
	, charset_( i.charset_ ), encoding_( i.encoding_ )
#endif
{
	// nothing else to do
}

//
// Construct a range spanning the current positions of two iterators.  The
// decoders are taken from the first iterator.
//
inline ECR::ECR( const_iterator const &begin, const_iterator const &end ) :
	begin_( begin.pos() ), end_( end.pos() )
#ifdef	IMPLEMENT_DECODING
	, charset_( begin.charset_ ), encoding_( begin.encoding_ )
#endif
{
	// nothing else to do
}

// Return an iterator positioned at the first character of this range.
inline ECR_CI ECR::begin() const {
	const_iterator const first( this, begin_ );
	return first;
}

// Return an iterator positioned one past the last character of this range.
inline ECR_CI ECR::end() const {
	const_iterator const last( this, end_ );
	return last;
}

// Move the start of this range to the given iterator's current position.
inline void ECR::begin_pos( const_iterator const &i ) {
	begin_pos( i.pos() );
}

// Move the end of this range to the given iterator's current position.
inline void ECR::end_pos( const_iterator const &i ) {
	end_pos( i.pos() );
}

////////// encoded_char_range::const_iterator inlines /////////////////////////

//
// Construct an iterator over [begin,end) positioned at begin.
//
// prev_ starts out equal to the starting position so that prev_pos() yields a
// valid pointer even before the iterator has been incremented.  The decoding
// cache is likewise given defined values so that copies of a freshly
// constructed iterator never copy indeterminate data.
//
inline ECR_CI::const_iterator(
	pointer begin, pointer end,
	charset_type charset, encoding_type encoding
) :
	encoded_char_range( begin, end, charset, encoding ),
	pos_( begin ), prev_( begin )
#ifdef	IMPLEMENT_DECODING
	, ch_( 0 ), decoded_( false ), delta_( 0 )
#endif
{
}

//
// Construct an iterator for the given range positioned at start_pos; used by
// encoded_char_range::begin() and end().  The iterator's own range runs from
// start_pos to the end of *ecr and uses the same decoders.
//
// prev_ starts out equal to the starting position so that prev_pos() yields a
// valid pointer even before the iterator has been incremented.  The decoding
// cache is likewise given defined values so that copies of a freshly
// constructed iterator never copy indeterminate data.
//
inline ECR_CI::const_iterator( ECR const *ecr, pointer start_pos ) :
	encoded_char_range(
		start_pos, ecr->end_
#ifdef	IMPLEMENT_DECODING
		, ecr->charset_, ecr->encoding_
#endif
	),
	pos_( start_pos ), prev_( start_pos )
#ifdef	IMPLEMENT_DECODING
	, ch_( 0 ), decoded_( false ), delta_( 0 )
#endif
{
}

#ifdef	IMPLEMENT_DECODING
//*****************************************************************************
//
// SYNOPSIS
//
	inline void ECR_CI::decode() const
//
// DESCRIPTION
//
//	Decode the character at the iterator's current position using the
//	range's content-transfer-encoding if it has one, otherwise its
//	character set if it has one, otherwise plain ISO 8859-1 to ASCII
//	conversion.
//
//	The decoded character is stored in ch_ and the number of encoded
//	characters consumed is stored in delta_; nothing is returned and the
//	iterator's position itself is not changed.
//
// CAVEAT
//
//	A mail message can have both an encoding and a non-ASCII or
//	non-ISO-8859-1 charset simultaneously, e.g., base64-encoded UTF-8.
//	Handling both at once is messy because each can use several encoded
//	characters to produce one decoded character and both positions would
//	have to be tracked.  Hence, a message or attachment can currently have
//	EITHER an encoding OR such a charset, but not both: if it has both, the
//	encoding takes precedence.
//
//*****************************************************************************
{
	//
	// Work on a copy of the position: the decoders advance it through the
	// encoded text and the distance travelled is what operator++() later
	// uses to step over the character.
	//
	pointer cursor = pos_;

	if ( encoding_ )
		ch_ = (*encoding_)( begin_, cursor, end_ );
	else if ( charset_ )
		ch_ = (*charset_)( begin_, cursor, end_ );
	else
		ch_ = iso8859_1_to_ascii( *cursor++ );

	delta_ = cursor - pos_;
}
#endif	/* IMPLEMENT_DECODING */

//*****************************************************************************
//
// SYNOPSIS
//
	inline ECR::value_type ECR_CI::operator*() const
//
// DESCRIPTION
//
//	Dereference an encoded_char_range::const_iterator at its current
//	position.  With decoding support, the character is decoded at most
//	once per position: the result is cached and reused until the iterator
//	is incremented.
//
// RETURN VALUE
//
//	Returns the decoded character.
//
//*****************************************************************************
{
#ifndef	IMPLEMENT_DECODING
	return iso8859_1_to_ascii( *pos_ );
#else
	if ( decoded_ )				// cache hit
		return ch_;
	decode();
	decoded_ = true;
	return ch_;
#endif
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline ECR_CI& ECR_CI::operator++()
//
// DESCRIPTION
//
//	Pre-increment the iterator's position by one decoded character,
//	remembering the position it had beforehand.
//
// RETURN VALUE
//
//	Returns a reference to the given object.
//
//*****************************************************************************
{
#ifdef	IMPLEMENT_DECODING
	//
	// The delta is known only if the character at the current position
	// has been decoded; if it hasn't been, decode it now solely to compute
	// the delta.
	//
	if ( !decoded_ )
		decode();
	//
	// Either way, the character at the position we're about to move to
	// has not been decoded.
	//
	decoded_ = false;

	prev_ = pos_;
	pos_ += delta_;
#else
	prev_ = pos_;
	++pos_;
#endif
	return *this;
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline ECR_CI ECR_CI::operator++(int)
//
// DESCRIPTION
//
//	Post-increment the iterator's position by one.
//
// RETURN VALUE
//
//	Returns a copy of the iterator as it was before being incremented.
//
//*****************************************************************************
{
	ECR_CI const before( *this );
	++*this;
	return before;
}

//*****************************************************************************
//
//	Equality and inequality operators.  Iterators are compared solely by
//	their current positions, either against another iterator or against a
//	raw pointer (on either side).
//
//*****************************************************************************

inline bool operator==( ECR_CI const &e1, ECR_CI const &e2 ) {
	return e1.pos() == e2.pos();
}

inline bool operator==( ECR_CI const &e, ECR_CI::pointer p ) {
	return e.pos() == p;
}

inline bool operator==( ECR_CI::pointer p, ECR_CI const &e ) {
	return e.pos() == p;
}

inline bool operator!=( ECR_CI const &e1, ECR_CI const &e2 ) {
	return e1.pos() != e2.pos();
}

inline bool operator!=( ECR_CI const &e, ECR_CI::pointer p ) {
	return e.pos() != p;
}

inline bool operator!=( ECR_CI::pointer p, ECR_CI const &e ) {
	return e.pos() != p;
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline char *to_lower( ECR const &range )
//
// DESCRIPTION
//
//	Return a pointer to a string converted to lower case taking the
//	encoding of the characters into account; the original string is
//	untouched.  The string returned is from an internal pool of string
//	buffers.  The time you get into trouble is if you hang on to more than
//	Num_Buffers strings.  This doesn't normally happen in practice,
//	however.
//
// PARAMETERS
//
//	range	The range of encoded characters to convert.
//
// RETURN VALUE
//
//	A pointer to the null-terminated lower-case string.
//
//*****************************************************************************
{
	extern char_buffer_pool<128,5> lower_buf;
	//
	// NOTE(review): nothing here checks the length of the range against
	// the capacity of the buffer being written to; presumably callers pass
	// only ranges known to be short enough -- confirm.
	//
	// ("register" was dropped: it is deprecated since C++11 and no longer
	// a valid storage class specifier as of C++17.)
	//
	char *p = lower_buf.next();
	for ( ECR_CI c = range.begin(); !c.at_end(); ++c )
		*p++ = to_lower( *c );
	*p = '\0';
	return lower_buf.current();
}

#undef	ECR_CI
#undef	ECR

#endif	/* encoded_char_H */