File: util.c

package info (click to toggle)
swish++ 1.1b3-3
links: PTS
area: main
in suites: slink
size: 416 kB
ctags: 409
sloc: ansic: 2,842; makefile: 247; sh: 48
file content (322 lines) | stat: -rw-r--r-- 7,579 bytes
parent folder | download | duplicates (2)
/*
**	SWISH++
**	util.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cctype>

// local
#include "config.h"
#include "entities.h"
#include "fake_ansi.h"
#include "string_set.h"
#include "util.h"

#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

// A string_set that fills itself with the built-in stop words when it is
// constructed.
struct stop_word_set : string_set {
	stop_word_set();
};

// Insert every entry of stop_word_table (declared here, defined in another
// translation unit) into the set.  The walk stops at the first null pointer
// found in the table.
stop_word_set::stop_word_set() {
	extern char const *const stop_word_table[];

	char const *const *entry = stop_word_table;
	while ( *entry ) {
		insert( *entry );
		++entry;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline bool is_vowel( char c )
//
// DESCRIPTION
//
//	Determine whether a character is a vowel [aeiou] regardless of case.
//
//	The character is converted to unsigned char before it is handed to
//	tolower() because passing a negative plain-char value to the <cctype>
//	functions is undefined behavior.
//
// PARAMETERS
//
//	c	The character to be checked.
//
// RETURN VALUE
//
//	Returns true only if the character is a vowel.
//
//*****************************************************************************
{
	switch ( tolower( (unsigned char)c ) ) {
		case 'a':
		case 'e':
		case 'i':
		case 'o':
		case 'u':
			return true;
		default:
			return false;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool is_ok_word( char const *word )
//
// DESCRIPTION
//
//	Determine whether a given word should be indexed or not using several
//	heuristics.
//
//	A word longer than Word_Hard_Max_Size characters is never indexed; it
//	is rejected before anything else because it would not fit into the
//	fixed-size buffer used to build its lower-case form.
//
//	Stop words, words that occur too frequently or have no information
//	content, are not indexed.  Additionally, several heuristics are used
//	to determine which words should not be indexed.
//
//	First, a word is checked to see if it looks like an acronym.  A word
//	is considered an acronym only if it starts with a capital letter and
//	is composed exclusively of capital letters, digits, and punctuation
//	symbols, e.g., "AT&T."  If a word looks like an acronym, it is OK and
//	no further checks are done.
//
//	Second, there are several other checks that are applied.  A word is
//	not indexed if it:
//
//	1. Starts with a capital letter, is not an acronym, and its capital
//	   letters and digits together make up at least 33% of its length,
//	   e.g., "BizZARE."
//
//	2. Contains a capital letter other than the first, e.g., "weIrd."
//
//	3. Is less than Word_Min_Size characters and is not an acronym.
//
//	4. Contains no vowels.
//
//	5. Contains more than Word_Max_Consec_Same consecutive repeats of
//	   the same character (not including digits).
//
//	6. Contains more than Word_Max_Consec_Vowels consecutive vowels.
//
//	7. Contains more than Word_Max_Consec_Consonants consecutive
//	   consonants.
//
// PARAMETERS
//
//	word	The null-terminated word to be checked.
//
// RETURN VALUE
//
//	Returns true only if the word should be indexed.
//
// EXAMPLES
//
//	AT&T	OK
//	cccp	not OK -- no vowels
//	CCCP	OK -- acronym
//	eieio	not OK -- too many consec. vowels
//	other	not OK -- stop word
//
// SEE ALSO
//
//	stop_words.c	List of built-in stop words.
//
//*****************************************************************************
{
	int len = ::strlen( word );
	char const *c;

#	ifdef DEBUG_is_ok_word
	cerr << '\t' << word << ' ';
#	endif

	////////// Refuse words that would overflow lc_word ///////////////////

	if ( len > Word_Hard_Max_Size ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(len > Word_Hard_Max_Size)" << endl;
#		endif
		return false;
	}

	////////// See if it's a stop word ////////////////////////////////////

	char lc_word[ Word_Hard_Max_Size + 1 ];
	::transform( word, word + len, lc_word, to_lower );
	lc_word[ len ] = '\0';

	// Built once, on the first call, and reused by every later call.
	static stop_word_set stop_words;
	if ( stop_words.find( lc_word ) ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(stop word)" << endl;
#		endif
		return false;
	}

	////////// Survey the characters in the word //////////////////////////

	int digits = 0;
	int puncts = 0;
	int uppers = 0;
	int vowels = 0;
	for ( c = word; *c; ++c ) {
		// The <cctype> classifiers need a value representable as
		// unsigned char; a plain char may be negative.
		unsigned char const uc = *c;

		if ( isdigit( uc ) ) {
			++digits;
			continue;
		}
		if ( ispunct( uc ) ) {
			++puncts;
			continue;
		}
		if ( isupper( uc ) )
			++uppers;
		if ( is_vowel( *c ) )
			++vowels;
	}

	if ( isupper( (unsigned char)*word ) ) {
		// Every character is a capital, digit, or punctuation symbol.
		if ( uppers + digits + puncts == len ) {
#			ifdef DEBUG_is_ok_word
			cerr << "(potential acronym)" << endl;
#			endif
			return true;
		}
		if ( double( uppers + digits ) / len >= 33 / 100.0 ) {
#			ifdef DEBUG_is_ok_word
			cerr << "(too many intermediate uppers)" << endl;
#			endif
			return false;
		}
	} else if ( uppers ) {
		// First character is not a capital, yet a later one is.
#		ifdef DEBUG_is_ok_word
		cerr << "(intermediate uppers)" << endl;
#		endif
		return false;
	}

	if ( len < Word_Min_Size ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(len < Word_Min_Size)" << endl;
#		endif
		return false;
	}

	if ( !vowels ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(no vowels)" << endl;
#		endif
		return false;
	}

	////////// Perform consecutive-character checks ///////////////////////

	int consec_consonants = 0;
	int consec_vowels = 0;
	int consec_same = 0;
	char last_c = '\0';

	for ( c = word; *c; ++c ) {
		unsigned char const uc = *c;

		if ( isdigit( uc ) ) {
			consec_consonants = 0;
			consec_vowels = 0;
			last_c = '\0';	// consec_same doesn't apply to digits
			continue;
		}

		if ( ispunct( uc ) ) {
			// Punctuation ends a vowel/consonant run but leaves
			// last_c and consec_same as they were.
			consec_consonants = 0;
			consec_vowels = 0;
			continue;
		}

		if ( *c == last_c ) {
			// NOTE(review): consec_same counts repeats after the
			// first occurrence, so a run of
			// Word_Max_Consec_Same + 1 identical characters is
			// still accepted -- confirm that this is intended.
			if ( ++consec_same > Word_Max_Consec_Same ) {
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec same)" << endl;
#				endif
				return false;
			}
		} else {
			consec_same = 0;
			last_c = *c;
		}

		if ( is_vowel( *c ) ) {
			if ( ++consec_vowels > Word_Max_Consec_Vowels ) {
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec vowels)" << endl;
#				endif
				return false;
			}
			consec_consonants = 0;
		} else {
			if ( ++consec_consonants > Word_Max_Consec_Consonants ){
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec consonants)" << endl;
#				endif
				return false;
			}
			consec_vowels = 0;
		}
	}

#	ifdef DEBUG_is_ok_word
	cerr << endl;
#	endif
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	char const *ltoa( long n )
//
// DESCRIPTION
//
//	Convert a long integer to its decimal string representation.  The
//	string returned is from an internal pool of static string buffers that
//	are handed out in rotation, so a returned string is overwritten once
//	Num_Buffers further calls have been made.  The time you get into
//	trouble is if you hang on to more than Num_Buffers strings.  This
//	doesn't normally happen in practice, however.
//
//	The digits are generated from the magnitude of n held in an unsigned
//	long, so the most negative long (whose negation is not representable
//	as a long) is converted correctly as well.
//
// PARAMETERS
//
//	n	The integer to be converted.
//
// RETURN VALUE
//
//	A pointer to the string.
//
// SEE ALSO
//
//	Brian W. Kernighan, Dennis M. Ritchie.  "The C Programming Language,
//	2nd ed."  Addison-Wesley, Reading, MA.  pp. 63-64.
//
//*****************************************************************************
{
	// A 64-bit long needs at most 19 digits + sign + null = 21 characters.
	enum { Buf_Size = 25, Num_Buffers = 10 };

	static char	buf[ Num_Buffers ][ Buf_Size ];
	static int	b;			// which buffer to use

	char *const	start = buf[ b ];
	char		*s = start;
	bool const	is_neg = n < 0;

	// Negate in unsigned arithmetic: "-n" overflows when n is LONG_MIN.
	unsigned long	magnitude = (unsigned long)n;
	if ( is_neg ) magnitude = 0UL - magnitude;

	do {					// generate digits in reverse
		*s++ = (char)( magnitude % 10 + '0' );
	} while ( magnitude /= 10 );
	if ( is_neg ) *s++ = '-';
	*s = '\0';

	// now reverse the string in place; s walks back from the terminator
	for ( char *t = start; t < s; ++t ) {
		char const tmp = *--s; *s = *t; *t = tmp;
	}

	b = (b + 1) % Num_Buffers;		// rotate to the next buffer

	return start;
}