File: util.c

package info (click to toggle)
swish++ 1.1b3-3
  • links: PTS
  • area: main
  • in suites: slink
  • size: 416 kB
  • ctags: 409
  • sloc: ansic: 2,842; makefile: 247; sh: 48
file content (322 lines) | stat: -rw-r--r-- 7,579 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
/*
**	SWISH++
**	util.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cctype>

// local
#include "config.h"
#include "entities.h"
#include "fake_ansi.h"
#include "string_set.h"
#include "util.h"

#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

//
// A string_set that fills itself from the built-in stop-word table when it
// is constructed.
//
struct stop_word_set : string_set {
	stop_word_set();
};

//
// Walk stop_word_table, inserting every entry up to (but not including) the
// terminating null pointer.
//
stop_word_set::stop_word_set() {
	extern char const *const stop_word_table[];
	char const *const *entry = stop_word_table;
	while ( *entry ) {
		insert( *entry );
		++entry;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline bool is_vowel( char c )
//
// DESCRIPTION
//
//	Determine whether a character is a vowel [aeiou] regardless of case.
//	Only the five ASCII vowels are recognized; every other character,
//	including any with the high bit set, is reported as a non-vowel.
//
// PARAMETERS
//
//	c	The character to be checked.
//
// RETURN VALUE
//
//	Returns true only if the character is a vowel.
//
//*****************************************************************************
{
	//
	// The <cctype> functions require an argument that is representable as
	// an unsigned char (or EOF).  A plain char may be negative, so convert
	// it first rather than passing it to tolower() directly.
	//
	int const lc = tolower( static_cast<unsigned char>( c ) );
	return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u';
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool is_ok_word( char const *word )
//
// DESCRIPTION
//
//	Determine whether a given word should be indexed or not using several
//	heuristics.
//
//	A word longer than Word_Hard_Max_Size characters is rejected outright
//	since it can not fit into the buffer used for the stop-word lookup.
//
//	Stop words, words that occur too frequently or have no information
//	content, are not indexed.  The lookup is done on a lower-cased copy
//	of the word.  Additionally, several heuristics are used to determine
//	which words should not be indexed.
//
//	First, a word is checked to see if it looks like an acronym.  A word
//	is considered an acronym only if it starts with a capital letter and
//	is composed exclusively of capital letters, digits, and punctuation
//	symbols, e.g., "AT&T."  If a word looks like an acronym, it is OK and
//	no further checks are done.
//
//	Second, there are several other checks that are applied.  A word is
//	not indexed if it:
//
//	1. Starts with a capital letter, is not an acronym, and capital
//	   letters plus digits make up 33% or more of its length, e.g.,
//	   "BizZARE."
//
//	2. Does not start with a capital letter but contains one, e.g.,
//	   "weIrd."
//
//	3. Is less than Word_Min_Size characters and is not an acronym.
//
//	4. Contains no vowels.
//
//	5. Contains more than Word_Max_Consec_Same of the same character
//	   consecutively (not including digits).
//
//	6. Contains more than Word_Max_Consec_Vowels consecutive vowels.
//
//	7. Contains more than Word_Max_Consec_Consonants consecutive
//	   consonants.
//
// PARAMETERS
//
//	word	The null-terminated word to be checked.
//
// RETURN VALUE
//
//	Returns true only if the word should be indexed.
//
// EXAMPLES
//
//	AT&T	OK
//	cccp	not OK -- no vowels
//	CCCP	OK -- acronym
//	eieio	not OK -- too many consec. vowels
//	other	not OK -- stop word
//
// SEE ALSO
//
//	stop_words.c	List of built-in stop words.
//
//*****************************************************************************
{
	int const len = ::strlen( word );
	char const *c;

#	ifdef DEBUG_is_ok_word
	cerr << '\t' << word << ' ';
#	endif

	////////// Reject words that would overflow the buffer below ///////////

	if ( len > Word_Hard_Max_Size ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(len > Word_Hard_Max_Size)" << endl;
#		endif
		return false;
	}

	////////// See if it's a stop word ////////////////////////////////////

	char lc_word[ Word_Hard_Max_Size + 1 ];
	::transform( word, word + len, lc_word, to_lower );
	lc_word[ len ] = '\0';

	// Built once, on the first call, and reused thereafter.
	static stop_word_set stop_words;
	if ( stop_words.find( lc_word ) ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(stop word)" << endl;
#		endif
		return false;
	}

	////////// Survey the characters in the word //////////////////////////

	//
	// The <cctype> classification functions require an argument that is
	// representable as an unsigned char (or EOF).  A plain char may be
	// negative, so every character is converted before being classified.
	//
	int digits = 0;
	int puncts = 0;
	int uppers = 0;
	int vowels = 0;
	for ( c = word; *c; ++c ) {
		unsigned char const uc = static_cast<unsigned char>( *c );
		if ( isdigit( uc ) ) {
			++digits;
			continue;
		}
		if ( ispunct( uc ) ) {
			++puncts;
			continue;
		}
		if ( isupper( uc ) )
			++uppers;
		if ( is_vowel( *c ) )
			++vowels;
	}

	if ( isupper( static_cast<unsigned char>( *word ) ) ) {
		// Nothing but capitals, digits, and punctuation: an acronym.
		if ( uppers + digits + puncts == len ) {
#			ifdef DEBUG_is_ok_word
			cerr << "(potential acronym)" << endl;
#			endif
			return true;
		}
		if ( double( uppers + digits ) / len >= 33 / 100.0 ) {
#			ifdef DEBUG_is_ok_word
			cerr << "(too many intermediate uppers)" << endl;
#			endif
			return false;
		}
	} else if ( uppers ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(intermediate uppers)" << endl;
#		endif
		return false;
	}

	if ( len < Word_Min_Size ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(len < Word_Min_Size)" << endl;
#		endif
		return false;
	}

	if ( !vowels ) {
#		ifdef DEBUG_is_ok_word
		cerr << "(no vowels)" << endl;
#		endif
		return false;
	}

	////////// Perform consecutive-character checks ///////////////////////

	int consec_consonants = 0;
	int consec_vowels = 0;
	int consec_same = 0;		// repeats seen after the first of a run
	char last_c = '\0';

	for ( c = word; *c; ++c ) {
		unsigned char const uc = static_cast<unsigned char>( *c );

		if ( isdigit( uc ) ) {
			consec_consonants = 0;
			consec_vowels = 0;
			last_c = '\0';	// consec_same doesn't apply to digits
			continue;
		}

		if ( ispunct( uc ) ) {
			// Punctuation ends a vowel/consonant run; last_c is
			// deliberately left alone here.
			consec_consonants = 0;
			consec_vowels = 0;
			continue;
		}

		if ( *c == last_c ) {
			//
			// consec_same counts repeats, not run length: a run
			// of N identical characters leaves it at N-1.
			//
			if ( ++consec_same > Word_Max_Consec_Same ) {
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec same)" << endl;
#				endif
				return false;
			}
		} else {
			consec_same = 0;
			last_c = *c;
		}

		if ( is_vowel( *c ) ) {
			if ( ++consec_vowels > Word_Max_Consec_Vowels ) {
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec vowels)" << endl;
#				endif
				return false;
			}
			consec_consonants = 0;
		} else {
			if ( ++consec_consonants > Word_Max_Consec_Consonants ){
#				ifdef DEBUG_is_ok_word
				cerr << "(exceeded consec consonants)" << endl;
#				endif
				return false;
			}
			consec_vowels = 0;
		}
	}

#	ifdef DEBUG_is_ok_word
	cerr << endl;
#	endif
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	char const *ltoa( long n )
//
// DESCRIPTION
//
//	Convert a long integer to a decimal string.  The string returned is
//	from an internal pool of static string buffers that are handed out in
//	rotation.  The time you get into trouble is if you hang on to more
//	than Num_Buffers strings: the oldest one is then overwritten.  This
//	doesn't normally happen in practice, however.
//
//	Every value of a long, including the most negative one, is converted
//	correctly.
//
// PARAMETERS
//
//	n	The number to be converted.
//
// RETURN VALUE
//
//	A pointer to the null-terminated string.  The caller must not free or
//	modify it.
//
// SEE ALSO
//
//	Brian W. Kernighan, Dennis M. Ritchie.  "The C Programming Language,
//	2nd ed."  Addison-Wesley, Reading, MA.  pp. 63-64.
//
//*****************************************************************************
{
	//
	// Buf_Size of 25 holds the sign, the 19 digits of a 64-bit long, and
	// the terminating null with room to spare.
	//
	enum { Buf_Size = 25, Num_Buffers = 10 };

	static char	buf[ Num_Buffers ][ Buf_Size ];
	static int	b;			// which buffer to use

	char *const	start = buf[ b ];
	char		*s = start;

	//
	// Work on the magnitude as an unsigned value.  Negating the most
	// negative long as a signed value overflows, whereas the conversion
	// to unsigned and the subtraction from zero are both well-defined and
	// yield the correct magnitude.
	//
	unsigned long	u = n;
	if ( n < 0 ) u = 0UL - u;

	do {					// generate digits in reverse
		*s++ = "0123456789"[ u % 10 ];
	} while ( u /= 10 );
	if ( n < 0 ) *s++ = '-';
	*s = '\0';

	// now reverse the string in place
	for ( char *t = start; t < s; ++t ) {
		char const tmp = *--s; *s = *t; *t = tmp;
	}

	b = (b + 1) % Num_Buffers;		// rotate to the next buffer

	return start;
}