/*
------------------------------------------------------------------------------
A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use. The copyright notice included with the software
must be maintained in all copies produced.
THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE. THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.
Copyright (c) 1995, 1996, John Conover, All Rights Reserved.
Comments and/or bug reports should be addressed to:
john@johncon.com (John Conover)
------------------------------------------------------------------------------
uppercase.c, uppercase transliteration
unsigned char *make_uppercase (void);
and:
unsigned char *uppercase = (unsigned char *) 0;
allocate a global array, of size MAX_ALPHABET_SIZE, and of type
unsigned char, named uppercase[], constructed in such a manner
that the implicit index of any element in the array contains the
toupper() of the index value (ie., it is a look up table for
uppercase characters.)
note: bmhsearch() in bmhsearch.c reserves the '\0' character as an
end of search sentinel in the pattern-this means that array element
0 can NOT contain a '\0', so a space is stored there instead
note: care must be exercised when using this array on systems
where the native type of char is signed, for example:
signed char ch;
unsigned char cu;
cu = uppercase[ch];
will not give the desired results when ch is negative, since ch
would then index a negative section of the array (which does not
exist). Particularly meticulous usage of lint is advisable.
The objective of this technique is to provide an alternative to
using toupper() on every character in large documents-implicit
indexing is very fast, and once the uppercase array has been set
up, uppercase transliteration of documents can be made very
quickly. As a related issue, it should be, relatively, portable to
other locale.h environments. It is probably important to note that
the characters in the infix search criteria and any documents
should both be transliterated using this array so that the
character sets for both are identical.
Other transliterations can be placed in the array, for example,
tabs and newlines can be converted to spaces (which is the
current implementation), as could punctuation such as commas and
periods. A possible scenario would be to use ispunct(), isspace(),
isprint(), etc. in the for loop to alter the transliteration to
whatever is desired-be advised that portability issues may ensue
if the scheme is not compatible with locale.h in some languages. A
possible alternative would be to implement a command line switch
for various languages, to avoid localization and portability
conflicts.
The algorithm is as follows:
allocate space for the array
for each element in the array, store the toupper() of the index of
the element
Usage is a single call to allocate and scan the array, for example:
unsigned char my_array_of_uppercase[],
*my_ptr;
if (make_uppercase () == (unsigned char *) 0)
{
(void) printf ("error installing uppercase array\n");
}
while (something)
{
my_array_of_uppercase[something] = uppercase[(int) *my_ptr];
}
For a detailed description of using implicit addressing for character
transliteration, see "Information Retrieval: Data Structures &
Algorithms," William B. Frakes, Ricardo Baeza-Yates, Editors, Prentice
Hall, Englewood Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 102.
There are no arguments
On any error, return null, else return a reference to the array of
uppercase letters, uppercase[]
MAX_ALPHABET_SIZE is defined in uppercase.h
To test this module, compile the module source with -DTEST_UPPERCASE
$Revision: 1.2 $
$Date: 1996/09/13 13:47:23 $
$Id: uppercase.c,v 1.2 1996/09/13 13:47:23 john Exp $
$Log: uppercase.c,v $
Revision 1.2 1996/09/13 13:47:23 john
Added handling of circularly linked directories and subdirectories in searchpath.c
Cosmetic changes to bmhsearch.c, postfix.c, rel.c, searchfile.c, translit.c, uppercase.c, version.c.
Revision 1.1 1996/02/08 02:55:10 john
Added hyphenation, backspace, and multiple whitespace capability.
Changes to files: uppercase.c translit.c searcfile.c rel.c and version.c-required for hyphenation, backspace, and multiple whitespace capability.
* Revision 1.0 1995/04/22 05:13:18 john
* Initial revision
*
*/
#include "rel.h"
/* RCS identification strings for this module and for its header */
#ifndef LINT /* include rcsid only if not running lint */
static char rcsid[] = "$Id: uppercase.c,v 1.2 1996/09/13 13:47:23 john Exp $"; /* module version */
static char rcsid_h[] = UPPERCASE_H_ID; /* module include version; UPPERCASE_H_ID is not defined in this file, presumably it comes in through rel.h-verify */
#endif
/* global uppercase lookup table: null until make_uppercase () has allocated and filled it; index it with unsigned character values only */
unsigned char *uppercase = (unsigned char *) 0; /* reference to uppercase array */
/*
Note: all whitespace characters, as determined by the function
isspace(3), are considered spaces. The whitespace characters are
derived from locale.h and ctype.h and are locale specific. The single
or multiple whitespace characters are used to address phrase searching
issues in translit.c. In the "C" locale, the set of whitespace
characters is the space, the form feed, new-line, carriage return,
horizontal tab, and vertical tab. In other locales, other
implementation-defined characters may be added to this set, provided
they do not test true for isalnum(3).
*/
#ifdef __STDC__
unsigned char *make_uppercase (void)
#else
unsigned char *make_uppercase ()
#endif
{
int uppercase_error = URMEM_ERR, /* module error value, assume error allocating memory */
i;
if ((uppercase = (unsigned char *) memalloc (MAX_ALPHABET_SIZE * sizeof (unsigned char))) != (unsigned char *) 0) /* allocate */
{
uppercase_error = NO_ERROR; /* assume no errors */
uppercase[0] = (unsigned char) ' '; /* the null character is reserved as an end of search sentinel */
for (i = 1; i < MAX_ALPHABET_SIZE; i++) /* for each remaining character in the uppercase array */
{
if (isspace (i)) /* character whitespace? */
{
uppercase[i] = (unsigned char) ' '; /* yes, make sure the character is a space */
}
else
{
uppercase[i] = (unsigned char) toupper (i); /* no, convert the character to uppercase */
}
}
}
if (uppercase_error != NO_ERROR) /* pending error? */
{
message (uppercase_error, (char *) 0); /* yes, print the error */
}
return (uppercase); /* return a reference to the uppercase array, null if error */
}
#ifdef TEST_UPPERCASE

/*

stand alone test driver for make_uppercase (), compiled only when
TEST_UPPERCASE is defined: build the table, then print every element
to stdout, one per line, as "uppercase[index] = value" in decimal

exits with status 0 on success, or with status 1, after a diagnostic on
stderr, if the table could not be set up

with this driver compiled in, lint reports

    declared global, could be static
        uppercase uppercase.c(xxx)
        make_uppercase uppercase.c(yyy)

*/

#ifdef __STDC__

int main (void)

#else

int main ()

#endif
{
    int index = 0; /* table element being printed */

    if (make_uppercase () == (unsigned char *) 0)
    {
        /* nothing to dump without a table */
        (void) fprintf (stderr, "error allocating uppercase array\n");
        exit (1);
    }

    while (index < MAX_ALPHABET_SIZE)
    {
        (void) printf ("uppercase[%d] = %d\n", index, (int) uppercase[index]);
        index++;
    }

    exit (0); /* success */

#ifdef LINT /* include only if running lint */

    return (0); /* never reached, keeps lint quiet */

#endif

}

#endif
|