
|
/*
------------------------------------------------------------------------------
A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use. The copyright notice included with the software
must be maintained in all copies produced.
THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE. THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.
Copyright (c) 1995, 1996, John Conover, All Rights Reserved.
Comments and/or bug reports should be addressed to:
john@johncon.com (John Conover)
------------------------------------------------------------------------------
translit.c, transliterate a page
ssize_t transliterate (unsigned char *page, ssize_t count);
translate count many characters in page using uppercase as a
translation table
note: the sole reason for breaking this module out of searchfile()
is to provide a means of manipulating the content of a file that
is being searched-the rules are:
1) the search area must start at page[0], (but can constitute
a smaller area of the page data space,) and the search area
must end a ' ' character; it is a requirement of bmhsearch(),
in bmhsearch.c, that the '\0' character is reserved as an end
of search sentinel in the pattern-failure to observe this rule
will result in a program that is erratic and either hangs
forever, or perhaps does a core dump of a very involved data
structure, that is very difficult to analyze-see also
uppercase.c and bmhsearch.c
2) the return value, count, must be the size of the data
space, in page to be searched, *_NOT_* including the last ' '
character
In conjunction with uppercase.c, hyphenation, backspace and
underlining, and phrase searching are addressed:
1) hyphenation could be implemented by omitting a '-' followed by
any number of white space characters
2) if the program is used primarily for searching catman pages,
the backspace and underlining features that are incorporated in
the man page system can be defeated by deleting the "backspace
character" sequences from the documents.
3) phrase searching could be enhanced by translating any number of
whitespace characters into a single ' ' character-the "\ " search
phrase would then be interpreted as any number of white space
characters. See uppercase.c for comments concerning whitespace,
and locale specific issues.
Note that main() in rel.c calls transliterate() in translit.c to
transliterate the query/search criteria-if an exact match is
specified, both the pattern and the data would be altered in
exactly the same manner, and appropriate matches found even though
both were translated, (although additional patterns could
conceivably be matched, the originals will be found,
regardless,) for example, the data:
... re-engineering ...
or hyphenated:
... re-
engineering ...
would become reengineering, which could be found by any of the
query patterns:
re
engineering
reengineering
re-engineering
Likewise for multiple space compression in phrase query patterns.
Quite probably, such scenarios should be controlled by command
line options, perhaps via a language selection to avoid
localization and portability conflicts.
The algorithm is as follows:
for each character in the page
replace the character with its equivalent in uppercase[]
Usage is a call with page referencing the first character to be
translated, and count the number of characters to be translated,
for example:
count = transliterate (page, count + 2);
There are no errors, and the number of characters translated is
returned
To test this module, compile the module source with -DTEST_TRANSLIT
$Revision: 1.2 $
$Date: 1996/09/13 13:47:23 $
$Id: translit.c,v 1.2 1996/09/13 13:47:23 john Exp $
$Log: translit.c,v $
Revision 1.2 1996/09/13 13:47:23 john
Added handling of circularly linked directories and subdirectories in searchpath.c
Cosmetic changes to bmhsearch.c, postfix.c, rel.c, searchfile.c, translit.c, uppercase.c, version.c.
Revision 1.1 1996/02/08 02:55:10 john
Added hyphenation, backspace, and multiple whitespace capability.
Changes to files: uppercase.c translit.c searcfile.c rel.c and version.c-required for hyphenation, backspace, and multiple whitespace capability.
* Revision 1.0 1995/04/22 05:13:18 john
* Initial revision
*
*/
#include "rel.h"
#ifndef LINT /* the identification strings below are compiled in only when LINT is not defined, i.e., when not running lint */
static char rcsid[] = "$Id: translit.c,v 1.2 1996/09/13 13:47:23 john Exp $"; /* RCS identification string for this module */
static char rcsid_h[] = TRANSLIT_H_ID; /* identification string for the module's include file; TRANSLIT_H_ID is not defined in this file (presumably it comes in through rel.h) */
#endif
/*
Note: the heuristics for addressing hyphenation issues are as follows:
if a hyphen is found while transliterating the page:
skip the hyphen, and any following whitespace or other
hyphens, up to the first character that is neither whitespace
nor a hyphen, which will collapse consecutive instances of
whitespace and hyphens into nothing.
Note: the heuristics for addressing the backspace character are as
follows:
if a backspace character is found while transliterating the page:
skip the backspace, and overwrite the character before the
backspace with the character after the backspace, which will
instantiate the character of the last instance of
consecutive backspace/character combinations. This is
specifically for catman pages which utilize
underscore/backspace/character combinations for underlining,
in addition to backspace/character combinations for bold
representation-note that for this process to be successful,
the underscore must precede the character in the sequence.
Note: the heuristics for addressing phrase issues are as follows:
if a whitespace character is found while transliterating the page:
and if the previous character found while transliterating the
page is also whitespace, skip the second instance of the
whitespace character, which will collapse consecutive
instances of whitespace characters into a single space.
*/
/*

transliterate () rewrites the first count characters of page, in
place, and returns the number of characters in the rewritten page,
(which is never larger than count.)

Every character read is first mapped through the translation table,
uppercase[], (declared outside of this file,) and the mapped character
then selects one of the following actions:

    '-':     the hyphen, and every ' ' or '-' that follows it, is
             dropped; the first character that is neither is kept

    '\b':    the backspace is dropped, and the character that follows
             it replaces the last character kept; a backspace that has
             no character before it in the output, or no character
             after it in the input, is simply dropped

    ' ':     kept only if the previous character processed was not
             also a ' '

    other:   kept

Only the literal ' ' is tested for as whitespace-presumably
uppercase[] maps the other whitespace characters to ' ', see
uppercase.c to confirm.

The output is compacted toward page[0]; the write index never passes
the read index, so the translation can be done in the caller's buffer.

A count of zero, or less, leaves page unaltered and returns zero.

*/

#ifdef __STDC__

ssize_t transliterate (unsigned char *page, ssize_t count)

#else

ssize_t transliterate (page, count)
unsigned char *page;
ssize_t count;

#endif

{
    unsigned char last_char = (unsigned char) '\0', /* last character processed, after translation */
        current_char; /* current character, after translation */

    ssize_t in, /* index of the character being read from the page */
        out = 0; /* index of the next character to be written to the page, which is also the size of the output */

    for (in = 0; in < count; in++) /* for each character in the page */
    {
        current_char = page[out] = (unsigned char) uppercase[(int) page[in]]; /* translate the character, tentatively storing it at the output position */

        switch ((int) current_char) /* what is the current character in the page? */
        {

            case (int) '-': /* hyphenation? */

                for (in++; in < count; in++) /* yes, skip the hyphen, and for each character following the hyphen */
                {
                    current_char = page[out] = (unsigned char) uppercase[(int) page[in]]; /* translate the character */

                    if (current_char != (unsigned char) ' ' && current_char != (unsigned char) '-') /* character neither a space nor a hyphen? */
                    {
                        out++; /* yes, keep it, and resume normal processing */
                        break;
                    }

                }

                break;

            case (int) '\b': /* backspace? */

                /*

                the replacement is made only if there is a character in
                the output to replace, and a character in the input to
                replace it with; otherwise, the write would land before
                page[0], or the read would land at, or after,
                page[count]

                */

                if (out > 0 && in + 1 < count) /* yes, something to overwrite, and something to overwrite it with? */
                {
                    in++; /* yes, skip the backspace; next character in the page */
                    current_char = page[out - 1] = (unsigned char) uppercase[(int) page[in]]; /* replace the last character kept with the translated character */
                }

                break;

            case (int) ' ': /* space? */

                if (last_char != (unsigned char) ' ') /* yes, last character processed not a space? */
                {
                    out++; /* yes, keep the first space of a run of spaces */
                }

                break;

            default:

                out++; /* keep any other character */
                break;

        }

        last_char = current_char; /* the current character is the last character for the next iteration */
    }

    return (out); /* return the size of the transliterated page */
}
#ifdef TEST_TRANSLIT

/*

simple exerciser for testing transliterate (); get a string from
stdin, transliterate it, and print it to stdout; ignore the:

declared global, could be static
    transliterate       translit.c(xx)

from lint

Input is read with fgets (), which is bounded by the size of the
buffer, (gets () is not, and is no longer part of standard C); the
trailing newline that fgets () retains is removed before
transliterating, so the output is the same as it was with gets (). A
line longer than the buffer is processed in buffer sized pieces.

The exit status is 0 on end of input, and 1 if the uppercase array
could not be set up.

*/

#ifdef __STDC__

int main (void)

#else

int main ()

#endif

{
    unsigned char buffer[BUFSIZ]; /* buffer to be parsed */

    size_t length; /* length of the input string */

    ssize_t i; /* length of transliterated buffer */

    if (make_uppercase () == (unsigned char *) 0) /* setup the uppercase array */
    {
        (void) fprintf (stderr, "error making uppercase array\n"); /* couldn't setup the uppercase array, print the error */
        exit (1); /* and exit */
    }

    while (fgets ((char *) buffer, (int) sizeof (buffer), stdin) != (char *) 0) /* input the string to be transliterated */
    {
        length = strlen ((char *) buffer); /* length of the string, including any trailing newline */

        if (length > 0 && buffer[length - 1] == (unsigned char) '\n') /* string ends in a newline? */
        {
            length--; /* yes, remove it, as gets () would have */
            buffer[length] = (unsigned char) '\0';
        }

        i = transliterate (buffer, (ssize_t) length); /* transliterate the buffer */
        buffer[i] = (unsigned char) '\0'; /* terminate the transliterated buffer with an EOS for printing */
        (void) printf ("%s\n", buffer); /* print the transliterated buffer */
    }

    exit (0); /* return success */

#ifdef LINT /* include only if running lint */

    return (0); /* for LINT formality */

#endif

}

#endif
|