1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
/*
mmorph, MULTEXT morphology tool
Version 2.3, October 1995
Copyright (c) 1994,1995 ISSCO/SUISSETRA, Geneva, Switzerland
Dominique Petitpierre, <petitp@divsun.unige.ch>
*/
/*
tbl.c
procedure to lookup words in the database
input and output are in Multext record/field format
should be changed to use an API
*/
#include <ctype.h>
#include "user.h"
/* max number of relevant fields in a line */
#define MAX_FIELD 4
/* size of the buffer that receives the word to look up (see lookup_tbl) */
#define MAXSTRLNG 128
/* the line buffer is first allocated with, and then grown by, this size */
#define LINE_CHUNK 128
/* buffer holding the record being processed; allocated in lookup_tbl */
static char *line = NULL;
/* number of chars currently allocated for line */
static int max_line_size;
/* the class table is first allocated with, and then grown by, this size */
#define CLASS_CHUNK 10
/* sorted table of the class names for which a lookup is performed */
static char **class_table = NULL;
/* number of entries currently allocated for class_table */
static int max_class_table_size;
/* number of entries of class_table actually in use */
static int class_table_size;
/* save a slot for end of last field */
/* field[i] points to the start of field i inside line */
static char *field[MAX_FIELD + 1];
/* number of fields found in the current record (at most MAX_FIELD) */
static int field_card;
/*
    set by get_tbl_segment: TRUE if the last record it skipped had the
    class begin_sentence_class
*/
static t_boolean begin_sentence;
/*
    Write the current record to the output as it was read: the fields
    separated by FIELD_SEP and terminated by RECORD_SEP.

    field_no: index of the last field that has to be printed even if the
    record has fewer fields. All the fields found in the record
    (field_card of them) are printed in any case. field[] has
    MAX_FIELD + 1 slots, so field_no should not exceed MAX_FIELD.
*/
static void
print_tbl(field_no)
    int field_no;
{
    int idx;
    int final_idx;

    /* print up to whichever is further: the requested or the last field */
    final_idx = field_card - 1;
    if (field_no > final_idx)
        final_idx = field_no;

    /* copy the line as is, no lookup, no annotations */
    print_out("%s", field[0]);
    idx = 1;
    while (idx <= final_idx) {
        print_out("%c%s", FIELD_SEP, field[idx]);
        idx++;
    }
    print_out("%c", RECORD_SEP);
}
/**
    Comparison function used by qsort and bsearch on class_table.
    Both arguments point to a pointer to a string; the strings
    themselves are compared with strcmp.
*/
static int
#if defined(__STDC__)
class_cmp(const T_PTR class1, const T_PTR class2)
#else
class_cmp(class1, class2)
    T_PTR class1;
    T_PTR class2;
#endif
{
    char *name1;
    char *name2;

    /* each argument is the address of a table slot holding a string */
    name1 = *((char **) class1);
    name2 = *((char **) class2);
    return strcmp(name1, name2);
}
/*
    Read a record up to RECORD_SEP (or end of file) into the buffer
    "line" and divide it in place in at most MAX_FIELD fields separated
    by FIELD_SEP. Separators found after the MAX_FIELD-th field starts
    are kept as ordinary characters of that last field.

    Records that need no lookup are written out unchanged with
    print_tbl and skipped. While skipping, begin_sentence is set
    according to whether the last skipped record has the class
    begin_sentence_class.

    On success return TRUE, with the form field (field form_field) in
    *segment and, in *segment_size, the distance from the start of that
    field to the start of the next field slot (never less than the
    length of the form).
    Return FALSE when the end of the input is reached without finding
    a record to look up.
*/
static t_boolean
get_tbl_segment(infile, segment, segment_size)
    FILE *infile;
    char **segment;      /* output */
    int *segment_size;   /* output */
{
    char *s;
    char *last_s;
    int c;
    int field_no;
    int i;
    size_t field_offset[MAX_FIELD];  /* field starts relative to line */
    t_boolean irrelevant;

    /*
       read a line and extract the word to lookup. fgets is not used because
       it takes the newline in.
     */
    begin_sentence = FALSE;    /* reset sentence boundary flag */
    do {
        s = line;
        field[0] = line;
        field_no = 1;
        last_s = line + max_line_size;
        while ((c = getc(infile)) != EOF
               && (c != (int) RECORD_SEP)) {
            if (c == (int) FIELD_SEP && field_no < MAX_FIELD) {
                *s++ = NUL_CHAR;
                field[field_no++] = s;
            }
            else
                *s++ = (char) c;
            if (s >= last_s) {
                /*
                   the buffer is full: grow it. The reallocation may move
                   the buffer, so the starts of the fields found so far
                   are saved as offsets and rebased afterwards; otherwise
                   field[] would keep pointing into the released buffer.
                 */
                for (i = 0; i < field_no; i++)
                    field_offset[i] = (size_t) (field[i] - line);
                max_line_size += LINE_CHUNK;
                MY_REALLOC(line, max_line_size, char);
                /* line address might have changed */
                s = line + max_line_size - LINE_CHUNK;
                last_s = line + max_line_size;
                for (i = 0; i < field_no; i++)
                    field[i] = line + field_offset[i];
            }
        }
        *s = NUL_CHAR;
        /* fill the table, just in case */
        field_card = field_no;
        for (; field_no <= MAX_FIELD; field_no++)
            field[field_no] = s;
        /*
           no lookup occurs if - the form field is empty, - the morph field
           is not empty and we are not extending nor overwriting,
           - the class field is not in the class set
         */
        irrelevant = (*field[morph_field] && !(extend_morph_field
                                               || overwrite_morph_field))
            || !*field[form_field]
            || (bsearch((T_PTR) (field + class_field),
                        (T_PTR) class_table,
                        (size_t) class_table_size,
                        sizeof(char *), class_cmp)
                == NULL);
        /* an empty read at end of file is not a record: do not echo it */
        if (irrelevant && !(c == EOF && s == line)) {
            begin_sentence = !strcmp(field[class_field],
                                     begin_sentence_class);
            print_tbl(0);
        }
    } while (irrelevant && c != EOF);
    if (c == EOF && s != line)
        print_warning("input file does not terminate with a newline");
    if (irrelevant)
        return (FALSE);        /* EOF */
    else {
        *segment = field[form_field];
        *segment_size = field[form_field + 1] - field[form_field];
        return (TRUE);
    }
}
/*
    Copy the NUL terminated string source to target, replacing every
    uppercase character by its lowercase equivalent.
    The classification and conversion are done by isupper and tolower,
    so they follow the current locale (LC_CTYPE).
    target must have room for the whole string and its terminator.
    Return TRUE if at least one character was converted, FALSE otherwise.
*/
t_boolean
fold_case(source, target)
    unsigned char *source;
    unsigned char *target;
{
    register unsigned char *in;
    register unsigned char *out;
    t_boolean any_upper;

    any_upper = FALSE;
    in = source;
    out = target;
    while (*in) {
        int ch;

        ch = (int) *in++;
        if (isupper(ch)) {
            any_upper = TRUE;
            ch = tolower(ch);
        }
        *out++ = (unsigned char) ch;
    }
    *out = (unsigned char) NUL_CHAR;
    return (any_upper);
}
void
lookup_tbl(infile, lookup_classes)
FILE *infile;
char *lookup_classes;
{
char *segment;
t_letter surface_lex[MAXSTRLNG];
char *p;
int segment_size;
int i;
t_boolean found;
t_boolean folded;
t_boolean extend;
/* separator are comma, backslash, tab and space (and newline). Comma and
space should not be there strictly, but are unlikely to appear in a
class name
*/
#define SEPARATORS " |,\\\t\n"
if (class_table == NULL) {
max_class_table_size = CLASS_CHUNK;
MY_CALLOC(class_table, max_class_table_size, char *);
}
if (line == NULL) {
max_line_size = LINE_CHUNK;
MY_CALLOC(line, max_line_size, char);
}
/* parse and store the lookup classes in a table, sorted */
p = strtok(lookup_classes, SEPARATORS);
class_table_size = 0;
while (p != NULL) {
if (class_table_size >= max_class_table_size) {
max_class_table_size += CLASS_CHUNK;
MY_REALLOC(class_table, max_class_table_size, char *);
}
class_table[class_table_size++] = p;
p = strtok((char *) NULL, SEPARATORS);
}
if (class_table_size == 0)
fatal_error("lookup classes are not correctly specified: \"%s\"",
lookup_classes);
qsort((T_PTR) class_table, (size_t) class_table_size, sizeof(char *),
class_cmp);
/* check each line for lookup */
while (get_tbl_segment(infile, &segment, &segment_size)) {
if (segment_size >= MAXSTRLNG) {
fatal_error("word too long (max %d):\n %s",
MAXSTRLNG - 1, segment);
}
for (i = 0; i < morph_field; i++)
print_out("%s%c", field[i], FIELD_SEP);
extend = (*field[morph_field] && ! overwrite_morph_field);
if (extend)
print_out("%s", field[morph_field]);
if (fold_case_always) {
folded = fold_case((unsigned char *) segment,
(unsigned char *) surface_lex);
found = map_letter((t_str) surface_lex, surface_lex,
Surface_Letter)
&& db_forms_lookup_tbl(surface_lex, extend, folded);
}
else {
folded = FALSE;
found = map_letter(segment, surface_lex, Surface_Letter)
&& db_forms_lookup_tbl(surface_lex, extend, folded);
}
if (!fold_case_always
&& ((begin_sentence && isupper((int) ((unsigned char) *segment)))
|| (fold_case_fallback && !found))) {
folded = fold_case((unsigned char *) segment,
(unsigned char *) surface_lex);
if (folded)
found |= (map_letter((char *) surface_lex, surface_lex,
Surface_Letter)
&& db_forms_lookup_tbl(surface_lex,
(extend || found),
TRUE));
}
if (!found && !extend && mark_unknown)
print_out("%s", UNKNOWN_MORPH);
for (i++; i < field_card; i++)
print_out("%c%s", FIELD_SEP, field[i]);
print_out("%c", RECORD_SEP);
if (want_flush)
flush_out();
}
}
|