File: libhspell.c

package info (click to toggle)
hspell 1.4-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 1,556 kB
  • sloc: ansic: 2,808; perl: 1,989; makefile: 209; sh: 109; awk: 15
file content (501 lines) | stat: -rw-r--r-- 13,426 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
/* Copyright (C) 2003-2012 Nadav Har'El and Dan Kenigsberg */

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dict_radix.h"

#include "hspell.h"
#include "linginfo.h"


/* Hspell uses a dictionary and several related files (prefix information,
   sizes information, stems, and a linguistic description file). It needs
   to know the path of the dictionary; suffixes are added to that path to
   get the names of the other files.

   The function hspell_set_dictionary_path() can be used before calling
   hspell_init() to determine where that function loads the dictionary
   from. hspell_get_dictionary_path() queries the current setting.

   Only the pointer is kept here (the string is not copied), and it starts
   out as the compile-time default DICTIONARY_BASE.
*/
static const char *hspell_dictionary = DICTIONARY_BASE;

/* Query the dictionary base path currently in effect: either the
   compile-time default, or whatever was last handed to
   hspell_set_dictionary_path(). The returned string is not a copy and
   must not be freed by the caller.
*/
const char *
hspell_get_dictionary_path(void)
{
	const char *current = hspell_dictionary;

	return current;
}

/* Choose the dictionary base path that later loads will use.
   The pointer itself is remembered (no copy is made), so the string
   passed in has to stay valid for as long as the setting is in use.
*/
void
hspell_set_dictionary_path(const char *path)
{
	const char *chosen = path;

	hspell_dictionary = chosen;
}


/* Global debug switch: when non-zero, the functions in this file print
   diagnostic messages to stderr.
   TODO: compile out debug code in production version... */
int hspell_debug=0;

/* Load the data files into a freshly created dictionary.

   dictp: out-parameter. On success it receives the dictionary created by
          new_dict_radix() and filled by read_dict() from the path held in
          hspell_dictionary. On failure it is set to NULL, so the caller is
          never left holding a pointer to an already deleted dictionary.

   Returns 0 on success, -1 if the dictionary couldn't be created or read.
   When hspell_debug is set, progress and the processor time used (as
   measured by clock()) are reported on stderr.
*/
static int
load_data(struct dict_radix **dictp)
{
	/* t1 is initialized so it is never read indeterminate, even if
	 * hspell_debug is switched on between the two checks below. */
	clock_t t1=0, t2;
	if(hspell_debug){
		fprintf(stderr,"Loading data files... ");
		t1=clock();
	}

	*dictp = new_dict_radix();
	/* presumably new_dict_radix() returns NULL when it cannot allocate;
	 * don't hand a null dictionary to read_dict() in that case. */
	if(!*dictp)
		return -1;
	if(!read_dict(*dictp, hspell_dictionary)){
		delete_dict_radix(*dictp);
		*dictp = NULL;
		return -1;
	}

	if(hspell_debug){
		t2=clock();
		fprintf(stderr,"done (%d ms).\n",
				(int)((t2-t1)/(CLOCKS_PER_SEC/1000)));
	}
	return 0;
}

/*
 * The prefix tree "prefix_tree" is built by build_prefix_tree, from a list of
 * known combinations of prefixes. Each prefix also has a mask that determines
 * to what kind of words it can be applied.
 *
 * The list of known prefixes and masks were defined in the prefixes[] and
 * masks[] arrays in prefixes.c. That file is automatically generated by the
 * genprefixes.pl program.
 */

#include "prefixes.c"

/* One node of the prefix tree. The path of letters leading from the root to
 * a node spells a prefix; the node's mask says which words it may precede.
 */
struct prefix_node {
	/* if a prefix has a certain 'mask', and lookup on a word returns
	 * 'val' (a bitmask of prefixes allowed for it), our prefix is
	 * allowed on this word if and only if (mask & val)!=0.
	 *
	 * This means that 'mask' defines the bits that this prefix "supplies"
	 * and the 'val' defined for a word is the bits this word insists on
	 * getting at least one of (i.e., val is the list of types of
	 * prefixes that are allowed for this word).
	 */
	int mask;
	/* Children, one slot per letter, indexed by (letter - first letter).
	 * A null slot means no known prefix continues with that letter.
	 * NOTE(review): the two character literals in the array bound show up
	 * as replacement characters in this copy; presumably they were the
	 * first and last single-byte (iso-8859-8) Hebrew letters - confirm
	 * the file encoding before compiling. */
	struct prefix_node *next['�'-'�'+1];
};
/* Root of the prefix tree. Filled in by build_prefix_tree() and released by
 * free_prefix_tree(); null while no tree has been built. */
static struct prefix_node *prefix_tree = 0;

/* Return the node stored in *slot, first allocating a zero-filled one if the
 * slot is still empty. Returns NULL (leaving the slot empty) when the
 * allocation fails. */
static struct prefix_node *
prefix_node_at(struct prefix_node **slot)
{
	if(!*slot)
		*slot=calloc(1,sizeof(**slot));
	return *slot;
}

/* Build (or extend) the global prefix_tree from the generated prefix lists.
 *
 * allow_he_hasheela: non-zero selects the prefixes_H[]/masks_H[] tables,
 *                    zero selects prefixes_noH[]/masks_noH[].
 *
 * Every prefix string is walked letter by letter from the root, creating
 * missing nodes on the way; the node reached at the end of the string gets
 * the mask with the same index as the prefix.
 *
 * If memory runs out the function stops early instead of dereferencing a
 * null node. The tree built so far stays in prefix_tree (so it can still be
 * released with free_prefix_tree()), but it is then incomplete.
 */
static void
build_prefix_tree(int allow_he_hasheela){
	int i;
	const char *p;
	struct prefix_node **n;
	char **prefixes;
	int *masks;
	if(allow_he_hasheela){
		prefixes=prefixes_H;
		masks=masks_H;
	} else {
		prefixes=prefixes_noH;
		masks=masks_noH;
	}

	/* the prefix list is terminated by a null pointer */
	for(i=0; prefixes[i]; i++){
		p=prefixes[i];
		n=&prefix_tree;
		if(hspell_debug)
			fprintf(stderr,"prefix %s ",p);
		while(*p){
			struct prefix_node *node=prefix_node_at(n);
			if(!node)
				goto nomem;
			/* descend into the child slot for this letter */
			n=&node->next[*p-'�'];
			p++;
		}
		/* define the mask (making sure the node exists). */
		if(!prefix_node_at(n))
			goto nomem;
		(*n)->mask=masks[i];

		if(hspell_debug)
			fprintf(stderr,"mask=%d\n",(*n)->mask);
	}
	return;

nomem:
	if(hspell_debug)
		fprintf(stderr,"out of memory while building prefix tree\n");
}

/* Release the prefix (sub)tree rooted at n: every child subtree is freed
 * first, then the node itself. A null n is accepted and ignored. */
static void
free_prefix_tree(struct prefix_node *n)
{
	const size_t nchildren = sizeof(n->next)/sizeof(n->next[0]);
	size_t child;

	if(n){
		for(child=0; child<nchildren; child++)
			free_prefix_tree(n->next[child]);
		free(n);
	}
}


/* hspell_check_word() checks whether a word is acceptable, trying every way
 * of splitting it into a known prefix (walked in prefix_tree) and a
 * dictionary word whose lookup() value shares a bit with the prefix's mask.
 *
 * dict:    the dictionary handed to lookup().
 * word:    the NUL-terminated word to check.
 * preflen: out-parameter; always equal to the number of bytes of 'word'
 *          that were stepped over (leading non-Hebrew bytes, '"' characters
 *          and prefix letters) when the function returns.
 *
 * Returns 1 if the word is accepted (this includes words without any Hebrew
 * letter, and a valid prefix followed by nothing), 0 if it is not recognized.
 *
 * NOTE(review): the Hebrew character literals and the Hebrew text in the
 * comments below show up as replacement characters in this copy of the
 * file - presumably an iso-8859-8 source that was mis-decoded; confirm.
 */
int
hspell_check_word(struct dict_radix *dict, const char *word, int *preflen)
{
	int hashebrew;
	const char *w=word;
	struct prefix_node *n;
	*preflen = 0;

	/* skip everything before the first Hebrew letter, counting the
	 * skipped bytes in *preflen: */
	hashebrew=0;
	while(*w){
		if(*w>='�' && *w<='�'){
			hashebrew=1;
			break;
		}
		(*preflen)++;
		w++;
	}
	if(!hashebrew)
		return 1; /* ignore (accept) empty words */


	/* walk the prefix tree one letter at a time; at every step try to
	 * look up the rest of the word (w) in the dictionary */
	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * �"����".  or ������ �"�����...".
		 * See the Academy's punctuation rules (see ������ ���, ���,
		 * ���"�) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			(*preflen)++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 *
		 * n!=prefix_tree means at least one prefix letter was already
		 * consumed, so reading w[-1] stays inside the word.
		 */
		if(n!=prefix_tree && *w=='�' && w[-1]!='�'){
			/* a single (undoubled) letter here is never looked
			 * up: only the doubled form is considered */
			if(w[1]=='�'){
				if(w[2]!='�' && (lookup(dict,w+1) & n->mask)){
					/* for example: ����� */
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					return 1;
				} else if(lookup(dict,w) & n->mask){
					/* for example: ����� */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					return 1;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) return 1; /* found word! */
		}

		/* try the next prefix... */
		if(*w>='�' && *w<='�'){
			n=n->next[*w-'�'];
			(*preflen)++;
			w++;
		} else {
			/* a non-Hebrew byte cannot continue a prefix */
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		return 1;
	} else
		return 0; /* unrecognized (misspelled) word */
}

/* hspell_enum_splits() enumerates the ways a word can be split into a known
 * prefix and a dictionary word. The code is a near copy of
 * hspell_check_word(), except that it does not stop at the first match.
 * TODO: use the same code for both functions.
 *
 * dict:  the dictionary handed to lookup().
 * word:  the NUL-terminated word to split.
 * enumf: callback, called once per split found as
 *        enumf(word, rest-of-word, preflen, prefix mask); whatever it
 *        returns is ignored here.
 *
 * Returns the number of times enumf was called, or -1 if the word contains
 * no Hebrew letter at all.
 *
 * NOTE(review): the Hebrew character literals and the Hebrew text in the
 * comments below show up as replacement characters in this copy of the
 * file - presumably an iso-8859-8 source that was mis-decoded; confirm.
 */
int hspell_enum_splits(struct dict_radix *dict, const char *word,
	hspell_word_split_callback_func *enumf)
{
	int preflen=0, count=0;

	int hashebrew;
	const char *w=word;
	struct prefix_node *n;

	/* skip everything before the first Hebrew letter, counting the
	 * skipped bytes in preflen: */
	hashebrew=0;
	while(*w){
		if(*w>='�' && *w<='�'){
			hashebrew=1;
			break;
		}
		preflen++;
		w++;
	}
	if(!hashebrew)
		return -1; /* ignore empty words */

	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"enum_splits looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * �"����".  or ������ �"�����...".
		 * See the Academy's punctuation rules (see ������ ���, ���,
		 * ���"�) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			preflen++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 *
		 * In every "found" case the split is reported through enumf
		 * and the walk then continues down the prefix tree, looking
		 * for further splits with a longer prefix.
		 * NOTE(review): those "found" cases index n->next[] with *w
		 * without the Hebrew-range test used at the bottom of the
		 * loop; this assumes a word that lookup() accepts always
		 * starts with a Hebrew letter - TODO confirm.
		 */
		if(n!=prefix_tree && *w=='�' && w[-1]!='�'){
			if(w[1]=='�'){
				if(w[2]!='�' && (lookup(dict,w+1) & n->mask)){
					/* NOTE(review): w is advanced here but
					 * preflen is only post-incremented once
					 * below, so from this point on preflen
					 * is one less than (w - word) - confirm
					 * whether that is intended. */
					w++;
					/* for example: ����� */
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'�']; w++;
					count++;
					continue;
				} else if(lookup(dict,w) & n->mask){
					/* for example: ����� */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'�']; w++;
					count++;
					continue;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "enum_splits: tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) {
				enumf(word, w, preflen++, n->mask);
				n=n->next[*w-'�']; w++;
				count++;
				continue;
			} /* found word! */
		}

		/* try the next prefix... */
		if(*w>='�' && *w<='�'){
			n=n->next[*w-'�'];
			preflen++;
			w++;
		} else {
			/* a non-Hebrew byte cannot continue a prefix */
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		enumf(word, w, preflen, n->mask);
		count++;
	} /* else
		return 0;  unrecognized (misspelled) word */
	if (hspell_debug) fprintf(stderr, "enum_splits found %d splits\n", count);
	return count;
}

/* In the past, we used to use snprintf for this splicing needed for
   hspell_trycorrect. But it turns out that snprintf, when given the %.*s
   format, counts locale "characters", and not bytes. When the locale was
   UTF8, this made it count wrong, despite us knowing here that we only
   deal with iso-8859-8. So let's implement this functionality on our own.

   This function splices together, into buf, the first s1len bytes of s1,
   then the characters c1 and c2 (each one is skipped if it is 0), and
   finally the string s2 (skipped if it is a null pointer).

   buf:   destination buffer.
   size:  total size of buf in bytes. The result is cut off so that it
          always fits together with its terminating NUL. If size is not
          positive (or buf is null) nothing is written at all.
   s1:    first source string; s1len is an upper bound, copying stops at
          s1's own end if that comes earlier. A negative s1len or a null
          s1 contributes nothing.
   c1,c2: single bytes inserted after the s1 part.
   s2:    tail string, or a null pointer for none.
*/
static inline void splice(char *buf, int size, const char *s1, int s1len,
		     char c1, char c2, const char *s2)
{
	int len=0;

	if(!buf || size<=0)
		return; /* no room, not even for the terminator */

	if(s1 && s1len>0){
		const char *nul;
		/* never more than what fits next to the terminator... */
		len = s1len<size-1 ? s1len : size-1;
		/* ...and never beyond the end of s1 itself, so that no
		 * NUL ends up in the middle of the result */
		nul = memchr(s1,'\0',(size_t)len);
		if(nul)
			len=(int)(nul-s1);
		memcpy(buf,s1,(size_t)len);
	}
	if(c1 && len<size-1)
		buf[len++]=c1;
	if(c2 && len<size-1)
		buf[len++]=c2;
	if(s2){
		size_t room=(size_t)(size-1-len);
		size_t n=strlen(s2);
		if(n>room)
			n=room; /* truncate the tail to what still fits */
		memcpy(buf+len,s2,n);
		len+=(int)n;
	}
	buf[len]='\0';
}

/* hspell_trycorrect() tries to find corrections for the word w.
 *
 * A series of candidate spellings is generated from w; every candidate that
 * hspell_check_word() accepts is added to the correction list with
 * corlist_add().
 *
 * dict: the dictionary used for checking the candidates.
 * w:    the NUL-terminated (misspelled) word.
 * cl:   the correction list receiving the accepted candidates.
 *
 * Candidates are built in a 30-byte buffer, so they are cut off after 29
 * bytes.
 *
 * NOTE(review): the Hebrew character and string literals below show up as
 * replacement characters in this copy of the file - presumably an
 * iso-8859-8 source that was mis-decoded; confirm before compiling.
 */
void
hspell_trycorrect(struct dict_radix *dict, const char *w, struct corlist *cl)
{
	char buf[30];
	int i;
	int len=strlen(w), preflen;
	/* groups of letters that are tried as replacements for one another */
	static char *similar[] = {"���", "��", "��", "��", "��", "��",
				  "��", "��", "��"};

/* Check the candidate currently in buf and record it if it is accepted.
 * preflen only receives hspell_check_word()'s output and is not used.
 * Note that the macro is not #undef'ed after this function. */
#define TRYBUF if(hspell_check_word(dict, buf, &preflen)) corlist_add(cl, buf)
	/* try to add a missing em kri'a - yud or vav */
	for(i=1;i<len;i++){
		splice(buf,sizeof(buf),w,i,'�',0,w+i);
		TRYBUF;
		splice(buf,sizeof(buf),w,i,'�',0,w+i);
		TRYBUF;
	}
	/* try to remove an em kri'a - yud or vav */
	/* NOTE: in hspell.pl the loop was from i=0 to i<len... */
	for(i=1;i<len-1;i++){
		if(w[i]=='�' || w[i]=='�'){
			splice(buf,sizeof(buf),w,i,0,0,w+i+1);
			TRYBUF;
		}
	}
	/* try to add or remove an aleph (is that useful?) */
	/* TODO: don't add an aleph next to yud or non-double vav,
	 * as it can't be an em kria there? */
	for(i=1;i<len;i++){
		splice(buf,sizeof(buf),w,i,'�',0,w+i);
		TRYBUF;
	}
	for(i=1;i<len-1;i++){
		if(w[i]=='�'){
			splice(buf,sizeof(buf),w,i,0,0,w+i+1);
			TRYBUF;
		}
	}
	/* try to replace similarly sounding (for certain people) letters:
	 */
	for(i=0;i<len;i++){
		int group;
		char *g;
		for(group=0; group< (sizeof(similar)/sizeof(similar[0]));
				group++){
			/* look for w[i] in this group; the loop body is
			 * empty, and the lone ';' on the next line is just
			 * one more empty statement */
			for(g=similar[group];*g && *g!=w[i];g++);
				;
			if(*g){
				/* character in group - try the other ones
				 * in this group! */
				for(g=similar[group];*g;g++){
					if(*g==w[i]) continue;
					/* first case: a two-letter sequence
					 * in w is replaced by the single
					 * letter *g; second case: *g is
					 * written out as two letters; third
					 * case: plain one-for-one swap.
					 * w[i+1] is at worst the
					 * terminating NUL. */
					if(i>0 && w[i]=='�' && w[i+1]=='�')
						splice(buf,sizeof(buf),w,i,*g,0,w+i+2);
					else if(*g=='�')
						splice(buf,sizeof(buf),w,i,'�','�',w+i+1);
					else
						splice(buf,sizeof(buf),w,i,*g,0,w+i+1);
					TRYBUF;
				}
			}
		}
	}
	/* try to replace a non-final letter at the end of the word by its
	 * final form and vice versa (useful check for abbreviations).
	 * len<sizeof(buf) guarantees that the strncpy below also copies
	 * the terminating NUL. */
	if(len>0 && len<sizeof(buf)){
		strncpy(buf,w,sizeof(buf));
		switch(w[len-1]){
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
			case '�': buf[len-1]='�'; break;
		}
		/* only worth checking if the last letter really changed */
		if(buf[len-1]!=w[len-1]){ TRYBUF; }
	}
	/* try to make the word into an acronym (add " before the last
	 * character) */
	if(len>=2){
		splice(buf,sizeof(buf),w,len-1,'"',w[len-1],0);
		TRYBUF;
	}
	/* try to make the word into an abbreviation (add ' at the end) */
	snprintf(buf,sizeof(buf), "%s'",w);
	TRYBUF;
}

/* hspell_init() reads the dictionary and initializes the necessary data
   structures, into a newly allocated structure returned through dictp.

   dictp: out-parameter receiving the dictionary.
   flags: bitwise OR of HSPELL_OPT_* values. HSPELL_OPT_HE_SHEELA selects
          which prefix tables the prefix tree is built from, and (only when
          compiled with USE_LINGINFO) HSPELL_OPT_LINGUISTICS additionally
          loads the linguistic information.

   hspell_init() returns 0 on success, or negative numbers on errors:
   -1: cannot read dictionary, or cannot load the linguistic information.

   If the linguistic information cannot be loaded, the dictionary that was
   already read is deleted again and *dictp is set to NULL, so a caller that
   reacts to the error by giving up does not leak it.
*/
int
hspell_init(struct dict_radix **dictp, int flags){
	int ret;
	ret=load_data(dictp);
	if(ret<0) return ret;
	build_prefix_tree(flags & HSPELL_OPT_HE_SHEELA);
#ifdef USE_LINGINFO
	if (flags & HSPELL_OPT_LINGUISTICS) {
		if (!linginfo_init(hspell_dictionary)) {
			/* The global prefix tree is deliberately left in
			 * place: it stays reachable through prefix_tree and
			 * is released by the next hspell_uninit(). */
			delete_dict_radix(*dictp);
			*dictp=NULL;
			return -1;
		}
	}
#endif
	return 0;
}

/* TODO: hspell_init should use a new "hspell_context" structure, not
   dict_radix. Because we might want to add more things like user dictionary.
   The prefix tree should also sit in the hspell_context, instead of
   being a global variable: the current mishmash of globals and non-globals
   is ugly.
   Linginfo's global variables (see linginfo_init and linginfo_free)
   should also be in this context.
*/

/* hspell_uninit() undoes the effects of hspell_init, freeing memory that
   was allocated during initialization. The dict pointer passed is no
   longer valid after this call, and should not be used (i.e., hspell_uninit()
   has similar semantics to free()).
*/
void
hspell_uninit(struct dict_radix *dict)
{
	delete_dict_radix(dict);
	/* free prefix tree. Too bad this is a global variable, and not
	   something in a "context" given to us as a parameter. */
	free_prefix_tree(prefix_tree);
	prefix_tree=0;
#ifdef USE_LINGINFO
	linginfo_free();
#endif
}