File: libhspell.c

package info (click to toggle)
hspell 1.4-3.1
links: PTS, VCS
area: main
in suites: bookworm, bullseye, forky, sid, trixie
size: 1,556 kB
sloc: ansic: 2,808; perl: 1,989; makefile: 209; sh: 109; awk: 15
file content (501 lines) | stat: -rw-r--r-- 13,426 bytes
parent folder | download | duplicates (4)
/* Copyright (C) 2003-2012 Nadav Har'El and Dan Kenigsberg */

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dict_radix.h"

#include "hspell.h"
#include "linginfo.h"


/* Hspell uses a dictionary, and several related files (prefix information,
   sizes information, stems, and linguistic description file). It needs
   to know the path of the dictionary, and to that path it adds suffixes to
   get the names of the other files.

   The function hspell_set_dictionary_path() can be used before calling
   hspell_init() to determine where that function loads the dictionary
   from. hspell_get_dictionary_path() queries the current setting.

   The initial value is DICTIONARY_BASE, a macro that is not defined in this
   file (presumably it comes from a header or the build system - confirm).
*/
static const char *hspell_dictionary = DICTIONARY_BASE;

/* Return the dictionary base path currently in effect. The value returned
   is the stored pointer itself, not a copy of the string. */
const char *
hspell_get_dictionary_path(void)
{
	const char *current_path = hspell_dictionary;

	return current_path;
}

/* Make 'path' the dictionary base path used from now on. Only the pointer
   is stored - the string is not copied - so the caller has to keep the
   string alive for as long as the setting may still be read. */
void
hspell_set_dictionary_path(const char *path)
{
	hspell_dictionary = path;
}


/* Debug switch: when non-zero, the functions in this file print progress
   and trace messages to stderr. The variable has external linkage, so it
   can be set from outside this file.
   TODO: compile out debug code in production version... */
int hspell_debug=0;

/* Load the data files. Returns 0 on success, -1 if couldn't read (or
   couldn't allocate) the dictionary.

   On success *dictp points to the newly loaded dictionary. On failure
   nothing remains allocated and *dictp is set to NULL, so the caller is
   never left holding a pointer to an already-deleted dictionary.

   When hspell_debug is set, the time spent loading is reported on stderr.
*/
static int
load_data(struct dict_radix **dictp)
{
	/* t1 is initialized so that it is never read uninitialized, even if
	 * hspell_debug is switched on while we are loading. */
	clock_t t1=0, t2;
	if(hspell_debug){
		fprintf(stderr,"Loading data files... ");
		t1=clock();
	}

	*dictp = new_dict_radix();
	/* defensive: don't hand a null dictionary to read_dict() in case
	 * the allocation failed. */
	if(!*dictp)
		return -1;
	if(!read_dict(*dictp, hspell_dictionary)){
		delete_dict_radix(*dictp);
		*dictp = NULL; /* don't leave a dangling pointer behind */
		return -1;
	}

	if(hspell_debug){
		t2=clock();
		/* Convert in floating point: the integer expression
		 * CLOCKS_PER_SEC/1000 is zero (division by zero) when there
		 * are fewer than 1000 clock ticks per second. */
		fprintf(stderr,"done (%d ms).\n",
				(int)((double)(t2-t1)*1000.0/CLOCKS_PER_SEC));
	}
	return 0;
}

/*
 * The prefix tree "prefix_tree" is built by build_prefix_tree, from a list of
 * known combinations of prefixes. Each prefix also has a mask that determines
 * to what kind of words it can be applied.
 *
 * The lists of known prefixes and masks are defined in the prefixes_H[] /
 * masks_H[] and prefixes_noH[] / masks_noH[] arrays in prefixes.c. That file
 * is automatically generated by the genprefixes.pl program.
 */

#include "prefixes.c"

/* One node of the prefix tree. The path of letters leading from the root to
 * a node spells a prefix; the node's children continue that prefix by one
 * more letter.
 */
struct prefix_node {
	/* if a prefix has a certain 'mask', and lookup on a word returns
	 * 'val' (a bitmask of prefixes allowed for it), our prefix is
	 * allowed on this word if and only if (mask & val)!=0.
	 *
	 * This means that 'mask' defines the bits that this prefix "supplies"
	 * and the 'val' defined for a word is the bits this word insists on
	 * getting at least one of (i.e., val is the list of types of
	 * prefixes that are allowed for this word).
	 */
	int mask;
	/* Children, indexed by a letter's offset from a base letter (see
	 * the indexing in build_prefix_tree); a null entry means that no
	 * known prefix continues with that letter.
	 *
	 * NOTE(review): both character literals in the array size are empty
	 * in this copy of the file. They look like non-ASCII letters
	 * (presumably the first and last ISO-8859-8 Hebrew letters) that were
	 * lost in a character-set conversion - restore them from a pristine
	 * copy of the file.
	 */
	struct prefix_node *next[''-''+1];
};
/* Root of the prefix tree: null until build_prefix_tree() creates it. */
static struct prefix_node *prefix_tree = 0;

/* Build the global prefix tree (prefix_tree) from a table of prefixes and
 * the parallel table of their masks.
 *
 * allow_he_hasheela: when non-zero the prefixes_H/masks_H tables are used,
 * otherwise prefixes_noH/masks_noH.
 *
 * Nodes are added to whatever prefix_tree already contains; the function
 * does not free or reset an existing tree first.
 *
 * NOTE(review): the results of calloc() are not checked, so an allocation
 * failure leads to a null pointer dereference.
 * NOTE(review): the character literal subtracted from *p below is empty in
 * this copy of the file (presumably a non-ASCII base letter lost in a
 * character-set conversion) - restore it from a pristine copy.
 */
static void
build_prefix_tree(int allow_he_hasheela){
	int i;
	const char *p;
	struct prefix_node **n;
	char **prefixes;
	int *masks;
	if(allow_he_hasheela){
		prefixes=prefixes_H;
		masks=masks_H;
	} else {
		prefixes=prefixes_noH;
		masks=masks_noH;
	}

	/* the prefix table ends with a null pointer entry */
	for(i=0; prefixes[i]; i++){
		p=prefixes[i];
		/* n always points at the link (the root pointer, or a slot in
		 * a parent's next[] array) that leads to the node for the
		 * part of the prefix consumed so far. */
		n=&prefix_tree;
		if(hspell_debug)
			fprintf(stderr,"prefix %s ",p);
		while(*p){
			/* create the node for the letters seen so far, if it
			 * does not exist yet, then descend by the next letter */
			if(!(*n))
				*n=(struct prefix_node *)
					calloc(1,sizeof(struct prefix_node));
			/* NOTE(review): *p is a plain char; whether the index
			 * is computed correctly depends on both operands
			 * having the same signedness - confirm. */
			n=& ((*n)->next[*p-'']);
			p++;
		}
		/* define the mask (making sure the node exists). */
		if(!*n)
			*n=(struct prefix_node *)
				calloc(1,sizeof(struct prefix_node));
		(*n)->mask=masks[i];

		if(hspell_debug)
			fprintf(stderr,"mask=%d\n",(*n)->mask);
	}
}

/* Release the subtree rooted at 'n': all children are freed (recursively)
 * before the node itself. A null 'n' is accepted and ignored.
 */
static void
free_prefix_tree(struct prefix_node *n)
{
	const size_t nchildren = sizeof(n->next)/sizeof(n->next[0]);
	size_t child;

	if(n){
		for(child=0; child<nchildren; child++)
			free_prefix_tree(n->next[child]);
		free(n);
	}
}


/* Check the spelling of 'word' against the dictionary 'dict'.
 *
 * Leading bytes outside the letter range tested below are skipped. Then the
 * global prefix tree is walked along the word: at every step the remainder
 * of the word is looked up in 'dict', and the word is accepted as soon as
 * the lookup() value shares a bit with the mask of the prefix consumed so
 * far.
 *
 * Returns 1 if the word is accepted - this includes a word with no letter
 * in the tested range at all, and a known prefix followed by nothing - and
 * 0 if it is not recognized.
 *
 * *preflen is reset to 0 and then incremented once for every byte consumed
 * before the walk stopped (skipped leading bytes, '"' marks and prefix
 * letters).
 *
 * The prefix tree must have been built first (hspell_init() does that): with
 * a null prefix_tree the loop body never runs and every word that contains
 * a letter in the range is rejected.
 *
 * NOTE(review): all the character literals compared with *w and w[...] are
 * empty in this copy of the file - presumably non-ASCII (ISO-8859-8 Hebrew)
 * letters lost in a character-set conversion. The comments below suggest
 * that the range tests cover the whole alphabet and that the single-letter
 * tests are for the letter waw, but restore them from a pristine copy
 * rather than guessing.
 */
int
hspell_check_word(struct dict_radix *dict, const char *word, int *preflen)
{
	int hashebrew;
	const char *w=word;
	struct prefix_node *n;
	*preflen = 0;

	/* ignore empty words: skip everything up to the first letter in the
	 * range; a word without any such letter counts as empty. */
	hashebrew=0;
	while(*w){
		if(*w>='' && *w<=''){
			hashebrew=1;
			break;
		}
		(*preflen)++;
		w++;
	}
	if(!hashebrew)
		return 1; /* ignore (accept) empty words */


	/* n is the tree node of the prefix consumed so far (the root stands
	 * for the empty prefix); w points at the rest of the word. */
	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words in which a
		 * prefix is attached to a quoted word (the examples that
		 * stood here were lost in this copy of the file).
		 * See the Academy's punctuation rules for an explanation of
		 * this rule (we probably don't support here everything they
		 * suggested; in particular I don't recognize a single quote
		 * as valid form of merchaot).
		 */
		if(*w=='"'){
			(*preflen)++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" below is the normal check.
		 *
		 * Note that when the first condition holds but the following
		 * letter does not match, no lookup is done at this position.
		 */
		if(n!=prefix_tree && *w=='' && w[-1]!=''){
			if(w[1]==''){
				if(w[2]!='' && (lookup(dict,w+1) & n->mask)){
					/* doubled letter: the word was found
					 * with one of the two dropped */
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					return 1;
				} else if(lookup(dict,w) & n->mask){
					/* the word was found as written */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					return 1;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) return 1; /* found word! */
		}

		/* try the next prefix: treat the current letter as one more
		 * prefix letter and descend in the tree. n becomes null when
		 * no known prefix continues this way, which ends the loop. */
		if(*w>='' && *w<=''){
			n=n->next[*w-''];
			(*preflen)++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		return 1;
	} else
		return 0; /* unrecognized (misspelled) word */
}

/* Enumerate the ways in which 'word' can be split into a known prefix and a
 * word found in 'dict'.
 *
 * The walk is the same as in hspell_check_word(), but instead of returning
 * at the first match, the callback 'enumf' is called for every match and
 * the walk goes on. The callback receives the whole word, a pointer to the
 * remainder that was looked up, the current prefix length counter and the
 * mask of the prefix node.
 *
 * Returns the number of times 'enumf' was called, or -1 if the word has no
 * letter in the tested range ("empty" word).
 *
 * This function copies, in a less than intelligent fashion, Nadav's code
 * from hspell_check_word. TODO: use the same code for both functions.
 *
 * NOTE(review): as in hspell_check_word(), the character literals compared
 * with *w and w[...] are empty in this copy of the file (presumably
 * ISO-8859-8 Hebrew letters lost in a character-set conversion) - restore
 * them from a pristine copy.
 */
int hspell_enum_splits(struct dict_radix *dict, const char *word,
	hspell_word_split_callback_func *enumf)
{
	int preflen=0, count=0;

	int hashebrew;
	const char *w=word;
	struct prefix_node *n;

	/* ignore empty words: skip everything up to the first letter in the
	 * range; a word without any such letter counts as empty. */
	hashebrew=0;
	while(*w){
		if(*w>='' && *w<=''){
			hashebrew=1;
			break;
		}
		preflen++;
		w++;
	}
	if(!hashebrew)
		return -1; /* ignore empty words */

	/* n is the tree node of the prefix consumed so far; w points at the
	 * rest of the word. */
	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"enum_splits looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words in which a
		 * prefix is attached to a quoted word (the examples that
		 * stood here were lost in this copy of the file).
		 * See the Academy's punctuation rules for an explanation of
		 * this rule (we probably don't support here everything they
		 * suggested; in particular I don't recognize a single quote
		 * as valid form of merchaot).
		 */
		if(*w=='"'){
			preflen++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" below is the normal check.
		 */
		if(n!=prefix_tree && *w=='' && w[-1]!=''){
			if(w[1]==''){
				if(w[2]!='' && (lookup(dict,w+1) & n->mask)){
					/* doubled letter: report the word
					 * without the first of the two.
					 * NOTE(review): w advances twice in
					 * this branch but preflen only once,
					 * so the two are out of step from
					 * here on - confirm this is intended. */
					w++;
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'']; w++;
					count++;
					continue;
				} else if(lookup(dict,w) & n->mask){
					/* the word was found as written */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'']; w++;
					count++;
					continue;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "enum_splits: tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) {
				/* report this split, then keep walking as if
				 * the letter were one more prefix letter.
				 * NOTE(review): unlike the "try the next
				 * prefix" step below, *w is not range-checked
				 * before it is used as an index here. */
				enumf(word, w, preflen++, n->mask);
				n=n->next[*w-'']; w++;
				count++;
				continue;
			} /* found word! */
		}

		/* try the next prefix: treat the current letter as one more
		 * prefix letter and descend in the tree. n becomes null when
		 * no known prefix continues this way, which ends the loop. */
		if(*w>='' && *w<=''){
			n=n->next[*w-''];
			preflen++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		enumf(word, w, preflen, n->mask);
		count++;
	} /* else
		return 0;  unrecognized (misspelled) word */
	if (hspell_debug) fprintf(stderr, "enum_splits found %d splits\n", count);
	return count;
}

/* In the past, we used to use snprintf for the splicing needed for
   hspell_trycorrect. But it turns out that snprintf, when given the %.*s
   format, counts locale "characters", and not bytes. When the locale was
   UTF8, this made it count wrong, despite us knowing here that we only
   deal with iso-8859-8. So let's implement this functionality on our own.

   This function splices together, into buf, the first s1len bytes of s1
   (or all of s1, if it is shorter than that), then the characters c1 and
   c2 (each one is skipped if it is 0) and finally the string s2 (skipped
   if it is a null pointer).

   buf has room for 'size' bytes. The result is always null-terminated and
   is silently truncated to size-1 bytes if it does not fit. If size is not
   positive (or buf is null) nothing is written at all. A negative s1len
   is treated as 0.
*/
static inline void splice(char *buf, int size, const char *s1, int s1len,
		     char c1, char c2, const char *s2)
{
	int len=0;
	int limit;

	/* without room for even the terminator there is nothing we may
	 * write (and size-1 below would become negative). */
	if(!buf || size<=0)
		return;

	/* copy at most s1len bytes of s1, never reading beyond its end and
	 * always leaving room for the terminator. A negative s1len makes
	 * limit negative, so nothing is copied. */
	limit = (s1len < size-1) ? s1len : size-1;
	while(len<limit && s1[len]){
		buf[len]=s1[len];
		len++;
	}

	/* the optional inserted characters */
	if(c1 && len<size-1)
		buf[len++]=c1;
	if(c2 && len<size-1)
		buf[len++]=c2;

	/* the tail, as much of it as fits */
	if(s2){
		while(*s2 && len<size-1)
			buf[len++]=*s2++;
	}
	buf[len]='\0';
}

/* Try to find corrections for the word 'w'.
 *
 * Candidate spellings are built in a local buffer by a series of simple
 * edits of 'w' (described before each step below). Every candidate that
 * hspell_check_word() accepts against 'dict' is passed to corlist_add()
 * together with the correction list 'cl'.
 *
 * The candidate buffer is 30 bytes long; splice() and snprintf() truncate
 * longer candidates to fit.
 *
 * NOTE(review): every non-ASCII character and string literal in this
 * function (the groups in similar[], the letters inserted, removed and
 * compared, and the case labels of the switch) is empty in this copy of the
 * file - presumably ISO-8859-8 Hebrew letters lost in a character-set
 * conversion. As it stands the code cannot compile; restore the literals
 * from a pristine copy rather than guessing them.
 */
void
hspell_trycorrect(struct dict_radix *dict, const char *w, struct corlist *cl)
{
	char buf[30];
	int i;
	int len=strlen(w), preflen;
	/* groups of letters that are tried as replacements for one another
	 * (see the "similarly sounding" step below) */
	static char *similar[] = {"", "", "", "", "", "",
				  "", "", ""};

/* Check the candidate currently in buf and add it to the correction list
 * if it is accepted. NOTE(review): the macro expands to a bare "if", so
 * using it directly before an "else" would capture that "else". */
#define TRYBUF if(hspell_check_word(dict, buf, &preflen)) corlist_add(cl, buf)
	/* try to add a missing em kri'a - yud or vav */
	for(i=1;i<len;i++){
		splice(buf,sizeof(buf),w,i,'',0,w+i);
		TRYBUF;
		splice(buf,sizeof(buf),w,i,'',0,w+i);
		TRYBUF;
	}
	/* try to remove an em kri'a - yud or vav */
	/* NOTE: in hspell.pl the loop was from i=0 to i<len... */
	for(i=1;i<len-1;i++){
		if(w[i]=='' || w[i]==''){
			splice(buf,sizeof(buf),w,i,0,0,w+i+1);
			TRYBUF;
		}
	}
	/* try to add or remove an aleph (is that useful?) */
	/* TODO: don't add an aleph next to yud or non-double vav,
	 * as it can't be an em kria there? */
	for(i=1;i<len;i++){
		splice(buf,sizeof(buf),w,i,'',0,w+i);
		TRYBUF;
	}
	for(i=1;i<len-1;i++){
		if(w[i]==''){
			splice(buf,sizeof(buf),w,i,0,0,w+i+1);
			TRYBUF;
		}
	}
	/* try to replace similarly sounding (for certain people) letters:
	 */
	for(i=0;i<len;i++){
		int group;
		char *g;
		for(group=0; group< (sizeof(similar)/sizeof(similar[0]));
				group++){
			/* search the group for the letter w[i]; the loop has
			 * an empty body, and the lone ';' on the next line is
			 * just a redundant empty statement. */
			for(g=similar[group];*g && *g!=w[i];g++);
				;
			if(*g){
				/* character in group - try the other ones
				 * in this group! */
				for(g=similar[group];*g;g++){
					if(*g==w[i]) continue;
					/* three ways to substitute: replace
					 * a pair of letters by *g, replace
					 * one letter by a pair, or replace
					 * one letter by *g. */
					if(i>0 && w[i]=='' && w[i+1]=='')
						splice(buf,sizeof(buf),w,i,*g,0,w+i+2);
					else if(*g=='')
						splice(buf,sizeof(buf),w,i,'','',w+i+1);
					else
						splice(buf,sizeof(buf),w,i,*g,0,w+i+1);
					TRYBUF;
				}
			}
		}
	}
	/* try to replace a non-final letter at the end of the word by its
	 * final form and vice versa (useful check for abbreviations) */
	/* NOTE(review): len is an int and sizeof(buf) a size_t, so the
	 * comparison below is done in unsigned arithmetic. Since len is
	 * smaller than sizeof(buf), strncpy() also copies the terminator. */
	if(len>0 && len<sizeof(buf)){
		strncpy(buf,w,sizeof(buf));
		switch(w[len-1]){
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
			case '': buf[len-1]=''; break;
		}
		/* only worth checking if the last letter really changed */
		if(buf[len-1]!=w[len-1]){ TRYBUF; }
	}
	/* try to make the word into an acronym (add " before last character) */
	if(len>=2){
		splice(buf,sizeof(buf),w,len-1,'"',w[len-1],0);
		TRYBUF;
	}
	/* try to make the word into an abbreviation (add ' at the end) */
	snprintf(buf,sizeof(buf), "%s'",w);
	TRYBUF;
}

/* hspell_init() reads the dictionary and initializes the necessary data
   structures. The newly allocated dictionary is stored in *dictp.

   'flags' is a bitwise OR of HSPELL_OPT_* values. HSPELL_OPT_HE_SHEELA
   selects which prefix tables the prefix tree is built from, and, when
   compiled with USE_LINGINFO, HSPELL_OPT_LINGUISTICS additionally
   initializes the linguistic information.

   hspell_init() returns 0 on success, or negative numbers on errors:
   -1: cannot read dictionary (or cannot initialize the linguistic
       information, when that was requested).

   On failure everything that was set up here is released again, so the
   caller must neither use *dictp nor call hspell_uninit() afterwards.
*/
int
hspell_init(struct dict_radix **dictp, int flags){
	int ret;
	ret=load_data(dictp);
	if(ret<0) return ret;
	build_prefix_tree(flags & HSPELL_OPT_HE_SHEELA);
#ifdef USE_LINGINFO
	if (flags & HSPELL_OPT_LINGUISTICS) {
		if (!linginfo_init(hspell_dictionary)) {
			/* Undo the steps above. Otherwise the dictionary and
			 * the prefix tree would leak, as the caller is only
			 * told that initialization failed. */
			delete_dict_radix(*dictp);
			*dictp=NULL;
			free_prefix_tree(prefix_tree);
			prefix_tree=0;
			return -1;
		}
	}
#endif
	return 0;
}

/* TODO: hspell_init should use a new "hspell_context" structure, not
   dict_radix. Because we might want to add more things like user dictionary.
   The prefix tree should also sit in the hspell_context, instead of
   being a global variable: the current mishmash of globals and non-globals
   is ugly.
   Linginfo's global variables (see linginfo_init and linginfo_free)
   should also be in this context.
*/

/* hspell_uninit() undoes the effects of hspell_init, freeing memory that
   was allocated during initialization. The dict pointer passed is no
   longer valid after this call, and should not be used (i.e., hspell_uninit()
   has similar semantics to free()).
*/
void
hspell_uninit(struct dict_radix *dict)
{
	delete_dict_radix(dict);
	/* free prefix tree. Too bad this is a global variable, and not
	   something in a "context" given to us as a parameter. */
	free_prefix_tree(prefix_tree);
	prefix_tree=0;
#ifdef USE_LINGINFO
	linginfo_free();
#endif
}