1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
|
/*
* main.c - Thesaurus coder for OpenOffice.org
*
* Copyright (C) 2003, 2005 Giuseppe Modugno
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* CHANGES
* 0.6
* - Added "-sep" option
*/
/* thescoder is a program that converts an input thesaurus plain-text file
* into two files, index file (.idx extension) and data file (.dat extension).
*
* Plain-text input file must have the following format:
* <characters accepted>
* <word1>;<syn11>;<syn12>...\n
* <word2>;<syn21>;<syn22>...\n
* ...
* <wordN>;<synN1>;<synN2>...\n
* The first line lists the accepted characters: lowercase
* and uppercase letters, dash ('-'), space (' ') and so on.
* Every following line holds a word and all its synonyms, separated by a
* semicolon. Spaces are optional before and after the semicolon
* word-separator character.
* A word or synonym is a sequence of the characters listed on the first
* line. Spaces at the beginning and end of words are ignored by thescoder.
* Note that the words do not need to be alphabetically sorted.
*
* Index file (.idx) generated is a plain-text file with the following format:
* <word1>;<offset1>
* <word2>;<offset2>
* ...
* <wordN>;<offsetN>
* It's a **sorted** list of all the words and synonyms. The synonym list
* of every word is stored in the data file. So, in the index file, every word
* is followed by the offset in the data file where its synonym list is stored.
*
* Data file is a binary file with the following format:
* <n1><idx_11><idx_12>...<idx_1n1>
* <n2><idx_21><idx_22>...<idx_2n2>
* ...
* <nM><idx_M1><idx_M2>...<idx_MnM>
* where <ni> is the number of synonyms of the i-th word (as in the index file),
* <idx_ij> is the index (as in the index file) of the j-th synonym of the i-th word.
* Every number is stored as a 16-bit unsigned integer in big-endian format.
* OpenOffice.org reads these files treating numbers as big-endian,
* independently of the platform on which it is running. In this way, the same
* .idx and .dat files can be distributed for all platforms without
* conversion.
*/
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "wordtree.h"
/* Program version, printed by the -v option as "<major>.<minor>". */
#define RELEASE_MAJOR 0
#define RELEASE_MINOR 6
/* When defined, progress messages are printed on stderr. */
#define VERBOSE
/* Define DEBUG to trace on stderr every word and synonym read. */
/* define DEBUG*/
#define APPNAME "thescoder"
#define WORD_MAXLENGTH 128 /* Size of the word/synonym buffer (NUL included). */
#define SYN_SEP_DEFAULT ';' /* Default synonym separator character. */
#define MAX_WORDCHARS 128 /* Maximum number of characters accepted. */
/* Useful macros. */
/* def2string(x) expands the macro x first and then turns it into a string. */
#define make_string(x) #x
#define def2string(x) make_string(x)
/* Print the usage message on stderr.
 * Wrapped in do { } while( 0 ) so that "syntax_error();" is one statement
 * and can be used safely in an unbraced if/else. */
#define syntax_error() \
    do { \
        fprintf( stderr, APPNAME ": Syntax error\n" ); \
        fprintf( stderr, "usage: " APPNAME " [-sep <sep char>] <input file> <output prefix file>\n" ); \
    } while( 0 )
/* Internal function prototypes. */
int read_word( FILE *f, char *word );
int iswordchar( char c );
/* Global variables. */
/* Accepted word characters, stored as a NUL-terminated string.
 * One extra element keeps room for the terminator when all the
 * MAX_WORDCHARS characters are read from the input file. */
char wordchar[MAX_WORDCHARS + 1];
int
main( int argc, char *argv[] )
{
FILE *in,*idx,*dat; /* Input and output files. */
WTelem *wordtree = NULL, /* Wordtree home address. */
*syn, /* Synonimous pointer (tree element). */
*word; /* Word pointer (tree element). */
char word_str[WORD_MAXLENGTH]; /* Buffer for word. */
int sep; /* Synonimous separator. */
char *filename; /* String for output filename. */
char *out_prefix; /* Output file name (without extension). */
unsigned int nw=0; /* Word counter. */
char syn_sep = SYN_SEP_DEFAULT; /* Syn separator character. */
/* Check for the optional -sep option. */
if( argc>1 )
if( !strcmp( *(argv+1), "-sep" ) ) {
if( argc>2 ) {
argc -= 2;
argv += 2;
syn_sep = **argv;
} else {
/* -sep option without parameter. */
syntax_error();
exit(1);
}
}
/* Read input file name... */
if( !--argc ) {
syntax_error();
exit(1);
}
/* ...and open it. */
if( !strcmp("-v",*++argv) ) {
fprintf( stderr, APPNAME " " def2string(RELEASE_MAJOR) "." def2string(RELEASE_MINOR) "\n" );
exit(0);
}
if( (in=fopen(*argv,"rt"))==NULL ) {
fprintf( stderr, APPNAME ": Error opening %s file for reading\n", *argv );
exit(1);
}
/* Read output prefix file name (second parameter). */
if( !--argc ) {
syntax_error();
fclose(in);
exit(1);
}
out_prefix = *++argv;
/* If more arguments there are, it's a syntax error. */
if( --argc ) {
syntax_error();
fclose(in);
exit(1);
}
#ifdef VERBOSE
fprintf( stderr, "Reading words...\n" );
#endif
/* Read word characters. */
if( fscanf(in, "%" def2string(MAX_WORDCHARS) "s\n",wordchar)!=1 ) {
fprintf( stderr, APPNAME ": error reading word characters line.\n" );
fclose(in);
exit(1);
}
/* Now we can read input file and add words to the tree. */
while( (sep=read_word(in,word_str))!=-1 ) {
/* Increment number of words. */
++nw;
#ifdef DEBUG
fprintf( stderr, "Word %4u: .%s. read\n", nw, word_str );
#endif
/* Add word to the tree (as a word). */
if( (word=wordtree_add( word_str, &wordtree ))==NULL ) {
fprintf( stderr, APPNAME ": out of memory\n" );
fclose( in );
exit(1);
}
if( word->isword )
fprintf( stderr, "Warning: Two or more lines for the word .%s.\n", word->word );
else
word->isword = 1;
/* Add synonimous to the word. */
while( (char)sep==syn_sep ) {
sep=read_word(in,word_str);
#ifdef DEBUG
fprintf( stderr, "Adding synonimous %s to word %s\n", word_str, word->word );
#endif
/* Add synonimous to the tree. */
if( (syn=wordtree_add( word_str, &wordtree ))==NULL ) {
fprintf( stderr, APPNAME ": out of memory\n" );
fclose(in);
exit(1);
}
/* Add synonimous to the word synonimous list. */
if( synlist_add(word,syn)==NULL ) {
fprintf( stderr, APPNAME ": out of memory\n" );
fclose(in);
exit(1);
}
}
}
#ifdef VERBOSE
fprintf( stderr, "%u words read from input file\n", nw );
#endif
/* Close input file. */
fclose( in );
/* Open output files. */
#ifdef VERBOSE
fprintf( stderr, "Writing output files...\n" );
#endif
if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
fprintf( stderr, APPNAME ": out of memory\n" );
exit(1);
}
sprintf( filename, "%s.idx", out_prefix );
if( (idx=fopen(filename,"wt"))==NULL ) {
fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
free(filename);
exit(1);
}
free(filename);
if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
fprintf( stderr, APPNAME ": out of memory\n" );
fclose(idx);
exit(1);
}
sprintf( filename, "%s.dat", out_prefix );
if( (dat=fopen(filename,"wb"))==NULL ) {
fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
free(filename);
fclose(idx);
exit(1);
}
free(filename);
/* Write output files. */
wordtree_output(wordtree, idx, dat);
/* Exit without errors. */
wordtree_free(wordtree);
fclose( idx );
fclose( dat );
return(0);
}
/* Size of the buffer passed to read_word(). Normally defined at the top of
 * this file; the fallback only keeps this function self-contained. */
#ifndef WORD_MAXLENGTH
#define WORD_MAXLENGTH 128
#endif

/* Read from file f the next word and store it, NUL-terminated, in the
 * word buffer, which must be at least WORD_MAXLENGTH bytes long.
 *
 * Leading whitespace (newlines included) is skipped. The word is made of
 * characters accepted by iswordchar() and of spaces; trailing spaces are
 * cut. A word that does not fit the buffer is truncated (with a warning on
 * stderr), but it is still read completely from the file.
 *
 * Return -1 if there isn't another word to read (word is set to the empty
 * string), otherwise the character that ended the word. When the word is
 * ended by the end of file, '\n' is returned, so that the last word of a
 * file without a final newline isn't mistaken for "no word". */
int
read_word( FILE *f, char *word )
{
    char *const start = word;                        /* Beginning of the buffer. */
    char *const limit = word + WORD_MAXLENGTH - 1;   /* Keep room for the NUL. */
    int truncated = 0;
    int c;

    /* Skip initial spaces. */
    while( isspace( (c=fgetc(f)) ) )
        ;

    /* Read word characters. EOF is tested before the conversion to char,
     * otherwise it could be taken for an accepted character; a NUL byte
     * can never be part of a word. */
    while( c!=EOF && c!='\0' && (c==' ' || iswordchar( (char)c )) ) {
        if( word<limit )
            *word++ = (char)c;
        else
            truncated = 1;
        c = fgetc(f);
    }

    /* Nothing stored: there is no word here. */
    if( word==start ) {
        *word = '\0';
        return( -1 );
    }

    /* Cut trailing spaces, without going before the buffer start. */
    while( word>start && word[-1]==' ' )
        --word;

    /* Terminate the word. */
    *word = '\0';

    if( truncated )
        fprintf( stderr, "Warning: Word .%s. truncated to %d characters\n",
                 start, WORD_MAXLENGTH-1 );

    return( c==EOF ? '\n' : c );
}
/* Return 1 if c is one of the accepted characters stored in the
 * NUL-terminated wordchar[] string, 0 otherwise.
 *
 * The NUL character is never accepted: it is the terminator of
 * wordchar[], not one of its characters (strchr() alone would find it). */
int
iswordchar( char c )
{
    if( c=='\0' )
        return( 0 );
    return( strchr( wordchar, c )!=NULL ? 1 : 0 );
}
|