File: main.c

package info (click to toggle)
thescoder 0.6-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k, lenny, sarge
  • size: 104 kB
  • ctags: 34
  • sloc: ansic: 311; makefile: 9
file content (299 lines) | stat: -rw-r--r-- 8,916 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/*
 *  main.c - Thesaurus coder for OpenOffice.org
 *
 *  Copyright (C) 2003, 2005 Giuseppe Modugno
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* CHANGES
 * 0.6
 * - Added "-sep" option
 */

/*  thescoder is a program that converts an input thesaurus plain-text file
 *  into two files, index file (.idx extension) and data file (.dat extension).
 *  
 *  Plain-text input file must have the following format:
 *  <characters accepted>
 *  <word1>;<syn11>;<syn12>...\n
 *  <word2>;<syn21>;<syn22>...\n
 *  ...
 *  <wordN>;<synN1>;<synN2>...\n
 *  The first line lists the accepted characters: lowercase and uppercase
 *  letters, dash ('-'), space (' ') and so on.
 *  Every following line holds a word and all its synonyms, separated by
 *  semicolons (or by the character given with the "-sep" option). Spaces are
 *  optional before and after the separator character.
 *  A word or synonym is a sequence of the characters listed on the first
 *  line. Spaces at the beginning and end of words are ignored by thescoder.
 *  Note that the words need not be alphabetically sorted.
 *
 *  Index file (.idx) generated is a plain-text file with the following format:
 *  <word1>;<offset1>
 *  <word2>;<offset2>
 *  ...
 *  <wordN>;<offsetN>
 *  It is a **sorted** list of all the words and synonyms. The synonym list
 *  of every word is stored in the data file, so the index file gives, for
 *  every word, the offset in the data file where its synonym list is.
 *
 *  Data file is a binary file with the following format:
 *  <n1><idx_11><idx_12>...<idx_1n1>
 *  <n2><idx_21><idx_22>...<idx_2n2>
 *  ...
 *  <nM><idx_M1><idx_M2>...<idx_MnM>
 *  where <ni> is the number of synonyms of the i-th word (as in index file),
 *        <idx_ij> is the index (as in index file) of the j-th synonym of the
 *        i-th word.
 *  Every number is stored as a 16-bit unsigned integer in Big Endian format.
 *  OpenOffice.org reads these files treating numbers as Big Endian,
 *  independently of the platform on which it is running. In this way, the
 *  same .idx and .dat files can be distributed for all platforms without
 *  conversion.
 */
  
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "wordtree.h"

#define RELEASE_MAJOR 0
#define RELEASE_MINOR 6

#define VERBOSE
/* define DEBUG*/

#define APPNAME          "thescoder"
#define WORD_MAXLENGTH   128        /* Size of the buffer holding a word or syn. */
#define SYN_SEP_DEFAULT  ';'        /* Default syn separator character. */
#define MAX_WORDCHARS    128        /* Maximum number of characters accepted. */

/* Useful macros. */
/* Two-step stringification, so that a macro argument is expanded first. */
#define make_string(x)			#x
#define def2string(x)			make_string(x)

/* Print the usage message on stderr.  Wrapped in do/while(0) so that
 * "syntax_error();" behaves as one statement everywhere, including an
 * unbraced if/else. */
#define syntax_error()      do {\
                              fprintf( stderr, APPNAME ": Syntax error\n" );\
                              fprintf( stderr, "usage: " APPNAME " [-sep <sep char>] <input file> <output prefix file>\n" );\
                            } while( 0 )

/* Internal function prototypes. */
int read_word( FILE *f, char *word );
int iswordchar( char c );

/* Global variables. */
/* NUL-terminated list of the characters accepted inside words.  It is read
 * with a "%s" conversion limited to MAX_WORDCHARS characters, which stores
 * up to MAX_WORDCHARS characters plus the terminator: hence the extra byte. */
char wordchar[MAX_WORDCHARS+1];


int
main( int argc, char *argv[] )
{
  FILE *in,*idx,*dat;	/* Input and output files. */
  WTelem *wordtree = NULL, /* Wordtree home address. */
    *syn,		/* Synonimous pointer (tree element). */
    *word;		/* Word pointer (tree element). */
  char word_str[WORD_MAXLENGTH]; /* Buffer for word. */
  int sep;		/* Synonimous separator. */
  char *filename;		/* String for output filename. */
  char *out_prefix;	/* Output file name (without extension). */
  unsigned int nw=0;	/* Word counter. */
  char syn_sep = SYN_SEP_DEFAULT;	/* Syn separator character. */

  /* Check for the optional -sep option. */
  if( argc>1 )
    if( !strcmp( *(argv+1), "-sep" ) ) {
      if( argc>2 ) {
	argc -= 2;
	argv += 2;
        syn_sep = **argv;
      } else {
        /* -sep option without parameter. */
        syntax_error();
        exit(1);
      }
    }

  /* Read input file name... */
  if( !--argc ) {
    syntax_error();
    exit(1);
  }
  /* ...and open it. */
  if( !strcmp("-v",*++argv) ) {
    fprintf( stderr, APPNAME " " def2string(RELEASE_MAJOR) "." def2string(RELEASE_MINOR) "\n" );
    exit(0);
  }

  if( (in=fopen(*argv,"rt"))==NULL ) {
    fprintf( stderr, APPNAME ": Error opening %s file for reading\n", *argv );
    exit(1);
  }
  
  /* Read output prefix file name (second parameter). */
  if( !--argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  out_prefix = *++argv;
  
  /* If more arguments there are, it's a syntax error. */
  if( --argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  
  
  
#ifdef VERBOSE
  fprintf( stderr, "Reading words...\n" );
#endif
  /* Read word characters. */
  if( fscanf(in, "%" def2string(MAX_WORDCHARS) "s\n",wordchar)!=1 ) {
    fprintf( stderr, APPNAME ": error reading word characters line.\n" );
    fclose(in);
    exit(1);
  }
  
  /* Now we can read input file and add words to the tree. */
  while( (sep=read_word(in,word_str))!=-1 ) {
    /* Increment number of words. */
    ++nw;
#ifdef DEBUG
    fprintf( stderr, "Word %4u: .%s. read\n", nw, word_str );
#endif
    /* Add word to the tree (as a word). */
    if( (word=wordtree_add( word_str, &wordtree ))==NULL ) {
      fprintf( stderr, APPNAME ": out of memory\n" );
      fclose( in );
      exit(1);
    }
    if( word->isword )
      fprintf( stderr, "Warning: Two or more lines for the word .%s.\n", word->word );
    else
      word->isword = 1;
    
    /* Add synonimous to the word. */
    while( (char)sep==syn_sep ) {
      sep=read_word(in,word_str);
#ifdef DEBUG
      fprintf( stderr, "Adding synonimous %s to word %s\n", word_str, word->word );
#endif
      /* Add synonimous to the tree. */
      if( (syn=wordtree_add( word_str, &wordtree ))==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
      /* Add synonimous to the word synonimous list. */
      if( synlist_add(word,syn)==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
    }
  }
#ifdef VERBOSE
  fprintf( stderr, "%u words read from input file\n", nw );
#endif
  /* Close input file. */
  fclose( in );
  
  
  /* Open output files. */
#ifdef VERBOSE
  fprintf( stderr, "Writing output files...\n" );
#endif
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    exit(1);
  }
  sprintf( filename, "%s.idx", out_prefix );
  if( (idx=fopen(filename,"wt"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    exit(1);
  }
  free(filename);
  
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    fclose(idx);
    exit(1);
  }
  sprintf( filename, "%s.dat", out_prefix );
  if( (dat=fopen(filename,"wb"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    fclose(idx);
    exit(1);
  }
  free(filename);
  
  /* Write output files. */
  wordtree_output(wordtree, idx, dat);
  
  /* Exit without errors. */
  wordtree_free(wordtree);
  fclose( idx );
  fclose( dat );
  return(0);
}



/* Size of the buffer callers hand to read_word(); main.c defines it near the
 * top of the file, this is only a fallback that keeps the routine
 * self-contained. */
#ifndef WORD_MAXLENGTH
#define WORD_MAXLENGTH 128
#endif

/*
 * Read from file f the next word and store it, NUL-terminated, in word.
 *
 * Leading whitespace (line breaks included) is skipped.  The word is made of
 * the characters accepted by iswordchar() plus the space; trailing spaces
 * are removed.  The buffer must have room for WORD_MAXLENGTH bytes: a longer
 * word is read up to its end, but only its first WORD_MAXLENGTH-1 characters
 * are kept and a warning is printed on stderr.
 *
 * Returns -1 if there isn't another word to read (word is then the empty
 * string), otherwise the character that ended the word.  When the word is
 * ended by the end of the file, '\n' is returned: EOF can't be told apart
 * from the "no word" value.
 */
int
read_word( FILE *f, char *word )
{
  char *start = word;				/* Beginning of the buffer. */
  char *limit = word + WORD_MAXLENGTH - 1;	/* Place of the last NUL. */
  int truncated = 0;
  int c;

  /* Skip initial spaces. */
  do {
    c = fgetc(f);
  } while( c!=EOF && isspace(c) );

  /* Read word characters.  EOF is checked before calling iswordchar():
   * narrowed to char it could match an accepted character. */
  while( c!=EOF && (iswordchar(c) || c==' ') ) {
    if( word<limit )
      *word++ = (char)c;
    else
      truncated = 1;
    c = fgetc(f);
  }

  /* Nothing read: end of file or a character that can't begin a word. */
  if( word==start ) {
    *word = '\0';
    return(-1);
  }

  /* Cut trailing spaces, never going before the start of the buffer. */
  while( word>start && word[-1]==' ' )
    --word;
  /* Terminate the word. */
  *word = '\0';

  if( truncated )
    fprintf( stderr, "Warning: word .%s. is too long, truncated to %d characters\n",
             start, WORD_MAXLENGTH-1 );

  return( c==EOF ? '\n' : c );
}


/*
 * Return 1 if c is in wordchar[] array of characters, 0 otherwise.
 *
 * The NUL character is never accepted: it is the terminator of the list,
 * not one of its elements.
 */
int
iswordchar( char c )
{
  /* strchr() would find the terminator when asked for '\0'. */
  if( c=='\0' )
    return( 0 );

  return( strchr( wordchar, c )!=NULL ? 1 : 0 );
}