File: main.c

package info (click to toggle)
thescoder 0.6-1
links: PTS
area: main
in suites: etch, etch-m68k, lenny, sarge
size: 104 kB
ctags: 34
sloc: ansic: 311; makefile: 9
file content (299 lines) | stat: -rw-r--r-- 8,916 bytes
/*
 *  main.c - Thesaurus coder for OpenOffice.org
 *
 *  Copyright (C) 2003, 2005 Giuseppe Modugno
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* CHANGES
 * 0.6
 * - Added "-sep" option
 */

/*  thescoder is a program that converts an input thesaurus plain-text file
 *  into two files, index file (.idx extension) and data file (.dat extension).
 *  
 *  Plain-text input file must have the following format:
 *  <characters accepted>
 *  <word1>;<syn11>;<syn12>...\n
 *  <word2>;<syn21>;<syn22>...\n
 *  ...
 *  <wordN>;<synN1>;<synN2>...\n
 *  The first line lists the accepted characters: lowercase and uppercase
 *  letters, dash ('-'), space (' ') and so on.
 *  Every following line holds a word and all its synonyms, separated by
 *  semicolons. Spaces are optional before and after the semicolon
 *  word-separator character.
 *  A word or synonym is a sequence of the characters listed on the first
 *  line. Spaces at the beginning and end of words are ignored by thescoder.
 *  Note that the words do not need to be sorted alphabetically.
 *
 *  Index file (.idx) generated is a plain-text file with the following format:
 *  <word1>;<offset1>
 *  <word2>;<offset2>
 *  ...
 *  <wordN>;<offsetN>
 *  It's a **sorted** list of all the words and synonyms. The synonym list
 *  for every word is stored in the data file. So, in the index file, every
 *  word comes with the offset in the data file where its synonym list is.
 *
 *  Data file is a binary file with the following format:
 *  <n1><idx_11><idx_12>...<idx_1n1>
 *  <n2><idx_21><idx_22>...<idx_2n2>
 *  ...
 *  <nM><idx_M1><idx_M2>...<idx_MnM>
 *  where <ni> is the number of synonyms of the i-th word (as in index file),
 *        <idx_ij> is the index (as in index file) of the j-th synonym of the i-th word.
 *  Every number is stored as a 16-bit unsigned integer in big-endian format.
 *  OpenOffice.org reads these files considering numbers in big-endian format
 *  independently of the platform on which it is running. In this way, the same
 *  .idx and .dat files can be distributed for all the platforms without
 *  conversion.
 */
  
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "wordtree.h"

/* Program version, printed by the "-v" option as MAJOR.MINOR. */
#define RELEASE_MAJOR 0
#define RELEASE_MINOR 6

/* When VERBOSE is defined, progress messages are printed on stderr. */
#define VERBOSE
/* When DEBUG is defined, every word and synonym read is traced on stderr.
 * Disabled by default: #define DEBUG
 */

#define APPNAME          "thescoder"
#define WORD_MAXLENGTH   128        /* Size of the word/synonym buffer. */
#define SYN_SEP_DEFAULT  ';'        /* Default syn separator character. */
#define MAX_WORDCHARS    128        /* Maximum number of characters accepted. */

/* Useful macros. */
/* make_string() turns its argument into a string literal as written;
 * def2string() adds one level of expansion, so that a macro argument is
 * replaced by its value before being turned into a string. */
#define make_string(x)			#x
#define def2string(x)			make_string(x)

/* Print the syntax error message and the usage line on stderr.
 * Wrapped in do { } while( 0 ) so that "syntax_error();" behaves as a
 * single statement everywhere, including before an "else". */
#define syntax_error()      do {\
                              fprintf( stderr, APPNAME ": Syntax error\n" );\
                              fprintf( stderr, "usage: " APPNAME " [-sep <sep char>] <input file> <output prefix file>\n" );\
                            } while( 0 )

/* Internal function prototypes. */
int read_word( FILE *f, char *word );
int iswordchar( char c );

/* Global variables. */
/* Accepted word characters, kept as a NUL-terminated string. It is filled
 * from the first line of the input file and scanned by iswordchar(). */
char wordchar[MAX_WORDCHARS];


int
main( int argc, char *argv[] )
{
  FILE *in,*idx,*dat;	/* Input and output files. */
  WTelem *wordtree = NULL, /* Wordtree home address. */
    *syn,		/* Synonimous pointer (tree element). */
    *word;		/* Word pointer (tree element). */
  char word_str[WORD_MAXLENGTH]; /* Buffer for word. */
  int sep;		/* Synonimous separator. */
  char *filename;		/* String for output filename. */
  char *out_prefix;	/* Output file name (without extension). */
  unsigned int nw=0;	/* Word counter. */
  char syn_sep = SYN_SEP_DEFAULT;	/* Syn separator character. */

  /* Check for the optional -sep option. */
  if( argc>1 )
    if( !strcmp( *(argv+1), "-sep" ) ) {
      if( argc>2 ) {
	argc -= 2;
	argv += 2;
        syn_sep = **argv;
      } else {
        /* -sep option without parameter. */
        syntax_error();
        exit(1);
      }
    }

  /* Read input file name... */
  if( !--argc ) {
    syntax_error();
    exit(1);
  }
  /* ...and open it. */
  if( !strcmp("-v",*++argv) ) {
    fprintf( stderr, APPNAME " " def2string(RELEASE_MAJOR) "." def2string(RELEASE_MINOR) "\n" );
    exit(0);
  }

  if( (in=fopen(*argv,"rt"))==NULL ) {
    fprintf( stderr, APPNAME ": Error opening %s file for reading\n", *argv );
    exit(1);
  }
  
  /* Read output prefix file name (second parameter). */
  if( !--argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  out_prefix = *++argv;
  
  /* If more arguments there are, it's a syntax error. */
  if( --argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  
  
  
#ifdef VERBOSE
  fprintf( stderr, "Reading words...\n" );
#endif
  /* Read word characters. */
  if( fscanf(in, "%" def2string(MAX_WORDCHARS) "s\n",wordchar)!=1 ) {
    fprintf( stderr, APPNAME ": error reading word characters line.\n" );
    fclose(in);
    exit(1);
  }
  
  /* Now we can read input file and add words to the tree. */
  while( (sep=read_word(in,word_str))!=-1 ) {
    /* Increment number of words. */
    ++nw;
#ifdef DEBUG
    fprintf( stderr, "Word %4u: .%s. read\n", nw, word_str );
#endif
    /* Add word to the tree (as a word). */
    if( (word=wordtree_add( word_str, &wordtree ))==NULL ) {
      fprintf( stderr, APPNAME ": out of memory\n" );
      fclose( in );
      exit(1);
    }
    if( word->isword )
      fprintf( stderr, "Warning: Two or more lines for the word .%s.\n", word->word );
    else
      word->isword = 1;
    
    /* Add synonimous to the word. */
    while( (char)sep==syn_sep ) {
      sep=read_word(in,word_str);
#ifdef DEBUG
      fprintf( stderr, "Adding synonimous %s to word %s\n", word_str, word->word );
#endif
      /* Add synonimous to the tree. */
      if( (syn=wordtree_add( word_str, &wordtree ))==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
      /* Add synonimous to the word synonimous list. */
      if( synlist_add(word,syn)==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
    }
  }
#ifdef VERBOSE
  fprintf( stderr, "%u words read from input file\n", nw );
#endif
  /* Close input file. */
  fclose( in );
  
  
  /* Open output files. */
#ifdef VERBOSE
  fprintf( stderr, "Writing output files...\n" );
#endif
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    exit(1);
  }
  sprintf( filename, "%s.idx", out_prefix );
  if( (idx=fopen(filename,"wt"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    exit(1);
  }
  free(filename);
  
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    fclose(idx);
    exit(1);
  }
  sprintf( filename, "%s.dat", out_prefix );
  if( (dat=fopen(filename,"wb"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    fclose(idx);
    exit(1);
  }
  free(filename);
  
  /* Write output files. */
  wordtree_output(wordtree, idx, dat);
  
  /* Exit without errors. */
  wordtree_free(wordtree);
  fclose( idx );
  fclose( dat );
  return(0);
}



/* read_word() has no size parameter: it assumes the caller's buffer holds
 * WORD_MAXLENGTH bytes, as word_str[] in main() does. The fallback below
 * only matters if this function is built without the file-level define. */
#ifndef WORD_MAXLENGTH
#define WORD_MAXLENGTH 128
#endif

/*
 * Read from file f the next word and store it, NUL-terminated, in word.
 *
 * Leading whitespace (newlines included) is skipped; the word is made of
 * characters accepted by iswordchar() plus inner spaces; trailing spaces
 * are removed. At most WORD_MAXLENGTH-1 characters are stored: the rest of
 * a longer word is read and dropped, so the buffer is never overrun and
 * the parsing stays aligned on the next separator.
 *
 * Return -1 if there isn't another word to read (word is then the empty
 * string), otherwise the character that ended the word. When the word is
 * ended by the end of the file, '\n' is returned instead of EOF, because
 * EOF could not be told apart from the "no word" value.
 */
int
read_word( FILE *f, char *word )
{
  char *start = word;				/* First byte of the buffer. */
  char *limit = word + WORD_MAXLENGTH - 1;	/* Room left for the NUL. */
  int c;
  int inword=0;

  /* Skip initial spaces. */
  do {
    c = fgetc(f);
  } while( c!=EOF && isspace(c) );

  /* Read word characters. EOF is tested first: it must never be handed
   * to iswordchar(), where it would be narrowed to a plain char. */
  while( c!=EOF && (iswordchar(c) || c==' ') ) {
    inword = 1;
    if( word<limit )
      *word++ = c;
    c = fgetc(f);
  }

  /* Cut trailing spaces, without stepping before the buffer. */
  while( word>start && word[-1]==' ' )
    --word;
  /* Terminate the word. */
  *word = '\0';

  if( !inword )
    return(-1);
  return( c==EOF ? '\n' : c );
}


/*
 * Return 1 if c is one of the characters stored in the wordchar[] string,
 * 0 otherwise.
 *
 * The NUL character is never accepted: it is the terminator of wordchar[],
 * not one of the listed characters, and strchr() would otherwise match it.
 */
int
iswordchar( char c )
{
  if( c=='\0' )
    return( 0 );

  return( strchr(wordchar,c)!=NULL ? 1 : 0 );
}