File: makeconcfile.c

package info (click to toggle)
bible-kjv 4.30
  • links: PTS
  • area: main
  • in suites: buster
  • size: 4,748 kB
  • sloc: ansic: 3,569; makefile: 344; sh: 233; perl: 35
file content (404 lines) | stat: -rw-r--r-- 12,038 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* -*-C-*-
*******************************************************************************
*
* File:         makeconcfile.c
* RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcfile.c,v 2.7 2005/01/23 11:27:00 matthew Exp $
* Description:  Generate Concordance Data File
* Author:       Chip Chapin
* Created:      Thu Dec 17 11:28:20 1992
* Modified:     Mon Apr 26 11:14:38 1993 (Chip Chapin) chip@hpclbis
* Language:     C
* Package:      Text Storage Library (Bible Retrieval System)
* Status:       Experimental (Do Not Distribute)
*
*******************************************************************************
*
* Revisions:
*
* Thu Jan  7 12:07:05 1993 (Chip Chapin) chip@hpclbis
*  Revised fileheader.
*  Implemented cumulative index entries to save space (24kbytes for KJV).
* Wed Jan  6 10:08:12 1993 (Chip Chapin) chip@hpclbis
*  Began implementation of list compression.
* Wed Dec 23 13:43:11 1992 (Chip Chapin) chip@hpclbis
*  Unlink tmp files when done.
*******************************************************************************
* $Log: makeconcfile.c,v $
* Revision 2.7  2005/01/23 11:27:00  matthew
* include more standard header files
*
* Revision 2.6  2005/01/22 18:07:28  matthew
* prototype main properly
*
* Revision 2.5  2005/01/22 17:17:54  matthew
* cast return value of sizeof
*
* Revision 2.4  2005/01/22 16:28:28  matthew
* initialise cmpnum.
*
* Revision 2.3  2005/01/22 16:27:34  matthew
* sort out function prototyping, and include util.h
*
* Revision 2.2  2003/07/26 10:02:53  matthew
* fprintf("\0") is not very useful
*
* Revision 2.1  2003/02/01 02:39:53  matthew
* make main int main ..., and accordingly return 0 at the end of it, if
* all is well
*
* Revision 2.0  2003/01/08 15:29:52  matthew
* versions collected from the net
*
 * Revision 1.5  93/04/26  11:18:17  11:18:17  chip (Chip Chapin)
 * Release 4.00
 * Public release of portable datafile version.
 * 
 * Revision 1.4  93/04/23  13:08:07  13:08:07  chip (Chip Chapin)
 * PORTABILITY RELEASE
 * This version supports portable data files, usable on machines with
 * differing native byte-orders.
 * Also, this version compiles and runs on non-HPUX systems.  It has been
 * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
 * respectively.  Note that the data file format has rolled again.
 * 
 * Revision 1.3  93/01/07  12:18:06  12:18:06  chip (Chip Chapin)
 * Release 3.01: Greatly improved compression of concordance data file.
 * 
*
*******************************************************************************
*/

#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include "tsl.h"
#include "util.h"

#define FALSE	(0)
#define TRUE	(1)

#define INDEXTMPF	"/tmp/mci"
#define DATATMPF	"/tmp/mcd"

static void outerr(int n);
static void print_header(void);

char *myname;			/* Program Name */

/*
  Structure of Input Data ...

    Input data is a sorted ASCII file.
    Each line is a record.
    Each record is a variable-length list of fields, separated by blanks,
    as follows:

       <word> <ref> {<ref>...}

    where <word> is a lower-case alpha string, and <ref> is an integer
    numeric string.  Each <word> is guaranteed to be unique.  <ref> is
    expected to describe a location in the subject document where
    <word> occurs, though this program doesn't actually care about
    that.

  Structure of Output Data ...

    Please refer to description in tsl.h.
 */

struct tsl_conc_fileheader fh;


int main(int argc,char **argv)
/*----------------------------------------------------------------------
|   NAME:
|       main
|
|   ALGORITHM:
|       Main Program.
|
|   HISTORY:
|       921221 cc Initial creation.
|
\*----------------------------------------------------------------------*/
{
    FILE	*outfp, *indexfp, *datafp;
    int		ref;		/* NOT a ref_t, because it must be signed */
    ref_t	rbuf[SELECTSZ];
    unsigned char obuf[SELECTSZ*sizeof(ref_t)];
    int		n, oinx, cnt;
    unsigned char bhi, blo;
    int		refs_left;
    int		no_comp;
    int		cmpnum=0;
    int		base, curbase;
    int		m_n, m_ref, m_cmpnum, m_cmpsize;
    file_ptr_t	word_index, data_index;
    int		entry_size;
    short int	this_index;
    Short_Univ_Int indextmp;
#define WRDSZ    100
    char word[WRDSZ];
    
    myname = *argv++; argc--;	/* Program name */
    
    if (argc < 1) {
	fprintf( stderr, "%s: Missing output file name\n", myname );
	fprintf( stderr, "Usage: %s datafile\n", myname );
	exit( 1 );
    }

    outfp = fopen( *argv, "w" );
    if (outfp == NULL) {
	fprintf( stderr, "%s: Cannot open output file %s\n", myname, *argv );
	exit( 1 );
    }
    indexfp = fopen( INDEXTMPF, "w+" );
    datafp = fopen( DATATMPF, "w+" );
    if ((indexfp == NULL) || (datafp == NULL)) {
	fprintf( stderr, "%s: Cannot open temp files.\n", myname );
	exit( 1 );
    }

    /* Initialize the header and write it out... */
    fh.magic[0] = TSL_CONCMAGIC1;
    fh.magic[1] = TSL_CONCMAGIC2;
    fh.version[0] = TSL_CONCFVERSION1;
    fh.version[1] = TSL_CONCFVERSION2;
    strncpy( fh.name, "KJV Concordance", TSL_DESCRSZ );
    univ_assign(fh.word_ptr, sizeof( fh ));
    univ_assign(fh.word_cnt, 0);
    univ_assign(fh.index_ptr, 0);
    univ_assign(fh.data_ptr, 0);
    fwrite( &fh, sizeof(fh), 1, outfp );

    
    /* Process input data.
       Each line consists of a word, followed by a list of numeric
       references where the word occurs.  Separated by blanks.

       We will write the strings for the word list into the output file
       as we go, meanwhile counting the words and building both the reference
       index and the reference data pool in temp files.

       To save space, the reference index entries are not actually pointers
       into the reference data pool, but are the SIZE of the entry's ref data
       pool.
       
       So the offset in the ref data pool for entry N is the
       sum over i=0..N-1 of index(i).

       To complicate things more, there is a special case when an
       entry N has only a single ref.  In that case, the ref is NOT
       entered into the ref data pool but the ref itself is stored as
       index(N).  It is negated to distinguish it from a regular index
       entry.
       
     */
    word_index = 0;	/* The index of each word, 0..N */
    data_index = 0;	/* The offset in the ref data pool of current entry */
    while (scanf( "%s", word) > 0) {
	/* Append string to word list */
	if ((n=fputs( word, outfp )) <= 0)
	    outerr(n);
	putc( 0, outfp );

	/* Start reading references */
	cnt = 0;
	while (scanf( "%d", &n) > 0) {
	    /* Read all refs into buffer */
	    rbuf[cnt++] = (ref_t) n;
	}
	if (cnt <= 0) {
	    /* this should never happen */
	    fprintf( stderr, "%s: Bad input format.  No refs for %s\n",
		    myname, word );
	}
	
	entry_size = 0;		/* size (bytes) of current ref data entry */
	if (cnt == 1) {
	    /* Special handling for this case: word has a SINGLE REFERENCE.
	       Instead of putting the ref into the data pool, just keep it
	       in the index.  Encode it by negating it.
	     */
	    this_index = -rbuf[0];
	} else {
	    /* Write reference list to reference data pool */
	    for (n=oinx=0, refs_left=cnt; refs_left--; n++) {
		ref = rbuf[n];
		no_comp = TRUE;
		if (refs_left && (rbuf[n+1]-ref <= 16)) {
		    /* We can compress at least one (though possibly to
		       no advantage).  How many more might we compress?
		     */
		    cmpnum=1;
		    while ((cmpnum < refs_left) &&
			   (rbuf[n+cmpnum+1]-rbuf[n+cmpnum] <= 16)) {
			cmpnum++;
		    }

		    /*
		       How much will we really save?
		       Model the actual algorithm and compare...
		     */
		    m_cmpsize = 0;
		    curbase = ref;
		    m_n = n;
		    m_ref = rbuf[++m_n]-curbase-1;
		    m_cmpnum = cmpnum-1;
		    while (m_ref >= 0) {
			/* For each byte in bitmap... */
			while ((m_ref < 8) && (m_ref >= 0)) {
			    /* For each ref that fits in this byte... */
			    if (m_cmpnum > 0) {
				m_cmpnum--;
				m_ref = rbuf[++m_n]-curbase-1;
			    } else {
				m_ref = -1;	/* term. flag, both loops */
			    }
			}
			m_cmpsize++;
			curbase += 8;
			m_ref -= 8;
		    }
		    m_cmpsize += 2;		/* Add terminator size */

		    /* Well, did it pay off? */
		    no_comp = (m_cmpsize > (cmpnum*(int)sizeof(ref_t)));
		}
		if (no_comp) {
		    /* No compression */
		    bhi = (ref >> 8);
		    blo = ref & 0x00ff;
		    obuf[oinx++] = bhi;
		    obuf[oinx++] = blo;
		} else {
		    /* Do the compression.
		       Format:
		         <base address><bitmap>
		       Where <base address> is the NEGATED reference to the
		       line BEFORE the start of the bitmap refs.
		     */
		    /* Write base address */
		    base = ref;
		    ref = -ref;
		    bhi = (ref >> 8);
		    blo = ref & 0x00ff;
		    obuf[oinx++] = bhi;
		    obuf[oinx++] = blo;

		    /* Write bitmap.
		         ref is always with respect to curbase.
			 It is the number of verses, MINUS ONE, that
			 separate them.  The MINUS ONE is so the
			 left-shift works correctly.
		     */
		    curbase = base;
		    ref = rbuf[++n]-curbase-1;
		    refs_left -= cmpnum;
		    cmpnum--;
		    while (ref >= 0) {
			/* For each byte in bitmap... */
			blo = 0;
			while ((ref < 8) && (ref >= 0)) {
			    /* For each ref that fits in this byte... */
			    blo |= 0x01 << ref;
			    if (cmpnum > 0) {
				cmpnum--;
				ref = rbuf[++n]-curbase-1;
			    } else {
				ref = -1;	/* term. flag, both loops */
			    }
			}
			obuf[oinx++] = blo;
			curbase += 8;
			ref -= 8;
		    }
		    
		    /* Write TWO terminator bytes */
		    /* Might be better to use ONE, and change the requirement
		       to being within 8 lines, not 16 */
		    obuf[oinx++] = 0;
		    obuf[oinx++] = 0;
		} /* compression */
	    } /* for */
	    if ((n=fwrite( obuf, 1, oinx, datafp )) <= 0)
		outerr(n);

	    entry_size = oinx;
	    if (entry_size > 32767) {
		fprintf( stderr, "%s: Too many entries for \"%s\"!\n",
			myname, word );
		exit(1);
	    }
	    this_index = entry_size;
	} /* write ref list */
	
	/* Write the index entry for this word */
	shortuniv_assign( indextmp, this_index );
	if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
	    outerr(n);
	
	data_index += entry_size;
	word_index++;
    }

    /* It makes searching easier if we guarantee an ending string */
    if ((n=fprintf( outfp, "~" )) <= 0)
	outerr(n);
    shortuniv_assign( indextmp, 0 );
    if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
	outerr(n);
    
    /* OK, now finish putting the output file together. */
    /* First, another header field */
    univ_assign(fh.word_cnt, word_index);

    /* Now copy the reference index from temp file to output file */
    univ_assign(fh.index_ptr, ftell( outfp ));
    rewind( indexfp );
    while (fread( obuf, sizeof(Short_Univ_Int), 1, indexfp ) > 0) {
	fwrite( obuf, sizeof(Short_Univ_Int), 1, outfp );
    }

    /* Then copy the reference data from temp file to output file */
    univ_assign(fh.data_ptr, ftell( outfp ));
    rewind( datafp );
    while (fread( obuf, 1, 1, datafp ) > 0) {
	fwrite( obuf, 1, 1, outfp );
    }

    /* Finally, write out the updated file header */
    rewind( outfp );
    fwrite( &fh, sizeof(fh), 1, outfp );

    /* Prove that we've been working hard... */
    print_header();

    fclose(outfp);
    fclose(indexfp);
    fclose(datafp);
    unlink( INDEXTMPF );
    unlink( DATATMPF );
    return(0);
} /* main */


static void outerr(int n)
{
    fprintf( stderr, "%s: File I/O error #%d, %d\n", __FILE__, n, errno );
}


static void print_header(void)
{
    printf( "Concordance data file:\n" );
    printf( "  Version:  %c%c\n", fh.version[0], fh.version[1] );
    printf( "  Name:     %s\n", fh.name );
    printf( "  Contents: %d words\n", univ2int(fh.word_cnt) );
    printf( "  Word list at file offset %d\n", univ2int(fh.word_ptr) );
    printf( "  Index at file offset %d\n", univ2int(fh.index_ptr) );
    printf( "  Data at file offset %d\n", univ2int(fh.data_ptr) );
}