File: archer.h

package info (click to toggle)
bow 19991122-4
  • links: PTS
  • area: main
  • in suites: woody
  • size: 2,544 kB
  • ctags: 2,987
  • sloc: ansic: 38,660; lisp: 1,072; makefile: 594; perl: 492; yacc: 149; sh: 91
file content (216 lines) | stat: -rw-r--r-- 8,174 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* archer.h - public declarations for IR frontend to libbow.
   Copyright (C) 1998 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#ifndef __archer_h_INCLUDE
#define __archer_h_INCLUDE

#include <bow/libbow.h>


/* The version number of this program. */
#define ARCHER_MAJOR_VERSION 0
#define ARCHER_MINOR_VERSION 0

/* NOTE(review): presumably the maximum number of label indices that
   may accompany a single word occurrence, i.e. the capacity callers
   should give the LI[] arrays passed to the *_di_li_pi() functions --
   confirm against the implementation. */
#define BOW_MAX_WORD_LABELS 100

/* The variables that are set by command-line options.
   NOTE(review): only the declarations are visible in this header; the
   per-field notes marked "presumably" are inferred from the field
   names and should be confirmed against the option-parsing code. */
struct archer_arg_state_s
{
  /* What is this invocation of archer to do?  The empty parameter
     list leaves the argument types of the callback unspecified. */
  void (*what_doing)();
  /* Presumably the argv index of the first non-option argument. */
  int non_option_argi;
  /* Presumably the maximum number of query hits to print. */
  int num_hits_to_print;
  /* Presumably the stream that query results are written to. */
  FILE *query_out_fp;
  /* Presumably the directory holding the archer data files. */
  const char *dirname;
  /* Presumably the text of the query to run. */
  const char *query_string;
  /* Presumably the port used in server mode; note that it is kept as
     a string, not as an integer. */
  const char *server_port_num;
  /* Presumably non-zero if the server should fork() to handle queries. */
  int serve_with_forking;
  /* Presumably non-zero if hit scores are raw occurrence counts. */
  int score_is_raw_count;
};

/* One byte of an index.  The same storage can be viewed raw (`byte'),
   as the first byte of an index (`bits'), or as a following byte
   (`bits_more').  Each bit-field view declares exactly eight bits
   (1+1+6 and 1+7).
   NOTE(review): how a compiler orders and pads bit-fields is
   implementation-defined, so the correspondence between `byte' and
   the two bit-field views depends on the compiler and target --
   confirm before relying on it on a new platform. */
typedef union {
  /* When we write it to disk, it just looks like a `char' */
  unsigned char byte;
  /* The first byte not only must tell us if we must read more bytes
     in order to get all the bits of the offset, but also must tell us
     if this is a "document index", "label index" or a "position
     index".  In each sequence of indices that have is_di true, the
     first index is a valid (non-zero) "document index" or a
     placeholder zero value, and every non-first index is a "label
     index".  However many label parameters there are, they are stored
     immediately after the label index. */
  struct _bow_pe {
    unsigned int is_more:1;	/* more bytes must be read for this index */
    unsigned int is_di:1;	/* part of a document/label index sequence;
				   presumably clear for a position index */
    unsigned int index:6;	/* six bits of the index value */
  } bits;
  /* The bytes following the first don't need to tell us if they are a
     "document index", "label index" or "position index" because the
     first byte already told us. */
  struct _bow_pe_more {
    unsigned int is_more:1;	/* more bytes must be read for this index */
    unsigned int index:7;	/* seven bits of the index value */
  } bits_more;
} bow_pe;

/* In-memory bookkeeping for one PV, whose index data lives on disk.
   The read_* fields describe the read cursor.
   NOTE(review): the write_* fields carry no comments in the original
   header; the notes on them below are inferred from the names of the
   matching read_* fields -- confirm against the implementation. */
typedef struct _bow_pv {
  int count;			/* total number of word occurrences in PV */
  int seek_start;		/* disk position where this PV starts */
  int read_seek_end;		/* disk position from which to read next */
  int read_last_di;		/* doc index last read */
  int read_last_pi;		/* position index last read */
  int read_segment_bytes_remaining; /* presumably unread bytes left in
				       the segment being read */
  int write_seek_end;		/* presumably disk position at which to
				   write next */
  int write_last_di;		/* presumably doc index last written */
  int write_last_pi;		/* presumably position index last written */
  int write_segment_bytes;	/* presumably size in bytes of the segment
				   being written */
  int write_segment_bytes_remaining; /* presumably unused bytes left in
					the segment being written */
} bow_pv;

/* The wi2pv mapping: per-wi PV bookkeeping together with the two
   files that back it (one holding the pv/di/pi matrix, one holding
   the mapping itself).
   NOTE(review): "wi" is presumably a word index -- confirm. */
typedef struct _bow_wi2pv {
  const char *pv_filename;   /* filename where pv/di/pi matrix is stored */
  FILE *fp;                  /* file-pointer for the same */
  int num_words;             /* number of used wi entries */
  int next_word;             /* greatest wi entry on disk, plus 1.
				0 if no used entries */
  int entry_count;           /* number of wi entries allocated
				(entry_count >= next_word >= num_words) */
  bow_pv *entry;             /* presumably an array of entry_count PVs,
				indexed by wi -- confirm */
  FILE *inc_fp;              /* file-pointer for the wi2pv mapping */
  long entry_start;          /* position in the above file where entries 
				start. 
				- equal to the length of the header,
				which includes the pv_filename, whose
				length isn't known at compile-time */
} bow_wi2pv;

/* Per-document record.
   NOTE(review): the fields carry no comments in the original header;
   the notes below are inferred from their names and types -- confirm
   against the code that fills them in. */
typedef struct archer_doc {
  bow_doc_type tag;		/* type from <bow/libbow.h>; not defined here */
  int word_count;		/* presumably number of words in the document */
  int di;			/* presumably this document's "document index" */
} archer_doc;

/* Document annotations.
   NOTE(review): presumably FEATS and VALS are parallel arrays of
   feature-name and value strings, COUNT is the number of pairs in use
   and SIZE the number of slots allocated -- confirm against the
   annotation_*() implementation. */
typedef struct annotation {
  int count;
  int size;
  char **feats;
  char **vals;
} annotation;

typedef struct archer_label {
  int word_count;
  int li;
} archer_label;

/* Interface to the wi2pv mapping.  In the parameter lists below DI is
   a "document index", PI a "position index" and LI[] an array of
   "label indices".
   NOTE(review): WI is presumably a word index and LN the number of
   entries used in LI[].  No implementation is visible from this
   header, so every per-function note below is inferred from the
   function and parameter names -- confirm before relying on it. */

/* Presumably create a new mapping with room for CAPACITY words, backed
   by the files PV_FILENAME and INC_FILENAME. */
bow_wi2pv *bow_wi2pv_new (int capacity, const char *pv_filename, const char *inc_filename);
/* Presumably release WI2PV and the resources it holds. */
void bow_wi2pv_free (bow_wi2pv *wi2pv);
/* Presumably record an occurrence of WI at DI, PI. */
void bow_wi2pv_add_wi_di_pi (bow_wi2pv *wi2pv, int wi, int di, int pi); /* deprecated */
/* Presumably record an occurrence of WI at DI, PI together with the
   LN label indices in LI[]. */
void bow_wi2pv_add_wi_di_li_pi (bow_wi2pv *wi2pv, int wi, int di, int li[],
				int ln, int pi);
/* Presumably reset the read position of every PV in WI2PV. */
void bow_wi2pv_rewind (bow_wi2pv *wi2pv);
/* Presumably fetch the next DI, PI stored for WI into *DI and *PI. */
void bow_wi2pv_wi_next_di_pi (bow_wi2pv *wi2pv, int wi, int *di, int *pi); /* deprecated */
/* Presumably fetch the next DI, PI stored for WI, plus its label
   indices into LI[] and their number into *LN. */
void bow_wi2pv_wi_next_di_li_pi(bow_wi2pv *wi2pv, int wi, int *di,
				int li[], int *ln, int *pi);
/* Presumably undo the last "next" call made for WI. */
void bow_wi2pv_wi_unnext (bow_wi2pv *wi2pv, int wi);
/* Presumably return the number of occurrences stored for WI. */
int bow_wi2pv_wi_count (bow_wi2pv *wi2pv, int wi);
/* Presumably write the header of the wi2pv mapping file. */
void bow_wi2pv_write_header (bow_wi2pv *wi2pv);
/* Presumably write the entry for WI to the wi2pv mapping file. */
void bow_wi2pv_write_entry (bow_wi2pv *wi2pv, int wi);
/* Presumably write the whole mapping (header and entries). */
void bow_wi2pv_write (bow_wi2pv *wi2pv);
/* Presumably load a mapping previously written to FILENAME. */
bow_wi2pv *bow_wi2pv_new_from_filename (const char *filename);
/* Presumably print summary statistics about WI2PV. */
void bow_wi2pv_print_stats (bow_wi2pv *wi2pv);


/* Fill in PV with the correct initial values, and write the first
   segment header to disk through FP.  What this function does must
   match what bow_pv_add_di_pi() does when it adds a new segment, so
   the two must be changed together. */
void bow_pv_init (bow_pv *pv, FILE *fp);

/* Add "document index" DI and "position index" PI to PV by writing
   the correct information to FP.  Does not assume that FP is already
   seek'ed to the correct position.  Will add a new PV segment on disk
   if necessary.  Assumes that both DI and PI are greater than or
   equal to the last DI and PI written, respectively. */
void bow_pv_add_di_pi (bow_pv *pv, int di, int pi, FILE *fp);
/* NOTE(review): presumably the same as bow_pv_add_di_pi(), but also
   records the LN "label indices" held in LI[] -- confirm against the
   implementation. */
void bow_pv_add_di_li_pi (bow_pv *pv, int di, int li[], int ln, int pi,
			  FILE *fp);

/* Read the next "document index" into *DI and the next "position
   index" into *PI.  Does not assume that FP is already seek'ed to the
   correct position.  Will jump to a new PV segment on disk if
   necessary. */
void bow_pv_next_di_pi (bow_pv *pv, int *di, int *pi, FILE *fp);
/* NOTE(review): presumably the same as bow_pv_next_di_pi(), but also
   returns the "label indices" in LI[] and their number in *LN --
   confirm against the implementation, including how large LI[] must
   be. */
void bow_pv_next_di_li_pi (bow_pv *pv, int *di, int li[], int *ln, int *pi,
			   FILE *fp);

/* Undo the effect of the last call to bow_pv_next_di_pi().  That is,
   make the next call to bow_pv_next_di_pi() return the same DI and PI
   as the last call did.  This function may not be called multiple
   times in a row without calling bow_pv_next_di_pi() in between.
   NOTE(review): presumably this applies equally after
   bow_pv_next_di_li_pi() -- confirm. */
void bow_pv_unnext (bow_pv *pv);

/* Rewind the read position to the beginning of the PV, so that
   reading starts over from its first entry. */
void bow_pv_rewind (bow_pv *pv, FILE *fp);

/* Write the in-memory portion of PV to FP.
   NOTE(review): "in-memory portion" presumably means the bookkeeping
   fields of the bow_pv struct -- confirm. */
void bow_pv_write (bow_pv *pv, FILE *fp);

/* Read the in-memory portion of PV from FP.
   NOTE(review): presumably the inverse of bow_pv_write() -- confirm. */
void bow_pv_read (bow_pv *pv, FILE *fp);

/* Close and re-open WI2PV's FILE* for its PV's.  This should be done
   after a fork(), since otherwise the parent and child will share
   lseek() positions. */
void bow_wi2pv_reopen_pv (bow_wi2pv *wi2pv);

/* Label handling code.
   NOTE(review): no implementation is visible from this header.  The
   push/pop/last names suggest a stack of label strings, and
   first/next suggest iteration over it; the functions taking BUF and
   BUFSZ presumably copy a label into the caller's buffer of BUFSZ
   bytes.  Confirm all of this, and what is returned when there is no
   label, against the implementation. */
const char *bow_last_label(void);
void bow_push_label(const char *label);
char *bow_pop_label(char buf[], int bufsz);
char *bow_first_label(char buf[], int bufsz);
char *bow_next_label(char buf[], int bufsz);
void bow_reset_labels(void);

/* Lexer interfaces.  Two lexers are declared, `flex_mail' and
   `tagged_lex', each with an open function and two get_word
   functions.
   NOTE(review): no implementation is visible from this header.
   Presumably the open functions start lexing FP (NAME being the
   document's name), get_word copies the next word into BUF (at most
   BUFSZ bytes), and the _extended variants also report where the word
   starts and ends through *START and *END.  Confirm these, and the
   meaning of the int return values, against the lexer sources. */
void flex_mail_open(FILE *fp, const char * name);
int flex_mail_get_word(char buf[], int bufsz);
int flex_mail_get_word_extended(char buf[], int bufsz, long* start, long* end);
void tagged_lex_open(FILE* fp, const char * name);
void tagged_lex_open_dont_parse_tags(FILE* fp, const char * name);
int tagged_lex_get_word(char buf[], int bufsz);
int tagged_lex_get_word_extended(char buf[], int bufsz, long* start, long* end);

/* Server code.  Note that this is declared with an empty parameter
   list, which leaves its argument types unspecified rather than
   declaring that it takes no arguments.
   NOTE(review): presumably the entry point of query-server mode --
   confirm against the definition. */
void archer_query_serve();

/* Annotations.  `annotation' is the struct declared in this header;
   bow_sarray comes from <bow/libbow.h>.
   NOTE(review): no implementation is visible from this header, so the
   following is inferred from the names: the *_new functions create an
   empty container, annotation_add_fval appends a feature/value pair,
   the sarray write/read/reread functions save to and load from FNAME,
   annotation_feat and annotation_val return the INDEX'th feature and
   value, and annotation_sarray_entry_at_keystr looks an annotation up
   by KEYSTR.  Confirm these, and who owns the returned strings,
   against the implementation. */
bow_sarray *annotation_sarray_new(void);
annotation *annotation_new(void);
void annotation_add_fval(annotation *a, char *feat, char *val);
void annotation_sarray_write(bow_sarray *sa, char *fname);
bow_sarray *annotation_sarray_read(const char *fname);
bow_sarray *annotation_sarray_reread(bow_sarray *sa, const char *fname);
int annotation_count(annotation *a);
char *annotation_feat(annotation *a, int index);
char *annotation_val(annotation *a, int index);
annotation *annotation_sarray_entry_at_keystr(bow_sarray *a, const char
					      *keystr);


#endif /* __archer_h_INCLUDE */