1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */
/*
* quick.c: Used to search for a pattern in a compressed file.
*
* Algorithm: if the file (or stdin) is a compressed file, then:
* + a. Read in the hash-table-index file.
* + b. For each page in which the words of the pattern can be found:
* build the hash-table using the words in exactly those pages.
* + c. Now, call compress with the given pattern.
*
* + d. Call the normal search routines with the compressed pattern on
* the input file.
* + e. If the option is to count number of matches, just exit.
* Otherwise we have to modify the r_output/output routines:
*
* + f. Read in the string-table-index file.
* + g. For each page in which the word numbers of the input file can
* be found: build the string-table using the words in exactly
* those pages.
* + h. Call uncompress with the input file line to be output and
* output THIS line instead of the original matched line.
*
* Part of this will be in agrep and part of this here.
*/
#include "defs.h"
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
/*
* The quick-functions can be called any number of times --
* they open the hash, string and freq files only once, and reopen
* them only when the file names passed in differ from the previous call.
*/
/*
 * State shared by quick_tcompress(): the hash table is filled in page by
 * page via build_partial_hash() rather than being loaded in full.
 */
hash_entry *compress_hash_table[HASH_TABLE_SIZE]; /* used for compress: assume it is zeroed by C */
char loaded_hash_table[HASH_FILE_BLOCKS]; /* bit mask of loaded pages in hash-table: store chars since just 4K: speed is most imp. */
char *hashindexbuf; /* malloc'd copy of the "<hash_file>.index" contents that follow the leading blocksize number */
int hashindexsize; /* size passed to build_partial_hash() with hashindexbuf: bytes read from the index file + 1 */
/*
 * quick_tcompress: compress `pattern`, loading only the hash-table pages
 * needed for it instead of the full hash table.
 *
 *   freq_file   frequency file name, handed to initialize_common()
 *   hash_file   hash-table file name; its index is read from "<hash_file>.index"
 *   pattern     bytes to compress
 *   len         number of bytes in pattern (a trailing '\0' is not counted)
 *   newpattern  user-supplied output, can be FILE* or CHAR*
 *   maxnewlen   passed through to tcompress()
 *   flags       TC_* flags; TC_ERRORMSGS enables most diagnostics on stderr
 *
 * Returns the length of the compressed pattern as reported by tcompress(),
 * or 0 when initialization fails.
 *
 * The hash file and its index are (re)opened only on the first call or when
 * freq_file/hash_file differ from the previous successful call.
 */
int
quick_tcompress(freq_file, hash_file, pattern, len, newpattern, maxnewlen, flags)
    char *freq_file;
    char *hash_file;
    CHAR *pattern;
    int len;
    void *newpattern; /* can be FILE* or CHAR* */
    int *maxnewlen;
    int flags;
{
    static FILE *hashfp = NULL, *hashindexfp = NULL;
    static char old_freq_file[MAX_LINE_LEN] = "", old_hash_file[MAX_LINE_LEN] = "";
    static int blocksize;
    int newlen;

    if ((hashfp == NULL) || (strcmp(freq_file, old_freq_file)) || (strcmp(hash_file, old_hash_file)))
    {   /* Have to do some initializations */
        char s[MAX_LINE_LEN + sizeof(".index")]; /* large enough for any accepted hash_file + ".index" */
        struct stat statbuf;
        size_t nread;

        /* Both names are cached in fixed-size statics below; refuse names that would not fit. */
        if ((strlen(freq_file) >= sizeof(old_freq_file)) || (strlen(hash_file) >= sizeof(old_hash_file))) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "quick_tcompress: file name too long\n");
            return 0;
        }

        if (hashfp != NULL) {
            /*
             * NOTE(review): the previous hashindexbuf is not freed here because it is not
             * visible whether uninitialize_tcompress() already releases it -- confirm.
             */
            uninitialize_tcompress();
            fclose(hashfp);
            hashfp = NULL;
        }
        else memset(loaded_hash_table, '\0', HASH_FILE_BLOCKS);

        if (!initialize_common(freq_file, flags)) return 0; /* don't call initialize_tcompress since that will load the FULL hash table */

        if ((hashfp = fopen(hash_file, "r")) == NULL) {
            if (flags & TC_ERRORMSGS) {
                fprintf(stderr, "cannot open cast-dictionary file: %s\n", hash_file);
                fprintf(stderr, "(use -H to give a dictionary-dir or run 'buildcast' to make a dictionary)\n");
            }
            return 0;
        }

        sprintf(s, "%s.index", hash_file);
        if ((hashindexfp = fopen(s, "r")) == NULL) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "cannot open for reading: %s\n", s);
            goto init_failed;
        }

        /* The index file starts with the block size; fall back to the default if absent or invalid. */
        blocksize = 0;
        fscanf(hashindexfp, "%d\n", &blocksize);
        if (blocksize <= 0) blocksize = DEF_BLOCKSIZE;

        if (fstat(fileno(hashindexfp), &statbuf) == -1) {
            fprintf(stderr, "error in quick_tcompress/fstat on '%s.index'\n", hash_file);
            goto init_failed;
        }

        /*
         * +2: at most st_size bytes are read, then one padding byte and one terminator
         * are written (hashindexsize is "bytes read + 1").
         */
        if ((hashindexbuf = (char *)malloc((size_t)statbuf.st_size + 2)) == NULL) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "quick_tcompress: malloc failure!\n");
            goto init_failed;
        }

        /* fread() never returns -1; a failure has to be detected with ferror(). */
        nread = fread(hashindexbuf, 1, (size_t)statbuf.st_size, hashindexfp);
        if (ferror(hashindexfp)) {
            fprintf(stderr, "error in quick_tcompress/fread on '%s.index'\n", hash_file);
            free(hashindexbuf);
            hashindexbuf = NULL;
            goto init_failed;
        }
        hashindexbuf[nread] = '\0';         /* do not leave the byte counted by the +1 uninitialized */
        hashindexsize = (int)nread + 1;     /* st_size - bytes used up for blocksize in file + 1 */
        hashindexbuf[hashindexsize] = '\0';

        fclose(hashindexfp);
        hashindexfp = NULL;
        strcpy(old_freq_file, freq_file);   /* lengths were checked above */
        strcpy(old_hash_file, hash_file);
    }
    else rewind(hashfp); /* Don't do it first time */

    if ((len > 0) && (pattern[len-1] == '\0')) len--;
    build_partial_hash(compress_hash_table, hashfp, hashindexbuf, hashindexsize, pattern, len, blocksize, loaded_hash_table);
    newlen = tcompress(pattern, len, newpattern, maxnewlen, flags);
#if 0
    printf("quick_tcompress: pat=%s len=%d newlen=%d newpat=", pattern, len, newlen);
    for (i=0; i<newlen; i++) printf("%d ", newpattern[i]);
    printf("\n");
#endif /*0*/
    return newlen;

init_failed:
    /* Close whatever was opened so that the next call starts initialization from scratch. */
    if (hashindexfp != NULL) {
        fclose(hashindexfp);
        hashindexfp = NULL;
    }
    if (hashfp != NULL) {
        fclose(hashfp);
        hashfp = NULL;
    }
    return 0;
}
/*
 * State shared by quick_tuncompress(): the string table is filled in page by
 * page via build_partial_string() rather than being loaded in full.
 */
char *compress_string_table[DEF_MAX_WORDS]; /*[MAX_WORD_LEN+2]; */
char loaded_string_table[STRING_FILE_BLOCKS]; /* bit mask of loaded pages in string-table: store chars since just 4K: speed is most imp. */
char *stringindexbuf; /* malloc'd buffer of unsigned shorts, one per number read from "<string_file>.index" after the leading blocksize */
int stringindexsize; /* number of bytes of stringindexbuf that have been filled in */
/*
 * quick_tuncompress: uncompress `pattern`, loading only the string-table
 * pages needed for it instead of the full string table.
 *
 *   freq_file    frequency file name, handed to initialize_common()
 *   string_file  string-table file name; its index is read from "<string_file>.index"
 *   pattern      compressed bytes to expand
 *   len          number of bytes in pattern
 *   newpattern   user-supplied output, can be FILE* or CHAR*
 *   maxnewlen    passed through to tuncompress()
 *   flags        TC_* flags; TC_ERRORMSGS enables most diagnostics on stderr
 *
 * Returns the length of the uncompressed pattern as reported by
 * tuncompress(), or 0 when initialization fails.
 *
 * The string file and its index are (re)opened only on the first call or when
 * freq_file/string_file differ from the previous successful call.
 */
int
quick_tuncompress(freq_file, string_file, pattern, len, newpattern, maxnewlen, flags)
    char *freq_file;
    char *string_file;
    CHAR *pattern;
    int len;
    void *newpattern; /* can be FILE* or CHAR* */
    int *maxnewlen;
    int flags;
{
    static FILE *stringfp = NULL, *stringindexfp = NULL;
    static char old_freq_file[MAX_LINE_LEN] = "", old_string_file[MAX_LINE_LEN] = "";
    static int blocksize;
    int newlen;
    int dummy;

    if ((stringfp == NULL) || (strcmp(freq_file, old_freq_file)) || (strcmp(string_file, old_string_file)))
    {   /* Have to do some initializations */
        char s[MAX_LINE_LEN + sizeof(".index")]; /* large enough for any accepted string_file + ".index" */
        struct stat statbuf;
        size_t indexcap;

        /* Both names are cached in fixed-size statics below; refuse names that would not fit. */
        if ((strlen(freq_file) >= sizeof(old_freq_file)) || (strlen(string_file) >= sizeof(old_string_file))) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "quick_tuncompress: file name too long\n");
            return 0;
        }

        if (stringfp != NULL) {
            /*
             * NOTE(review): the previous stringindexbuf is not freed here because it is not
             * visible whether uninitialize_tuncompress() already releases it -- confirm.
             */
            uninitialize_tuncompress();
            fclose(stringfp);
            stringfp = NULL;
        }
        else memset(loaded_string_table, '\0', STRING_FILE_BLOCKS);

        if (!initialize_common(freq_file, flags)) return 0; /* don't call initialize_tuncompress since that will load the FULL string table */

        if ((stringfp = fopen(string_file, "r")) == NULL) {
            if (flags & TC_ERRORMSGS) {
                fprintf(stderr, "cannot open cast-dictionary file: %s\n", string_file);
                fprintf(stderr, "(use -H to give a dictionary-dir or run 'buildcast' to make a dictionary)\n");
            }
            return 0;
        }

        sprintf(s, "%s.index", string_file);
        if ((stringindexfp = fopen(s, "r")) == NULL) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "cannot open for reading: %s\n", s);
            goto init_failed;
        }

        /* The index file starts with the block size; fall back to the default if absent or invalid. */
        blocksize = 0;
        fscanf(stringindexfp, "%d\n", &blocksize);
        if (blocksize <= 0) blocksize = DEF_BLOCKSIZE;

        if (fstat(fileno(stringindexfp), &statbuf) == -1) {
            fprintf(stderr, "error in quick_tuncompress/fstat on '%s.index'\n", string_file);
            goto init_failed;
        }

        indexcap = (size_t)statbuf.st_size + 1;
        if ((stringindexbuf = (char *)malloc(indexcap)) == NULL) {
            if (flags & TC_ERRORMSGS)
                fprintf(stderr, "quick_tuncompress: malloc failure!\n");
            goto init_failed;
        }

        /*
         * Convert each remaining number of the index file into an unsigned short.
         * Every number occupies at least two bytes of the file (except possibly the
         * last), so the buffer normally suffices; the capacity test guards the write
         * regardless of the file contents.
         */
        stringindexsize = 0;
        while (((size_t)stringindexsize + sizeof(unsigned short) <= indexcap) &&
               (fscanf(stringindexfp, "%d\n", &dummy) == 1)) {
            *((unsigned short *)(stringindexbuf+stringindexsize)) = (unsigned short)dummy;
            stringindexsize+=sizeof(unsigned short);
        }
        if (ferror(stringindexfp)) {
            fprintf(stderr, "error in quick_tuncompress/fscanf on '%s.index'\n", string_file);
            free(stringindexbuf);
            stringindexbuf = NULL;
            stringindexsize = 0;
            goto init_failed;
        }

        fclose(stringindexfp);
        stringindexfp = NULL;
        strcpy(old_freq_file, freq_file);   /* lengths were checked above */
        strcpy(old_string_file, string_file);
    }
    else rewind(stringfp);

    build_partial_string(compress_string_table, stringfp, stringindexbuf, stringindexsize, pattern, len, blocksize, loaded_string_table);
    newlen = tuncompress(pattern, len, newpattern, maxnewlen, flags);
#if 0
    printf("quick_tuncompress: len=%d newlen=%d newpat=%s pat=", len, newlen, newpattern);
    for (i=0; i<len; i++) printf("%d ", pattern[i]);
    printf("\n");
#endif /*0*/
    return newlen;

init_failed:
    /* Close whatever was opened so that the next call starts initialization from scratch. */
    if (stringindexfp != NULL) {
        fclose(stringindexfp);
        stringindexfp = NULL;
    }
    if (stringfp != NULL) {
        fclose(stringfp);
        stringfp = NULL;
    }
    return 0;
}
|