File: filetype.c

package info (click to toggle)
glimpse 4.18.7-12
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,832 kB
  • sloc: ansic: 37,606; makefile: 847; sh: 242; perl: 142
file content (333 lines) | stat: -rw-r--r-- 10,005 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/filetype.c */
/* --------------------------------------------------------------------------
   This function detects whether a given file is of a special type
   that we do not want to index.
   If so, it returns 1; otherwise it returns 0.
   A file is said to be binary if more than 10% of the characters in the
   sampled input are > 128.
   A file is considered uuencoded if (possibly after a mail header) there is
   a "begin" followed by 3 digits, and no lower-case characters.

   Statistics we are concerned with:
   1) average word length: should not be greater than 10.
   2) index density: (the number of different words vs. the number of words).

-----------------------------------------------------------------------------*/
#include "glimpse.h"
#include "../missing_prototypes.h"

/* Maximum number of bytes filetype() reads from the start of a file; its
   sample buffer is declared with SAMPLE_SIZE+1 bytes. */
#define SAMPLE_SIZE  8192
#define EXTRACT_SAMPLE_SIZE (MAX_LINE_LEN*2)	/* must not exceed SAMPLE_SIZE (it is read into the same buffer): used to get info to be stored ALONG with filename */	/* suggested fix: ldrolez@usa.net */
#define WORD_THRESHOLD  18  /* the ratio between the number of characters and
		delimiters (blank, \n or '/', see hqx()) above which the file is
		determined to be hqx or other non-natural language text */

#if	BG_DEBUG
extern	FILE	*LOGFILE;
#endif	/*BG_DEBUG*/
/* NOTE(review): member[] is defined here but not referenced in this file;
   presumably used by other translation units -- confirm. */
char *member[MAX_4K_HASH];
/* Per-hash-slot stamp: heavy_index() sets member_tag[h] to file_id the first
   time a word hashing to h is seen. */
int member_tag[MAX_4K_HASH];
/* Compared against member_tag[] in heavy_index(); it is not assigned in this
   file (presumably set by the caller for each file -- confirm). */
int  file_id;
extern  char *getword();
extern char INDEX_DIR[MAX_LINE_LEN];
extern int ExtractInfo;
extern int InfoAfterFilename;

/* File-name suffixes for which filetype() calls extract_info(). */
char *extract_info_suffix[] = EXTRACT_INFO_SUFFIX;

/*
 * Return non-zero when the string `s` (of length `s_len`) ends with `suffix`.
 * A string shorter than the suffix never matches; without this length check
 * the comparison would start before the beginning of `s`.
 */
static int
filetype_has_suffix(const char *s, int s_len, const char *suffix, int ignore_case)
{
	int suffix_len = (int) strlen(suffix);

	if (suffix_len > s_len) return 0;
	s += s_len - suffix_len;
	if (ignore_case) return strcasecmp(s, suffix) == 0;
	return strcmp(s, suffix) == 0;
}

/*
 * Decide whether the file `name` should be left out of the index and, when
 * ExtractInfo is set and both xinfo_len and xinfo are non-NULL, extract the
 * information that is stored along with the file name.
 *
 * dosuffix > 0 => processes suffixes (build_in.c after filtering);
 * dosuffix > 0 but != 1 => processes suffixes only (IndexEverything, dir.c where we don't want to read files);
 * dosuffix == 0 => processes other ad-hoc file checks (Default, dir.c where we want to discard un-indexable files).
 *
 * In the "suffixes only" mode the file is still opened and sampled when
 * information has to be extracted, but none of the content checks are run.
 *
 * Returns 1 when the file must not be indexed (special suffix, cannot be
 * opened, empty, postscript, binary, uuencoded, heavy index, hqx-like) and
 * 0 otherwise.  *xinfo_len, when xinfo_len is non-NULL, is reset to 0 on entry
 * and receives the length of the extracted information.
 */
int
filetype(name, dosuffix, xinfo_len, xinfo)
char *name;
int dosuffix;
int *xinfo_len;	/* length of information extracted */
char xinfo[MAX_LINE_LEN];	/* atmost 1K info can be extracted */
{
	unsigned char buffer[SAMPLE_SIZE+1];	/* +1: extract_info() writes a '\0' at buffer[num_read] */
	int num_read;
	int sample_size;
	int fd;
	int i, name_len = strlen(name);
	int extract_only = 0;
	char name_buffer[MAX_LINE_LEN];
	char *tempname;

	if (InfoAfterFilename || ExtractInfo) {
		special_get_name(name, name_len, name_buffer);
		tempname = name_buffer;
	}
	else tempname = name;
	name_len = strlen(tempname);

	if (xinfo_len != NULL) *xinfo_len = 0;
	if (!dosuffix) goto nosuffix;
	if (filetype_has_suffix(tempname, name_len, COMP_SUFFIX, 0))
		return 0;
	if (test_special_suffix(tempname)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "special suffix: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		return 1;
	}
	if (dosuffix != 1) {
		if (!ExtractInfo || (xinfo_len == NULL) || (xinfo == NULL)) return 0;
		extract_only = 1;
	}

nosuffix:

	if((fd = my_open(tempname, 0)) < 0) {
		/* This is the only thing the user might want to know: suppress other warnings */
		fprintf(stderr, "permission denied or non-existent file: %s\n", name);
		return(1);
	}

	/* Never ask for more than the buffer can hold, whatever MAX_LINE_LEN is. */
	sample_size = extract_only ? EXTRACT_SAMPLE_SIZE : SAMPLE_SIZE;
	if (sample_size > SAMPLE_SIZE) sample_size = SAMPLE_SIZE;

	if ((num_read = read(fd, buffer, sample_size)) <= 0) {
#if	BG_DEBUG
		fprintf(LOGFILE, "no data: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}

	if (extract_only) goto extract;

	if (test_postscript(buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "postscript file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}

	if (test_binary(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "binary file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}

	/* now check for uuencoded file */
	if (test_uuencode(buffer, num_read) == ON) {
#if	BG_DEBUG
		fprintf(LOGFILE, "uuencoded file: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}
	if (heavy_index(tempname, buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "heavy index file: %s -- not indexing\n ", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}
	if (hqx(tempname, buffer, num_read)) {
#if	BG_DEBUG
		fprintf(LOGFILE, "too few real words: %s -- not indexing\n", name);
#endif	/*BG_DEBUG*/
		goto reject;
	}

extract:
	if (ExtractInfo && (xinfo_len != NULL) && (xinfo != NULL)) {
		/* This can be replaced by checks for <HTML> in the file somewhere, but suffixes are faster and easier and enough in most cases */
		for (i=0; i<NUM_EXTRACT_INFO_SUFFIX; i++) {
			if (filetype_has_suffix(tempname, name_len, extract_info_suffix[i], 1)) break;
		}
		*xinfo_len = 0;
		if (i < NUM_EXTRACT_INFO_SUFFIX) {
			*xinfo_len = extract_info(tempname, buffer, num_read, i, xinfo, MAX_LINE_LEN);
		} else {
			/* No known suffix: store just the dividing character. */
			xinfo[0] = FILE_END_MARK;
			xinfo[1] = '\0';
			/* NOTE(review): 2 counts the terminating '\0' as well, unlike
			   the length returned by extract_info() -- confirm intended. */
			*xinfo_len = 2;
		}
	}

	close(fd);
	return(0);

reject:
	/* Common exit for every "do not index" decision taken after the open. */
	close(fd);
	return(1);
}

/*
 * Extract the text between <title> and </title> (matched case-insensitively)
 * from the first num_read bytes of buffer and store it in xinfo as a single
 * line: the result starts with FILE_END_MARK, every '\0' or '\n' becomes a
 * blank and every ':' is preceded by a backslash.  When no title is found, or
 * it is empty, FILE_END_MARK followed by "No Title" is stored instead.
 *
 * This does not look at "suffix_index": it is possible to extract different things for different files: they are displayed after name of file in glimpse
 *
 * buffer must have room for num_read+1 bytes: a '\0' is written at
 * buffer[num_read].  xinfo must have room for max_len bytes.
 *
 * Returns the length of the string stored in xinfo (0 if max_len <= 0, in
 * which case nothing is stored).
 */
int
extract_info(name, buffer, num_read, suffix_index, xinfo, max_len)
	char	*name, *buffer, *xinfo;
	int	num_read, suffix_index, max_len;
{
	int	i=0, j=0, k=0, found_begin = 0;
	int	limit;
	static char notitle[16];
	static char *begin = "<title>", *end = "</title>";
	static int begin_len, end_len;
	static unsigned char tr[256];	/* maps upper case to lower case, everything else to itself */
	static int first_time = 1;

	if (first_time) {
		begin_len = strlen(begin);
		end_len = strlen(end);
		for (i=0; i<256; i++)
			tr[i] = isupper(i) ? tolower(i) : i;

		/* We need xinfo to start with a dividing character, usually space or tab */
		notitle[0] =  FILE_END_MARK;
		notitle[1] = '\0';
		strcat(notitle,"No Title");

		first_time = 0;
	}

	if (max_len <= 0) return 0;
	if (num_read < 0) num_read = 0;

	/* Look for the opening tag.  The index into tr[] goes through unsigned
	   char so that bytes >= 0x80 cannot produce a negative index. */
	i = 0;
	buffer[num_read] = '\0';
	while (i<=num_read-begin_len) {
		if (buffer[i] != '<') {
			i++;
			continue;
		}
		for (j=0; j<begin_len; j++)
			if (tr[(unsigned char)buffer[j+i]] != tr[(unsigned char)begin[j]]) break;
		if (j < begin_len) {
			i ++;
			continue;
		}
		i += j;
		/* Skip leading NULs/newlines, but never beyond the sample: the
		   sentinel at buffer[num_read] is itself a '\0'. */
		while ((i < num_read) && ((buffer[i] == '\0') || (buffer[i] == '\n'))) i++;
		found_begin = 1;
		break;
	}

	if (found_begin) {
		/* There was a hard to find off-by-one error here that caused random
		   crashes. We need an extra byte for the terminating 0, so loop
		   only up to max_len - 1.
		   - CV 01/10/00 */
		/* Computed as a signed int: with the unsigned arithmetic implied by
		   strlen() a long name would wrap the limit to a huge value.  Each
		   iteration stores at most two characters, so k stays below
		   max_len-1 and the terminator always fits. */
		limit = max_len - (int)strlen(name) - 3;

		k = 0;
		xinfo[k++] = FILE_END_MARK;   /* We need xinfo to start with a dividing character, usually space or tab */
		while ((i<num_read) && (k<limit)) {
			if (buffer[i] == '<') {
				for (j=0; j<end_len; j++)
					if (tr[(unsigned char)buffer[j+i]] != tr[(unsigned char)end[j]]) break;
				if (j >= end_len) break;	/* found the closing tag */
				/* any other '<' is copied like an ordinary character */
			}
			if ((buffer[i] == '\0') || (buffer[i] == '\n')) {
				xinfo[k++] = ' ';	/* must convert whole title to one line */
				i++;
			}
			else if (buffer[i] == ':') {	/* maybe change : to HTML ascii character rep of : ?? */
				xinfo[k++] = '\\';
				xinfo[k++] = buffer[i++];
			}
			else xinfo[k++] = buffer[i++];
		}
		if (k > 1) {
			xinfo[k] = '\0';
			return k;
		}
	}

	/* No title, or an empty one. */
	k = strlen(notitle);
	strncpy(xinfo, notitle, max_len);
	xinfo[max_len-1] = '\0';
	if (k > max_len-1) k = max_len-1;	/* report what was actually stored */
	return k;
}

/* ----------------------------------------------------------------------
   Check for a heavy index file.
   The function tests the sampled block (num_read bytes of buffer).
   Words are obtained with getword(); a word is counted as new when the
   member_tag[] slot selected by hash4k() does not yet hold file_id, and
   the slot is then stamped with file_id.
   The file is determined to be a heavy index file if at least 83% of the
   words are new and there are at least 500 words.
   Returns 1 for a heavy index file, 0 otherwise.
---------------------------------------------------------------------- */
int
heavy_index(name, buffer, num_read)
char *name;
char *buffer;
int num_read;
{
	enum { MIN_WORDS = 500, NEW_WORD_PERCENT = 83 };
	char *buffer_end;
	int hash_value;
	int new_word_num=0;
	int word_num=0;
	char word[256];

	buffer_end = &buffer[num_read];
	while((buffer = getword(name, word, buffer, buffer_end, NULL, NULL)) < buffer_end) {
		if(word[0] == '\0') continue;
		word_num++;
		hash_value = hash4k(word, strlen(word));
		if(member_tag[hash_value] != file_id) {
			new_word_num++;
			member_tag[hash_value] = file_id;
		}
	}
	/* Integer form of new_word_num/word_num >= 0.83. */
	if(new_word_num * 100 >= word_num * NEW_WORD_PERCENT && word_num >= MIN_WORDS) return(1);
#ifdef debug
	printf("%s: new_word_num=%d, word_num=%d\n", name, new_word_num, word_num);
#endif
	return(0);
}

/* ----------------------------------------------------------------------
   Check for hqx encoded files or other files with long lines,
   for example, postscript files, core files, and others.
   The function tests the sampled block (num_read bytes of buffer).
   The file is determined to be bad if the ratio of separators (blanks,
   newlines or '/') is too small: either there are none at all, or there
   are more than WORD_THRESHOLD bytes per separator.
   Samples shorter than 2048 bytes are never rejected.
   Returns 1 for a bad file, 0 otherwise.  name is not used.
---------------------------------------------------------------------- */

int
hqx(name, buffer, num_read)
char *name;
char *buffer;
int num_read;
{
	int i;
	int sep = 0;

	if (num_read < 2048) return(0);

	for (i = 0; i < num_read; i++) {
		/* the '/' is for list of file names. */
		/* the \n is for lists of words, but should be excluded really so
			that dictionaries are excluded */
		if (buffer[i] == '\n' || buffer[i] == ' ' || buffer[i] == '/') sep++;
	}

	if (!sep) return(1);	/* also keeps the division below well defined */
	return (num_read/sep > WORD_THRESHOLD) ? 1 : 0;
}