1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
// size handed to fgets() when reading input; the buffer is one byte larger
#define MAX_LINE 512

// forward declarations
// main() calls do_line(), do_line() calls print_line() and print_line()
// calls list_word() before any of them is defined; implicit function
// declarations are not valid C since C99, so declare them all up front
void die( char *message ) ;
int do_line( char *data ) ;
int print_line( char *before, char *word, char *after, char *tag ) ;
int list_word( char *p ) ;

// holds the input line currently being processed
char buffer[MAX_LINE+1] ;
// set from argv[0] in main(), prefixed to error messages by die()
char *prog_name ;
// report a fatal error and terminate the program
// message: text printed after the "<prog_name>: " prefix
// pending stdout output is flushed first, the message goes to stderr,
// and the process exits with status 1 -- this function never returns
void die( char *message )
{
    (void) fflush(stdout) ;
    (void) fprintf(stderr, "%s: %s\n", prog_name, message) ;
    exit(1) ;
}
// entry point: pure stdin-to-stdout filter
// any command-line argument is a fatal error
// each chunk read by fgets() (at most MAX_LINE-1 characters) is handed to
// do_line(), whose return values are summed
// exit status is 1 if that sum is nonzero, else 0
int main(int argc, char* argv[])
{
    int failures = 0 ;

    prog_name = argv[0] ;
    if( argc != 1 ) {
        die("pure filter, takes no arguments") ;
    }

    while( fgets(buffer, MAX_LINE, stdin) != NULL ) {
        failures += do_line(buffer) ;
    }

    exit( failures != 0 ) ;
}
// split one input line into permuted-index entries
// data: a line holding two tab-separated fields; it is edited in place
//       (the first tab and any newline in the 2nd field become NULs)
// print_line() is called once per space-separated word of the 1st field
// returns 1 if there is no tab or the 2nd field is empty, else 0
int do_line(char *data)
{
    char *word, *word_end, *tag, *end, *scan, *before, *after ;

    // the first tab separates the two fields
    end = strchr(data, '\t') ;
    if( end == NULL )
        return 1 ;

    // null terminate the 1st field; the 2nd begins right after the tab
    *end = '\0' ;
    tag = end + 1 ;

    // overwrite newlines in the 2nd field with terminators
    scan = tag ;
    while( *scan != '\0' ) {
        if( *scan == '\n' )
            *scan = '\0' ;
        scan++ ;
    }
    if( tag[0] == '\0' )
        return 1 ;

    // walk the 1st field word by word
    // word points at the current word, word_end just past it
    // before & after are the text on either side of the word
    // the spaces bounding the word are swapped for NULs while
    // print_line() runs, then put back
    before = "" ;
    word = data ;
    while( word < end ) {
        if( word > data ) {
            // everything from the start of the line up to this word
            before = data ;
            word[-1] = '\0' ;
        }

        // the word stops at the next space or at the end of the field
        word_end = word + strcspn(word, " ") ;

        if( word_end == end )
            after = "" ;
        else if( word_end < end ) {
            after = word_end + 1 ;
            *word_end = '\0' ;
        }
        else assert(0) ;

        print_line(before, word, after, tag) ;

        // restore the separators that were overwritten above
        if( word_end < end )
            *word_end = ' ' ;
        if( word > data )
            word[-1] = ' ' ;

        word = word_end + 1 ;
    }
    return 0 ;
}
// print one formatted line of the permuted index to stdout
// output is two tab-separated fields
//   1st is the sort key: "<word> <after>"
//   2nd is the printable line: the last 30 characters of before,
//       right-aligned, then tag, word and a literal "</a>", then
//       padding for words shorter than 18 characters, then after
// pipe it through something like
//   sort -f | awk -F '\t' '{print $2}'
// to get final output
//
// before: text preceding the keyword
// word:   the keyword; nothing is printed if list_word() matches it
// after:  text following the keyword
// tag:    printed verbatim immediately before the keyword
//         (presumably an opening HTML anchor, since "</a>" follows the
//         keyword -- confirm against the input data)
// returns 1 if a line was printed, 0 if the word was skipped
int print_line( char *before, char *word, char *after, char *tag)
{
    enum { BEFORE_WIDTH = 30, KEYWORD_WIDTH = 18 } ;
    size_t before_len, word_len ;

    if( list_word(word) )
        return 0 ;

    // put in sortable field
    // strip out with awk after sorting
    printf("%s %s\t", word, after) ;

    // keep only the tail of the before string so it fits its field
    before_len = strlen(before) ;
    if( before_len > BEFORE_WIDTH )
        before += before_len - BEFORE_WIDTH ;
    printf("%*s", BEFORE_WIDTH, before) ;

    // print keyword, html tagged
    printf(" %s%s</a> ", tag, word) ;

    // padding, outside tag
    for( word_len = strlen(word) ; word_len < KEYWORD_WIDTH ; word_len++ )
        putchar(' ') ;

    // an empty after string prints nothing
    fputs(after, stdout) ;
    putchar('\n') ;
    return 1 ;
}
// avoid indexing on common English words and bare punctuation
// the "" entry makes list_word() match an empty word as well
// the table is terminated by a NULL entry
char *list[] = {
    "the", "of", "a", "an", "to", "and", "or", "if", "for", "at",
    "am", "is", "are", "was", "were", "have", "has", "had", "be", "been",
    "on", "some", "with", "any", "into", "as", "by", "in", "out",
    "that", "then", "this", "than", "these", "those",
    "he", "his", "him", "she", "her", "hers", "it", "its",
    "&", "", "+", "-", "=", "--", "<", ">", "<=", ">=",
    "!", "?", "#", "$", "%", "/", "\\", "\"", "\'",
    NULL
} ;
// interrogative words like "how" and "where" deliberately left out of
// above list because users might want to search for "how to..." etc.

// check a word against the list above
// p: NUL-terminated word to look up
// return 1 if word in list, else 0
// case-insensitive comparison (strcasecmp, declared in <strings.h>)
int list_word( char *p )
{
    char **entry ;

    for( entry = list ; *entry != NULL ; entry++ ) {
        if( strcasecmp( p, *entry ) == 0 )
            return 1 ;
    }
    return 0 ;
}
|