File: index.c

package info (click to toggle)
swish++ 1.1b3-3
links: PTS
area: main
in suites: slink
size: 416 kB
ctags: 409
sloc: ansic: 2,842; makefile: 247; sh: 48
file content (1001 lines) | stat: -rw-r--r-- 25,778 bytes
parent folder | download | duplicates (2)
/*
**	SWISH++
**	index.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <climits>
#include <cmath>				/* for log(3) */
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream.h>
#include <iomanip.h>
#include <iostream.h>
#include <string>
#include <sys/types.h>
#include <unistd.h>
#include <vector>

// local
#include "config.h"
#include "directory.h"
#include "fake_ansi.h"
#include "file_info.h"
#include "file_list.h"
#include "file_vector.h"
#include "html.h"
#include "string_set.h"
#include "util.h"
#include "version.h"
#include "word_info.h"
#include "word_index.h"

// getopt(3) state: main() sets opterr, reads each option's argument from
// optarg, and uses optind to find the first non-option argument.
extern "C" {
	extern char*	optarg;
	extern int	optind, opterr;
}

#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

// Forward declarations of the functions defined later in this file.
void		do_file( char const *path );
void		merge_indicies( ostream& );
void		rank_full_index();
void		usage();
void		write_file_index( ostream&, off_t *offset );
void		write_full_index( ostream& );
void		write_partial_index();
void		write_word_index( ostream&, off_t *offset );

string_set	extensions;			// file extensions to index
char const*	me;				// executable name
word_map	words;				// the index being generated
long		total_words;			// words indexed, over all files
long		unique_words;			// distinct words in final index
int		num_tmp_files;			// partial indices written so far
int		verbosity;			// how much to print
int		word_file_file_max = INT_MAX;	// -f: max files a word may be in
int		word_file_percentage_max = 100;	// -p: word is discarded at this %

// Path prefix shared by all partial index files: Tmp_Dir followed by this
// process's ID and a '.'.  The sequence number of each partial index is
// appended to it to form the file's name.
string const	tmp_file_prefix = string( Tmp_Dir ) +
			string( itoa( ::getpid() ) ) + string( "." );

//*****************************************************************************
//
// SYNOPSIS
//
	int main( int argc, char *argv[] )
//
// DESCRIPTION
//
//	Parse the command line, initialize, call other functions ... the
//	usual things that are done in main().
//
// PARAMETERS
//
//	argc	The number of arguments.
//
//	argv	A vector of the arguments; argv[argc] is null.  Aside from
//		the options below, the arguments are the names of the files
//		and directories to be indexed.
//
// SEE ALSO
//
//	Stroustrup, Bjarne.  "The C++ Programming Language, 3rd ed."
//	Addison-Wesley, Reading, MA.  pp. 116-118.
//
//*****************************************************************************
{
	me = ::strrchr( argv[0], '/' );		// determine base name
	me = me ? me + 1 : argv[0];		// of executable

	/////////// Process command-line options //////////////////////////////

	char const *index_file_name = "the.index";

	::opterr = 1;
	for ( int opt;
		(opt = ::getopt( argc, (char**)argv, "e:f:i:lp:v:V" )) != EOF;
	)
		switch ( opt ) {

			case 'e': // Specify filename extension(s) to index.
				extensions.insert( ::optarg );
				break;

			case 'f': // Specify the word/file file maximum.
				word_file_file_max = ::atoi( ::optarg );
				break;

			case 'i': // Specify index file overriding the default.
				index_file_name = ::optarg;
				break;

			case 'l': // Follow symbolic links during indexing.
				follow_symbolic_links = true;
				break;

			case 'p': // Specify the word/file percentage.
				word_file_percentage_max = ::atoi( ::optarg );
				break;

			case 'v': // Specify verbosity level.
				verbosity = ::atoi( ::optarg );
				if ( verbosity < 0 )
					verbosity = 0;
				else if ( verbosity > 3 )
					verbosity = 3;
				break;

			case 'V': // Display version and exit.
				cout << "SWISH++ " << version << endl;
				::exit( 0 );

			case '?': // Bad option.
				usage();
		}

	argc -= ::optind, argv += ::optind;
	if ( !argc )
		usage();

	/////////// Index specified directories and files /////////////////////

	ofstream out( index_file_name );
	if ( !out ) {
		cerr	<< me << ": can not write index to "
			<< index_file_name << endl;
		::exit( 1 );
	}

	time_t time = ::time( 0 );
	while ( *argv ) {
		if ( is_directory( *argv ) )
			do_directory( *argv );
		else
			do_file( *argv );
		++argv;
	}

	if ( num_tmp_files ) {
		if ( words.size() ) {
			//
			// If we created any partial indicies, write the
			// remaining words to their own partial index so the
			// merge code doesn't have a special case.
			//
			write_partial_index();
		}
		merge_indicies( out );
	} else {
		rank_full_index();
		write_full_index( out );
	}

	out.close();

	if ( verbosity ) {
		time = ::time( 0 ) - time;
		cout	<< setfill('0')
			<< "\nIndexing done:\n  "
			<< setw(2) << (time / 60) << ':'
			<< setw(2) << (time % 60)
			<< " elapsed time\n  "
			<< file_info::set_.size() << " files\n  "
			<< total_words << " words, "
			<< unique_words << " unique\n\n"
			<< setfill(' ');
	}

	return 0;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void do_file( char const *file_name )
//
// DESCRIPTION
//
//	Index the words in the given file, but only if its extension is among
//	the specified set.  It will not follow symbolic links unless the -l
//	command-line option was given.  A file is considered to be an HTML
//	file only if its extension is "htm", "html", or "shtml".
//
// PARAMETERS
//
//	file_name	The file to index.
//
//*****************************************************************************
{
	////////// Determine if we should process the file ////////////////////

	if ( !is_plain_file() ||
		is_symbolic_link( file_name ) && !follow_symbolic_links
	)
		return;

	//
	// Check to see if the file name has a '.' in it and that it is not the
	// last character.
	//
	char const *ext = ::strrchr( file_name, '.' );
	if ( !ext || !*++ext )
		return;

	//
	// If the candidate extension contains a '/', then it's really not an
	// extension; rather, it's a file name like: "/a.bizarre/file".
	//
	if ( ::strchr( ext, '/' ) )
		return;

	//
	// Skip the file if the set of acceptable extensions does not contain
	// the candidate.
	//
	if ( !extensions.find( ext ) )
		return;

	file_vector<char> file( file_name );
	if ( !file )
		return;

	if ( verbosity > 2 ) {			// print base name of file
		char const *const slash = ::strrchr( file_name, '/' );
		cout << "  " << ( slash ? slash + 1 : file_name ) << flush;
	}

	bool const is_html =
		!::strcmp( ext, "htm"   ) ||
		!::strcmp( ext, "html"  ) ||
		!::strcmp( ext, "shtml" );

	char const *title = is_html ? grep_title( file ) : 0;
	if ( !title ) {
		//
		// File either isn't HTML or it doesn't have a <TITLE> tag:
		// simply use its file name as its title.
		//
		if ( title = ::strrchr( file_name, '/' ) )
			++title;
		else
			title = file_name;
	}

	////////// Index the file /////////////////////////////////////////////

	file_info *const fi = new file_info( file_name, file.size(), title );

	char word_buf[ Word_Hard_Max_Size + 1 ];
	register char *word;
	int word_len;
	bool in_word = false;

	register file_vector<char>::const_iterator c = file.begin();
	while ( c != file.end() ) {
		register char ch = *c++;

		if ( is_html )
			switch ( ch ) {
				case '<':
					skip_html_tag( c, file.end() );
					continue;
				case '&':
					ch = convert_entity( c, file.end() );
					break;
			}

		////////// Collect a word /////////////////////////////////////

		if ( is_word_char( ch ) ) {
			if ( !in_word ) {
				// start a new word
				word = word_buf;
				word[ 0 ] = ch;
				word_len = 1;
				in_word = true;
				continue;
			}
			if ( word_len < Word_Hard_Max_Size ) {
				// continue same word
				word[ word_len++ ] = ch;
				continue;
			}
			in_word = false;		// too big: skip chars
			while ( c != file.end() && is_word_char( *c++ ) ) ;
			continue;
		}

		if ( !in_word )
			continue;

		////////// Got a word /////////////////////////////////////////

		in_word = false;
		if ( word_len < Word_Hard_Min_Size )
			continue;

		//
		// Strip chars not in Word_End_Chars from end of word.
		//
		for ( register int i = word_len - 1; i >= 0; --i ) {
			if ( !::strchr( Word_End_Chars, tolower( word[ i ] ) ) )
				--word_len;
			else
				break;
		}
		if ( word_len < Word_Hard_Min_Size )
			continue;

		word[ word_len ] = '\0';

		//
		// Strip chars not in Word_Begin_Chars from beginning of word.
		//
		for ( register char const *p = word; *p; ++p ) {
			if ( !::strchr( Word_Begin_Chars, tolower( *p ) ) )
				--word_len, ++word;
			else
				break;
		}
		if ( word_len < Word_Hard_Min_Size )
			continue;

		if ( !is_ok_word( word ) )
			continue;

		////////// Add the word ///////////////////////////////////////

		++fi->num_words_;
		++total_words;

		// Canonicalize to lower-case.
		::transform( word, word + word_len, word, to_lower );

		word_info &info = words[ word ];
		++info.occurrences_;

		if ( !info.files_.empty() ) {
			//
			// We've seen this word before: determine whether we've
			// seen it before in THIS file, and, if so, merely
			// increment the number of occurrences.
			//
			word_info::file &last_file = info.files_.back();
			if ( last_file.index_ == file_info::current_index() ) {
				++last_file.occurrences_;
				continue;
			}
		}

		// First time word occurred in current file.
		info.files_.push_back(
			word_info::file( file_info::current_index() )
		);
	}

	if ( verbosity > 2 )
		cout << " (" << fi->num_words_ << " words)" << endl;

	if ( words.size() >= Word_Threshold )
		write_partial_index();
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline int rank( int file_index, int occurences_in_file, double factor )
//
// DESCRIPTION
//
//	Compute the rank of a word in a file: the natural logarithm of the
//	number of occurrences plus ten, scaled by the given factor and
//	divided by the number of words in the file.  The result is truncated
//	to an integer and is never less than 1.
//
//	This equation was taken from the one used in SWISH-E whose author
//	thinks (?) it is the one taken from WAIS.  I can't find this equation
//	in the reference cited below, although that reference does list a
//	different equation.  But, if it ain't broke, don't fix it.
//
// PARAMETERS
//
//	file_index		Which file we're dealing with, as an index
//				into file_info::set_.
//
//	occurences_in_file	The number of times the word occurs in a given
//				file.
//
//	factor			This should be precomputed to be the value of
//				1000.0 divided by the total number of
//				occurrences across all files.  This number is
//				constant for a given word, hence the
//				precomputation.
//
// RETURN VALUE
//
//	Returns the rank, which is always at least 1.
//
// SEE ALSO
//
//	Salton, Gerard.  "Automatic Text Processing: the transformation,
//	analysis, and retrieval of information by computer."  Addison-Wesley,
//	Reading, MA.  pp. 279-280.
//
//*****************************************************************************
{
	int const raw_rank = int(
		( ::log( occurences_in_file ) + 10 ) * factor
		/ file_info::set_[ file_index ]->num_words_
	);
	if ( raw_rank <= 0 )			// enforce the minimum rank
		return 1;
	return raw_rank;
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline int word_file_percentage( int file_count )
//
// DESCRIPTION
//
//	Compute what percentage of all the files indexed so far the given
//	number of files is.  The division truncates.
//
// PARAMETERS
//
//	file_count	The number of files a word occurs in.
//
// RETURN VALUE
//
//	Returns said percentage.
//
//*****************************************************************************
{
	int const scaled_count = file_count * 100;
	return scaled_count / file_info::set_.size();
}

//*****************************************************************************
//
// SYNOPSIS
//
	void merge_indicies( ostream &o )
//
// DESCRIPTION
//
//	Perform an n-way merge of the partial word index files.  It first
//	determines the number of unique words in all the partial indicies,
//	then merges them all together and performs ranking at the same time.
//
// PARAMETERS
//
//	o	The ostream to write the index to.
//
//*****************************************************************************
{
	vector< file_vector<char> > index( num_tmp_files );
	vector< word_index > words( num_tmp_files );
	vector< word_index::const_iterator > word( num_tmp_files );
	string_set extra_stop_words;
	register int i, j;

	////////// Reopen all the partial indicies ////////////////////////////

	for ( i = 0; i < num_tmp_files; ++i ) {
		string const tmp_file = tmp_file_prefix + itoa( i );
		index[ i ].open( tmp_file.c_str() );
		if ( !index[ i ] ) {
			cerr	<< me << ": can not reopen tmp file "
				<< tmp_file << endl;
			::exit( 2 );
		}
		words[ i ].set_index_file( index[ i ] );
	}

	////////// Must determine the number of unique words first ////////////

	if ( verbosity > 1 )
		cout << "Determining unique words..." << flush;

	for ( i = 0; i < num_tmp_files; ++i ) {
		unique_words += words[ i ].size();
		word[ i ] = words[ i ].begin();
	}
	while ( 1 ) {
		// Find at least two non-exhausted indicies noting the first.
		int n = 0;
		for ( j = 0; j < num_tmp_files; ++j )
			if ( word[ j ] != words[ j ].end() && !n++ )
				i = j;
		if ( n < 2 )
			break;

		// Find the lexographically least word.
		for ( j = i + 1; j < num_tmp_files; ++j ) {
			if ( word[ j ] == words[ j ].end() )
				continue;
			if ( ::strcmp( *word[ j ], *word[ i ] ) < 0 )
				i = j;
		}

		file_list list( word[ i ] );
		int file_count = list.size();

		// See if there are any duplicates and eliminate them.
		for ( j = i + 1; j < num_tmp_files; ++j ) {
			if ( word[ j ] == words[ j ].end() )
				continue;
			if ( !::strcmp( *word[ i ], *word[ j ] ) ) {
				--unique_words;
				file_list list( word[ j ] );
				file_count += list.size();
				++word[ j ];
			}
		}

		// Mark words that occur too frequently.
		if ( file_count > word_file_file_max ) {
			if ( verbosity > 2 )
				cout	<< "\n  \"" << *word[ i ]
					<< "\" discarded (" << file_count
					<< " files)" << flush;
			extra_stop_words.insert( *word[ i ] );
			--unique_words;
		} else {
			int const wfp = word_file_percentage( file_count );
			if ( wfp >= word_file_percentage_max ) {
				if ( verbosity > 2 )
					cout	<< "\n  \"" << *word[ i ]
						<< "\" discarded (" << wfp
						<< "%)" << flush;
				extra_stop_words.insert( *word[ i ] );
				--unique_words;
			}
		}

		++word[ i ];
	}

	////////// Write index file header ////////////////////////////////////

	long const num_files = file_info::set_.size();
	off_t *const word_offset = new off_t[ unique_words ];
	off_t *const file_offset = new off_t[ num_files ];

	o.write( &unique_words, sizeof( unique_words ) );
	streampos const word_offset_pos = o.tellp();
	o.write( word_offset, unique_words * sizeof( word_offset[0] ) );

	o.write( &num_files, sizeof( num_files ) );
	streampos const file_offset_pos = o.tellp();
	o.write( file_offset, num_files * sizeof( file_offset[0] ) );

	////////// Merge the indicies /////////////////////////////////////////

	if ( verbosity > 1 )
		cout << "\nMerging partial indicies..." << flush;

	for ( i = 0; i < num_tmp_files; ++i )		// reset all iterators
		word[ i ] = words[ i ].begin();
	int word_index = 0;
	while ( 1 ) {

		////////// Find the next word /////////////////////////////////

		// Find at least two non-exhausted indicies.
		int n = 0;
		for ( j = 0; j < num_tmp_files; ++j )
			if ( word[ j ] != words[ j ].end() && !n++ )
				i = j;
		if ( n < 2 )
			break;

		// Find the lexographically least word.
		for ( j = i + 1; j < num_tmp_files; ++j ) {
			if ( word[ j ] == words[ j ].end() )
				continue;
			if ( ::strcmp( *word[ j ], *word[ i ] ) < 0 )
				i = j;
		}

		if ( extra_stop_words.find( *word[ i ] ) )
			continue;

		word_offset[ word_index++ ] = o.tellp();
		o << *word[ i ] << '\0';

		////////// Determine total occurrences in all indicies ////////

		int total_occurrences = 0;
		for ( j = i; j < num_tmp_files; ++j ) {
			if ( word[ j ] == words[ j ].end() )
				continue;
			if ( ::strcmp( *word[ i ], *word[ j ] ) )
				continue;

			file_list list( word[ j ] );
			for ( file_list::const_iterator
				file = list.begin(); file != list.end(); ++file
			)
				total_occurrences += file->occurrences;
		}

		////////// Copy all index info and compute ranks //////////////

		double const factor = 1000.0 / total_occurrences;

		for ( j = i; j < num_tmp_files; ++j ) {
			if ( word[ j ] == words[ j ].end() )
				continue;
			if ( ::strcmp( *word[ i ], *word[ j ] ) )
				continue;

			file_list list( word[ j ] );
			for ( file_list::const_iterator
				file = list.begin(); file != list.end(); ++file
			)
				o	<< file->index << ' ' << rank(
						file->index,
						file->occurrences,
						factor
					)
					<< ' ';

			if ( j != i ) ++word[ j ];
		}
		o << '\0';

		++word[ i ];
	}

	////////// Copy remaining words from last non-exhausted index /////////

	for ( j = 0; j < num_tmp_files; ++j ) {
		if ( word[ j ] == words[ j ].end() )
			continue;

		if ( extra_stop_words.find( *word[ i ] ) )
			continue;

		////////// Determine total occurrences in all indicies ////////

		int total_occurrences = 0;
		file_list list( word[ j ] );
		for ( file_list::const_iterator
			file = list.begin(); file != list.end(); ++file
		)
			total_occurrences += file->occurrences;
		double const factor = 1000.0 / total_occurrences;

		////////// Copy all index info and compute ranks //////////////

		while ( word[ j ] != words[ j ].end() ) {

			word_offset[ word_index++ ] = o.tellp();
			o << *word[ j ] << '\0';

			file_list list( word[ j ] );
			for ( file_list::const_iterator
				file = list.begin(); file != list.end(); ++file
			) {
				o	<< file->index << ' ' << rank(
						file->index,
						file->occurrences,
						factor
					)
					<< ' ';
			}

			++word[ j ];
		}
		o << '\0';
	}

	write_file_index( o, file_offset );

	////////// Go back and write the computed offsets /////////////////////

	o.seekp( word_offset_pos );
	o.write( word_offset, unique_words * sizeof( word_offset[0] ) );
	o.seekp( file_offset_pos );
	o.write( file_offset, num_files * sizeof( file_offset[0] ) );

	delete[] word_offset;
	delete[] file_offset;

	////////// Blow away the temporary files //////////////////////////////

	for ( i = 0; i < num_tmp_files; ++i ) {
		string const tmp_file = tmp_file_prefix + itoa( i );
		::unlink( tmp_file.c_str() );
	}

	if ( verbosity > 1 )
		cout << endl;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void rank_full_index()
//
// DESCRIPTION
//
//	Compute the rank of all files for all words in the index.  This
//	function is used only when partial indices are not generated.  Also
//	removes words that occur too frequently, i.e., those that occur in
//	more than word_file_file_max files or in at least
//	word_file_percentage_max percent of them.
//
//*****************************************************************************
{
	if ( verbosity > 1 )
		cout <<	"\nRanking index..." << flush;

	//
	// The iterator is advanced inside the loop rather than in the for
	// statement since erasing a word invalidates the iterator to it.
	//
	word_map::iterator w = words.begin();
	while ( w != words.end() ) {
		word_info &info = w->second;
		double const factor = 1000.0 / info.occurrences_;

		int file_count = 0;
		for ( word_info::file_set::iterator
			file  = info.files_.begin();
			file != info.files_.end(); ++file
		) {
			file->rank_ = rank(
				file->index_, file->occurrences_, factor
			);
			++file_count;
		}

		// Determine whether the word occurs too frequently.
		bool too_frequent = false;
		if ( file_count > word_file_file_max ) {
			if ( verbosity > 2 )
				cout	<< "\n  \"" << w->first
					<< "\" discarded (" << file_count
					<< " files)" << flush;
			too_frequent = true;
		} else {
			int const wfp = word_file_percentage( file_count );
			if ( wfp >= word_file_percentage_max ) {
				if ( verbosity > 2 )
					cout	<< "\n  \"" << w->first
						<< "\" discarded (" << wfp
						<< "%)" << flush;
				too_frequent = true;
			}
		}

		if ( too_frequent ) {
			//
			// Step to the next word BEFORE the erase takes
			// effect: the post-increment hands erase() a copy of
			// the old position.  (This assumes word_map is a
			// node-based container such as map where erasing
			// one element leaves the other iterators valid.)
			//
			words.erase( w++ );
		} else
			++w;
	}

	if ( verbosity > 1 )
		cout <<	endl;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void write_file_index( ostream &o, off_t *offset )
//
// DESCRIPTION
//
//	Write an entry for every file in file_info::set_ to the given ostream,
//	each followed by a null character.  The stream position at which each
//	entry starts is recorded as it goes.
//
// PARAMETERS
//
//	o	The ostream to write the file index to.
//
//	offset	A pointer to an array in which to record the offsets; it must
//		have room for one element per file.
//
//*****************************************************************************
{
	off_t *next_offset = offset;
	file_info::set_type::const_iterator fi = file_info::set_.begin();
	while ( fi != file_info::set_.end() ) {
		*next_offset++ = o.tellp();	// where this entry begins
		o << **fi << '\0';
		++fi;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	void write_full_index( ostream &o )
//
// DESCRIPTION
//
//	Write the complete in-memory index to the given ostream.  The index
//	file is written in such a way so that it can be mmap'd and used
//	instantly with no parsing or other processing.  The format of an
//	index file is:
//
//		long	unique_words;
//		off_t	word_offset[ unique_words ];
//		long	num_files;
//		off_t	file_offset[ num_files ];
//			(word index)
//			(file index)
//
//	A file index is a list of file information.  Each entry is of the
//	form:
//
//		path_name file_size file_title\0
//
//	where the parts are separated by a single space and are terminated by
//	a null character.  (Any spaces after the second one are part of the
//	title.)  For example:
//
//		/path/my.html 1234 Some Title
//
//	Each file_offset points to the first character in a path name.
//
//	The word index is a list of all the words indexed in alphabetical
//	order.  Each entry is of the form:
//
//		word\0file_index rank \0
//		      ^^^^^^^^^^^^^^^^
//	where the word is terminated by a null character and both the
//	file_index and rank are followed by spaces.  The ^'d part is repeated
//	for every file the word is in and terminated by a null character.
//	Each word_offset points to the first character in a word.
//
//	Since the offsets aren't known until the words and files have been
//	written, the two offset tables are first written as placeholders and
//	then overwritten at the end.  As a side effect, the global
//	unique_words is set to the number of words in the index.
//
// PARAMETERS
//
//	o	The ostream to write the index to.
//
//*****************************************************************************
{
	bool const report_progress = verbosity > 1;
	if ( report_progress )
		cout << "Writing index..." << flush;

	unique_words = words.size();
	long const num_files = file_info::set_.size();

	off_t *const word_offset = new off_t[ unique_words ];
	size_t const word_table_size = unique_words * sizeof( off_t );
	off_t *const file_offset = new off_t[ num_files ];
	size_t const file_table_size = num_files * sizeof( off_t );

	//
	// Header: each count is followed by an offset table whose contents
	// for now are mere placeholders; remember where each table is.
	//
	o.write( &unique_words, sizeof( unique_words ) );
	streampos const word_table_pos = o.tellp();
	o.write( word_offset, word_table_size );

	o.write( &num_files, sizeof( num_files ) );
	streampos const file_table_pos = o.tellp();
	o.write( file_offset, file_table_size );

	// Body: writing the two indices computes the real offsets.
	write_word_index( o, word_offset );
	write_file_index( o, file_offset );

	// Revisit the header and replace the placeholders.
	o.seekp( word_table_pos );
	o.write( word_offset, word_table_size );
	o.seekp( file_table_pos );
	o.write( file_offset, file_table_size );

	delete[] word_offset;
	delete[] file_offset;

	if ( report_progress )
		cout << endl;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void write_partial_index()
//
// DESCRIPTION
//
//	Write a partial index to a temporary file.  The format of an index
//	file is:
//
//		long	num_words;
//		off_t	word_offset[ num_words ];
//			(word index)
//
//	The word index is in the same format as the complete index exept that
//	"rank" is "occurrences."
//
//*****************************************************************************
{
	string const tmp_file = tmp_file_prefix + itoa( num_tmp_files++ );
	ofstream o( tmp_file.c_str() );
	if ( !o ) {
		cerr	<< me << ": can not write intermediate file "
			<< tmp_file << endl;
		::exit( 5 );
	}

	if ( verbosity > 1 )
		cout << "\nWriting partial index..." << flush;

	long const num_words = words.size();
	off_t *const word_offset = new off_t[ num_words ];

	// Write dummy data as a placeholder until the offsets are computed.
	o.write( &num_words, sizeof( num_words ) );
	streampos const word_offset_pos = o.tellp();
	o.write( word_offset, num_words * sizeof( word_offset[0] ) );

	write_word_index( o, word_offset );

	// Go back and write the computed offsets.
	o.seekp( word_offset_pos );
	o.write( word_offset, num_words * sizeof( word_offset[0] ) );

	delete[] word_offset;
	words.clear();

	if ( verbosity > 1 )
		cout << '\n' << endl;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void write_word_index( ostream &o, off_t *offset )
//
// DESCRIPTION
//
//	Write every word in the global index to the given ostream.  Each
//	entry is the word followed by a null character, then a "file_index
//	number " pair for every file the word is in, then a final null
//	character.  The number written is the occurrences_ member of each
//	file.  The stream position at which each entry starts is recorded as
//	it goes.
//
// PARAMETERS
//
//	o	The ostream to write the word index to.
//
//	offset	A pointer to an array in which to record the offsets; it must
//		have room for one element per word.
//
//*****************************************************************************
{
	off_t *next_offset = offset;
	for ( word_map::const_iterator
		entry = words.begin(); entry != words.end(); ++entry
	) {
		*next_offset++ = o.tellp();	// where this entry begins
		o << entry->first << '\0';

		word_info::file_set::const_iterator
			file = entry->second.files_.begin();
		while ( file != entry->second.files_.end() ) {
			o << file->index_ << ' ' << file->occurrences_ << ' ';
			++file;
		}

		o << '\0';			// end of this word's files
	}
}

//*****************************************************************************
//
//	Miscellaneous function(s)
//
//*****************************************************************************

// Print the usage message to standard error and exit with status 1; this
// function does not return.
void usage() {
	static char const options_help[] =
		" options:\n"
		" --------\n"
		"  -e ext          : Extension to index\n"
		"  -f file_max     : Word/file maximum\n"
		"  -i index_file   : Name of index file to generate\n"
		"  -l              : Follow symbolic links\n"
		"  -p percent_max  : Word/file percentage\n"
		"  -v verbosity    : Verbosity level [0-3]\n"
		"  -V              : Print version number and exit\n";

	cerr << "usage: " << me << " [options] dir ... file ...\n";
	cerr << options_help;
	::exit( 1 );
}