##############################################################################
# The Faq-O-Matic is Copyright 1997 by Jon Howell, all rights reserved.      #
#                                                                            #
# This program is free software; you can redistribute it and/or              #
# modify it under the terms of the GNU General Public License                #
# as published by the Free Software Foundation; either version 2             #
# of the License, or (at your option) any later version.                     #
#                                                                            #
# This program is distributed in the hope that it will be useful,            #
# but WITHOUT ANY WARRANTY; without even the implied warranty of             #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
# GNU General Public License for more details.                               #
#                                                                            #
# You should have received a copy of the GNU General Public License          #
# along with this program; if not, write to the Free Software                #
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.#
#                                                                            #
# Jon Howell can be contacted at:                                            #
# 6211 Sudikoff Lab, Dartmouth College                                       #
# Hanover, NH  03755-3510                                                    #
# jonh@cs.dartmouth.edu                                                      #
#                                                                            #
# An electronic copy of the GPL is available at:                             #
# http://www.gnu.org/copyleft/gpl.html                                       #
#                                                                            #
##############################################################################

use strict;

### SearchMod.pm
###
### Support for search functions
###

package FAQ::OMatic::SearchMod;

use FAQ::OMatic::Item;
use FAQ::OMatic::Words;
use FAQ::OMatic;

sub openWordDB {
	return if (defined FAQ::OMatic::getLocal('wordDB'));
	my $wordDBtoken;
	if (FAQ::OMatic::usedbm()) {
		my %wordDB;
		if (not dbmopen (%wordDB, "$FAQ::OMatic::Config::metaDir/search", 0400)) {
			FAQ::OMatic::gripe('abort', "Can't open dbm search database. "
				."Have you run buildSearchDB? (Should I?)");
		}
		$wordDBtoken = \%wordDB;
	} else {
		if (not open(OFFSETFILE, "<$FAQ::OMatic::Config::metaDir/search.offset")) {
			FAQ::OMatic::gripe('abort', "Can't open search.offset. "
				."Have you run buildSearchDB? (Should I?)");
		}
		$wordDBtoken = "files_are_open_yahoo!";
	}
	if (not open(WORDSFILE, "<$FAQ::OMatic::Config::metaDir/search.words")) {
		FAQ::OMatic::gripe('abort', "Can't open search.words. "
			."Have you run buildSearchDB? (Should I?)");
	}
	if (not open(INDEXFILE, "<$FAQ::OMatic::Config::metaDir/search.index")) {
		FAQ::OMatic::gripe('abort', "Can't open search.index. "
			."Have you run buildSearchDB? (Should I?)");
	}
	FAQ::OMatic::setLocal('wordDB', $wordDBtoken);
}

sub closeWordDB {
	my $wordDB = FAQ::OMatic::getLocal('wordDB');
	if (FAQ::OMatic::usedbm()) {
		dbmclose %{$wordDB};
		undef %{$wordDB};
	} else {
		close OFFSETFILE;
	}
	close WORDSFILE;
	close INDEXFILE;
}

# linear scan of .offset file, looking for $word. Pretty slow,
# unless your dbm implementation is somehow very slow and broken,
# as on sourceforge.
sub scanOffsets {
	my $word = shift;
	seek (OFFSETFILE, 0, 0);
	my $line;
	while ($line = <OFFSETFILE>) {
		chomp $line;
		my ($fileWord, $pair) = split(' ', $line, 2);
		if ($fileWord eq $word) {
			#FAQ::OMatic::gripe('debug', "found pair: $line");
			return $pair;
		}
	}
	return undef;
}

sub getIndices {
	my $word = shift;
	my $pair;
	if (FAQ::OMatic::usedbm()) {
		my $wordDB = FAQ::OMatic::getLocal('wordDB');
		$pair = $wordDB->{$word};
	} else {
		$pair = scanOffsets($word);
	}

	# returns indexseek,wordseek pair
	# THANKS to Vicki Brown <vlb@cfcl.com> and jon * <jon@clearink.com>
	# for reporting unitialized value errors in this code.
	return defined($pair)
		? split(' ', $pair)
		: (undef,undef);
}

sub getWordClass {
	my $word = shift;
	my @wordclass = ();

	openWordDB();
	
	my ($indexseek, $wordseek) = getIndices($word);
	#FAQ::OMatic::gripe('debug', "got seeks $indexseek and $wordseek for $word");

	if (defined $indexseek) {
		#grab all words in wordsfile with $word as a prefix
		seek WORDSFILE, $wordseek, 0;
		while (<WORDSFILE>) {
			chomp;
			if (m/^$word/) {
				push @wordclass, $_;
			} else {
				last;
			}
		}
	}

	return \@wordclass;
}

sub getMatchesForClass {
	my $classref = shift;	# array ref for a class of "identical" words
	my %files;

	my $word;
	foreach $word (@{$classref}) {
		my ($indexseek,$wordseek) = getIndices($word);
		next if (not defined $indexseek);
		seek INDEXFILE, $indexseek, 0;
		while (<INDEXFILE>) {
			chomp;
			last if (m/^END$/);
			$files{$_}=1;
		}
	}

	my @matches = sort keys %files;
	return \@matches;
}

sub getMatchesForSet {
	my $params = shift;
	my $setref = $params->{'_searchArray'};
							# array ref for complete set of user words to search

	$params->{'_minMatches'} = 'all' if ($params->{'_minMatches'} eq '');
	my $minhits = $params->{'_minMatches'};
							# we return only files with at least this many hits
	if ($minhits eq 'all') {	# convert symbolic hits to a number
		$minhits = scalar(@{$setref});
	}
	$params->{'_actualMatches'} = $minhits;

	my %accumulator=();
	my @hitfiles=();

	my ($word, $file);
	foreach $word (@{$setref}) {
		my $classref = getWordClass($word);
		my $matches = getMatchesForClass($classref);
		foreach $file (@{$matches}) {
			$accumulator{$file}++;
		}
	}

	foreach $file (sort keys %accumulator) {
		if ($accumulator{$file} >= $minhits) {
			push @hitfiles, $file;
		}
	}

	return \@hitfiles;
}

sub convertSearchParams {
	my $params = shift;
	my $pattern;

	# given a user-input search string, we break it into "legal" words
	# and store it in another parameter.
	
	$pattern = $params->{'_search'};
	my @patternwords = FAQ::OMatic::Words::getWords( $pattern );

	$params->{'_searchArray'} = \@patternwords;
}

sub addNewFiles {
	my $wordset = shift;	#ary ref
	my $fileset = shift;	# hash ref -- where to add results
	my $words = {};

	# Get the list of files touched since last searchDB build
	if (not open HINTS, "<$FAQ::OMatic::Config::metaDir/searchHints") {
		# sorry, can't help ya
		return;
	}
	my @touchedFiles = <HINTS>;
	close HINTS;

	# index each item
	my $filename;
	my $item;
	foreach $filename (@touchedFiles) {
		chomp $filename;
		$item = new FAQ::OMatic::Item($filename);
		$item->extractWords($words);
	}

	# for every word in the wordset, add all the files it appears
	# in to the fileset passed to us.
	my $word;
	foreach $word (@{$wordset}) {
		if ($words->{$word}) {
			foreach $filename (keys %{$words->{$word}}) {
				$fileset->{$filename} = 1;
			}
		}
	}

	# notice that if there were suffixes of the user's requested words
	# in the new content that weren't in the system anywhere when the
	# searchDB was built, then those suffixes won't be in the wordset,
	# and the search will miss them. Hey, wah, this is better than
	# missing ALL the new content, okay? :v)

	# this also screws up the counts of how many matches this file
	# had (since it could contribute matches from the searchDB lookup
	# AND the newFiles lookup), so I'm going to leave it turned off
	# for now. Rats.
}

sub getRecentSet {
	my $params = shift;
	my $recentList = [];

	my $durationdays = $params->{'_duration'};
		# used directly to compare against perl's floating-point -M file test
	my $then = time() - $durationdays*24*60*60;
		# Used to compare against LastModifiedSecs field.
		# By 'days' we mean 24-hour periods, not calendar days.
		# (In the US, for example, there is a 23-hour calendar day in
		# April and a 25-hour one in the fall, what, in October? for daylight
		# savings time.)

	my $filei;
	foreach $filei (FAQ::OMatic::getAllItemNames()) {
		# use file time as a hint for which items we even need to open up.
		next if (-M "$FAQ::OMatic::Config::itemDir/$filei" >= $durationdays);
		# ...but only trust LastModifiedSecs field for final say on mod time.
		my $item = new FAQ::OMatic::Item($filei);
		my $lm = $item->{'LastModifiedSecs'};
		if ($lm > $then) {
			push @{$recentList}, $filei;
		}
	}

	return $recentList;
}

1;
