File: SearchMod.pm

package info (click to toggle)
faqomatic 2.721-9.1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 1,996 kB
  • ctags: 548
  • sloc: perl: 13,356; sh: 69; makefile: 47
file content (318 lines) | stat: -rw-r--r-- 9,612 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
##############################################################################
# The Faq-O-Matic is Copyright 1997 by Jon Howell, all rights reserved.      #
#                                                                            #
# This program is free software; you can redistribute it and/or              #
# modify it under the terms of the GNU General Public License                #
# as published by the Free Software Foundation; either version 2             #
# of the License, or (at your option) any later version.                     #
#                                                                            #
# This program is distributed in the hope that it will be useful,            #
# but WITHOUT ANY WARRANTY; without even the implied warranty of             #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
# GNU General Public License for more details.                               #
#                                                                            #
# You should have received a copy of the GNU General Public License          #
# along with this program; if not, write to the Free Software                #
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.#
#                                                                            #
# Jon Howell can be contacted at:                                            #
# 6211 Sudikoff Lab, Dartmouth College                                       #
# Hanover, NH  03755-3510                                                    #
# jonh@cs.dartmouth.edu                                                      #
#                                                                            #
# An electronic copy of the GPL is available at:                             #
# http://www.gnu.org/copyleft/gpl.html                                       #
#                                                                            #
##############################################################################

use strict;

### SearchMod.pm
###
### Support for search functions
###

package FAQ::OMatic::SearchMod;

use FAQ::OMatic::Item;
use FAQ::OMatic::I18N;
use FAQ::OMatic::Words;
use FAQ::OMatic;

sub openWordDB {
	return if (defined FAQ::OMatic::getLocal('wordDB'));
	my $wordDBtoken;
	if (FAQ::OMatic::usedbm()) {
		my %wordDB;
		if (not dbmopen (%wordDB, "$FAQ::OMatic::Config::metaDir/search", 0400)) {
			FAQ::OMatic::gripe('abort', "Can't open dbm search database. "
				."Have you run buildSearchDB? (Should I?)");
		}
		$wordDBtoken = \%wordDB;
	} else {
		if (not open(OFFSETFILE, "<$FAQ::OMatic::Config::metaDir/search.offset")) {
			FAQ::OMatic::gripe('abort', "Can't open search.offset. "
				."Have you run buildSearchDB? (Should I?)");
		}
		$wordDBtoken = "files_are_open_yahoo!";
	}
	if (not open(WORDSFILE, "<$FAQ::OMatic::Config::metaDir/search.words")) {
		FAQ::OMatic::gripe('abort', "Can't open search.words. "
			."Have you run buildSearchDB? (Should I?)");
	}
	if (not open(INDEXFILE, "<$FAQ::OMatic::Config::metaDir/search.index")) {
		FAQ::OMatic::gripe('abort', "Can't open search.index. "
			."Have you run buildSearchDB? (Should I?)");
	}
	FAQ::OMatic::setLocal('wordDB', $wordDBtoken);
}

sub closeWordDB {
	my $wordDB = FAQ::OMatic::getLocal('wordDB');
	if (FAQ::OMatic::usedbm()) {
		dbmclose %{$wordDB};
		undef %{$wordDB};
	} else {
		close OFFSETFILE;
	}
	close WORDSFILE;
	close INDEXFILE;
}

# linear scan of .offset file, looking for $word. Pretty slow,
# unless your dbm implementation is somehow very slow and broken,
# as on sourceforge.
sub scanOffsets {
	my $word = shift;
	seek (OFFSETFILE, 0, 0);
	my $line;
	# THANKS to Gary.Frost@ubsw.com for reporting the "Value of
	# <HANDLE> construct can be "0"" error that occurs on his version
	# of perl; fixed here and elsewhere.
	while (defined($line = <OFFSETFILE>)) {
		chomp $line;
		my ($fileWord, $pair) = split(' ', $line, 2);
		if ($fileWord eq $word) {
			#FAQ::OMatic::gripe('debug', "found pair: $line");
			return $pair;
		}
	}
	return undef;
}

sub getIndices {
	my $word = shift;
	my $pair;
	if (FAQ::OMatic::usedbm()) {
		my $wordDB = FAQ::OMatic::getLocal('wordDB');
		$pair = $wordDB->{$word};
	} else {
		$pair = scanOffsets($word);
	}

	# returns indexseek,wordseek pair
	# THANKS to Vicki Brown <vlb@cfcl.com> and jon * <jon@clearink.com>
	# for reporting unitialized value errors in this code.
	return defined($pair)
		? split(' ', $pair)
		: (undef,undef);
}

sub getWordClass {
	my $word = shift;
	my @wordclass = ();

	openWordDB();
	
	my ($indexseek, $wordseek) = getIndices($word);
	#FAQ::OMatic::gripe('debug', "got seeks $indexseek and $wordseek for $word");

	if (defined $indexseek) {
		#grab all words in wordsfile with $word as a prefix
		seek WORDSFILE, $wordseek, 0;
		while (defined($_ = <WORDSFILE>)) {
			chomp;
			if (m/^$word/) {
				push @wordclass, $_;
			} else {
				last;
			}
		}
	}

	return \@wordclass;
}

sub getMatchesForClass {
	my $classref = shift;	# array ref for a class of "identical" words
	my %files;

	my $word;
	foreach $word (@{$classref}) {
		my ($indexseek,$wordseek) = getIndices($word);
		next if (not defined $indexseek);
		seek INDEXFILE, $indexseek, 0;
		while (defined($_ = <INDEXFILE>)) {
			chomp;
			last if (m/^END$/);
			$files{$_}=1;
		}
	}

	my @matches = sort keys %files;
	return \@matches;
}

sub getMatchesForSet {
	my $params = shift;
	my $setref = $params->{'_searchArray'};
							# array ref for complete set of user words to search

	$params->{'_minMatches'} = 'all' if ($params->{'_minMatches'} eq '');
	my $minhits = $params->{'_minMatches'};
							# we return only files with at least this many hits
	if ($minhits eq 'all') {	# convert symbolic hits to a number
		$minhits = scalar(@{$setref});
	}
	$params->{'_actualMatches'} = $minhits;

	my %accumulator=();
	my @hitfiles=();

	my ($word, $file);
	foreach $word (@{$setref}) {
		my $classref = getWordClass($word);
		my $matches = getMatchesForClass($classref);
		foreach $file (@{$matches}) {
			$accumulator{$file}++;
		}
	}

	foreach $file (sort keys %accumulator) {
		if ($accumulator{$file} >= $minhits) {
			push @hitfiles, $file;
		}
	}

	return \@hitfiles;
}

sub convertSearchParams {
	my $params = shift;
	my $pattern;
        my $encode_lang = FAQ::OMatic::I18N::language();

	# given a user-input search string, we break it into "legal" words
	# and store it in another parameter.
	
	$pattern = $params->{'_search'};
	if($encode_lang eq "ja_JP.EUC") {
	    require NKF; import NKF;
	    $pattern = nkf('-e', $pattern);
        }
	my @patternwords = FAQ::OMatic::Words::getWords( $pattern );

	$params->{'_searchArray'} = \@patternwords;
}

sub addNewFiles {
	my $wordset = shift;	#ary ref
	my $fileset = shift;	# hash ref -- where to add results
	my $words = {};

	# Get the list of files touched since last searchDB build
	if (not open HINTS, "<$FAQ::OMatic::Config::metaDir/searchHints") {
		# sorry, can't help ya
		return;
	}
	my @touchedFiles = <HINTS>;
	close HINTS;

	# index each item
	my $filename;
	my $item;
	foreach $filename (@touchedFiles) {
		chomp $filename;
		$item = new FAQ::OMatic::Item($filename);
		$item->extractWords($words);
	}

	# for every word in the wordset, add all the files it appears
	# in to the fileset passed to us.
	my $word;
	foreach $word (@{$wordset}) {
		if ($words->{$word}) {
			foreach $filename (keys %{$words->{$word}}) {
				$fileset->{$filename} = 1;
			}
		}
	}

	# notice that if there were suffixes of the user's requested words
	# in the new content that weren't in the system anywhere when the
	# searchDB was built, then those suffixes won't be in the wordset,
	# and the search will miss them. Hey, wah, this is better than
	# missing ALL the new content, okay? :v)

	# this also screws up the counts of how many matches this file
	# had (since it could contribute matches from the searchDB lookup
	# AND the newFiles lookup), so I'm going to leave it turned off
	# for now. Rats.
}

sub getRecentSet {
	my $params = shift;
	my $recentList = [];

	my $durationdays = $params->{'_duration'};
		# used directly to compare against perl's floating-point -M file test
	my $then = time() - $durationdays*24*60*60;
		# Used to compare against LastModifiedSecs field.
		# By 'days' we mean 24-hour periods, not calendar days.
		# (In the US, for example, there is a 23-hour calendar day in
		# April and a 25-hour one in the fall, what, in October? for daylight
		# savings time.)

	my $filei;
	foreach $filei (FAQ::OMatic::getAllItemNames()) {
		# use file time as a hint for which items we even need to open up.
		next if (-M "$FAQ::OMatic::Config::itemDir/$filei" >= $durationdays);
		# ...but only trust LastModifiedSecs field for final say on mod time.
		my $item = new FAQ::OMatic::Item($filei);
		my $lm = $item->{'LastModifiedSecs'} || 0;
		if ($lm > $then) {
			push @{$recentList}, $filei;
		}
	}

	return $recentList;
}

# reasonable text for 'n' days
my %dayMap = (
	0 => gettext("zero days"),
	1 => gettext("day"),
	2 => gettext("two days"),
	7 => gettext("week"),
	14 => gettext("fortnight"),
	31 => gettext("month"), # (31? a month, give or take. :v)
	92 => gettext("three months"),
	184 => gettext("six months"),
	366 => gettext("year")
);

sub getRecentMap {
	# get a copy of the day map (except for 0)
	# for use in creating the recent form
	my %recentMap = %dayMap;
	delete $recentMap{0};
	return \%recentMap;
}

sub textDays {
	my $duration = shift || 0;
	my $textDayStr = $dayMap{$duration} || $duration." ".gettext("days");
	return $textDayStr;
}

1;