#!/bin/sh
# Part of the ht://Dig package <https://htdig.sourceforge.net/>
# Copyright (c) 1999-2004 The ht://Dig Group
# For copyright details, see the file COPYING in your distribution
# or the GNU Library General Public License (LGPL) version 2 or later
# <http://www.gnu.org/copyleft/lgpl.html>
#
# $Id: t_validwords,v 1.2 2004/05/28 13:15:30 lha Exp $
#
# try COMMENT QUERY [PATTERN...]
#
# Runs "$htsearch -c $config QUERY", saving its stdout in the file named by
# $tmp (stderr is discarded), then greps that file once for every PATTERN.
# For each PATTERN that grep does not find, htsearch is run again with -vv
# (stdout discarded), a diagnostic line is echoed, and fail is called with
# the command line and COMMENT.
#
# Reads the globals htsearch, config and tmp, and calls fail, none of which
# are defined inside this function.  Assigns the globals comment, query and
# pattern (plain sh has no "local").
try() {
    comment="$1"
    shift
    query="$1"
    shift
    # $config and $tmp are quoted so a path containing whitespace stays a
    # single word.  $htsearch is left unquoted exactly as before, in case
    # the harness stores a command plus arguments in it.
    $htsearch -c "$config" "$query" > "$tmp" 2> /dev/null
    # "for pattern" with no "in" list walks the remaining positional
    # parameters, i.e. every PATTERN argument.
    for pattern
    do
        # -e keeps a pattern that begins with "-" from being read as an option.
        if grep -e "$pattern" "$tmp" > /dev/null
        then :
        else
            $htsearch -vv -c "$config" "$query" > /dev/null
            echo "Output doesn't match \"$pattern\""
            fail "$htsearch -c $config '$query' >> $tmp --
$comment"
        fi
    done
}
# Set the action flag, then source the shared helper file.
# NOTE(review): presumably ./test_functions reads test_functions_action to
# start the test web server and defines testdir, htdig, htpurge, htsearch,
# set_attr and fail, none of which are defined in this script -- confirm there.
test_functions_action=--start-apache
. ./test_functions
# Working copy of the configuration; every htdig/htpurge/htsearch call in
# this script is given this path with -c.
config=$testdir/conf/htdig.conf.tmp
# File that try() fills with htsearch output and then greps.
# NOTE(review): the name is predictable ($$ is just the shell's PID) and
# nothing in this script removes the file afterwards.
tmp=/tmp/t_htsearch$$
# set up config file with chosen non-default values
# Start from a fresh copy of the stock configuration.
cp $testdir/conf/htdig.conf $config
# NOTE(review): set_attr is defined outside this file; presumably it sets the
# named attribute in $config -- confirm in test_functions.
set_attr allow_numbers "false"
set_attr minimum_word_length "3"
set_attr maximum_word_length "10"
set_attr translate_latin1 "0"
set_attr valid_punctuation "."
set_attr extra_word_characters ""
#set_attr locale fr
# Run htdig with the settings above; "$@" forwards this script's own
# command-line arguments ahead of the fixed options.
$htdig "$@" -t -i -c $config || fail "Couldn't dig"
# These two attributes are changed only after the dig, just before htpurge runs.
set_attr remove_bad_urls "false"
set_attr remove_unretrieved_urls "true"
# htpurge's stdout is written to a file literally named "tmp1" in the current
# directory (not to $tmp); nothing in this script reads or removes it.
$htpurge -vv -c $config > tmp1 || fail "Couldn't purge"
# How can I check that unretrieved urls have been removed, but bad ones haven't?
# First round of queries, run against the index built with the first set of
# attribute values.  Each call is: try TITLE QUERY PATTERN...; every PATTERN
# must be found in the htsearch output.

# Number handling (allow_numbers was set to "false").
try "Search for '2001' without allow_numbers" "words=2001" \
    'No matches'
try "Search for '0b3' without allow_numbers" "words=0b3" \
    '1 matches' 'bad_local.htm' '3.2.<strong>0b3</strong>'
try "Search for '3.2.0b3' without allow_numbers" "words=3.2.0b3" \
    '1 matches' 'bad_local.htm' '<strong>3.2.0b3</strong>'
try "Search for '320b3' without allow_numbers" "words=320b3" \
    '1 matches' 'bad_local.htm'

# "." handling (valid_punctuation was ".", extra_word_characters was empty).
try 'Search for "archive." without . in extra_word_characters' 'words=archive.' \
    '1 matches' 'bad_local.htm' '<strong>archive</strong>.'
try 'Search for "archive" without . in extra_word_characters' 'words=archive' \
    '1 matches' 'bad_local.htm' '<strong>archive</strong>.'

# "/" is not in valid_punctuation for this round.
try "Search for 'graduateprofessional' which should not match a slash" "words=graduateprofessional" \
    'No matches'

# Word-length limit (minimum_word_length was "3").
try "Search for 'now' with minimum_word_length=3" "words=now" \
    '1 matches' 'bad_local.htm'

# Latin-1 handling (translate_latin1 was "0").
# NOTE(review): 'franais' and 'qubec' look like words whose accented
# character was lost somewhere before this copy of the file -- compare with
# the upstream script before relying on these two cases.
try "Search for 'franais' without translate_latin1" "words=franais" \
    '1 matches' 'site4.html' '<strong>franais</strong>'
try "Search for 'qubec' without translate_latin1" "words=qubec" \
    'No matches'

# bad_word_list has not been overridden by this script at this point.
try "Search for 'with' with default bad_word_list" "words=with" \
    'No matches'
try "Search for 'technical' with default bad_word_list" "words=technical" \
    '1 matches' 'site%201.html'
# Second configuration: change every attribute set for the first round and
# additionally point bad_word_list at a file under $testdir, then re-run
# htdig and htpurge.
set_attr allow_numbers "true"
set_attr minimum_word_length "4"
set_attr maximum_word_length "13"
set_attr translate_latin1 "yes"
set_attr valid_punctuation "/"
set_attr extra_word_characters '.\\\$' # string is .\$, chars: .$
set_attr bad_word_list "${testdir}/bad_word_list"
#set_attr locale fr
# Same htdig invocation as in the first round, now with the new settings.
$htdig "$@" -t -i -c $config || fail "Couldn't dig"
# The two purge attributes are swapped relative to the first round.
set_attr remove_bad_urls "true"
set_attr remove_unretrieved_urls "false"
# htpurge's stdout is written to a file literally named "tmp" in the current
# directory (not to $tmp); nothing in this script reads or removes it.
$htpurge -vv -c $config > tmp || fail "Couldn't purge"
# How can I check that bad urls have been removed, but unretrieved ones haven't?
# Second round of queries, run against the index rebuilt with the second set
# of attribute values.  Each call is: try TITLE QUERY PATTERN...

# Number handling (allow_numbers is now "true").
try "Search for '2001' " "words=2001" \
    '1 matches' '1995-<strong>2001</strong>'

# "." and "$" are now extra_word_characters; "/" is now valid_punctuation.
try "Search for '9.00'" "words=9.00" \
    '1 matches' 'site4.html' '<strong>9.00</strong>'
try "Search for '9/00' -- checking . is not just valid_punctuation" "words=9/00" \
    'No matches'
try 'Search for "archive." with . in extra_word_characters' 'words=archive.' \
    '1 matches' 'bad_local.htm' '<strong>archive.</strong>'
try 'Search for "archive" with . in extra_word_characters' 'words=archive' \
    'No matches'
try 'Search for "$195"' 'words=$195' \
    '1 matches' 'site4.html' '<strong>$195</strong>,000'
try "Search for 'graduateprofessional' which should match a slash" "words=graduateprofessional" \
    '1 matches' 'site4.html' '<strong>graduate/professional</strong>'

# Word-length limits (maximum_word_length is now "13").
# Disabled variant of the next case that also expected the highlighted text:
#try "Search for 'graduateprofexyz' which should match a truncated word" \
# "words=graduateprofexyz" \
# '1 matches' 'site4.html' '<strong>graduate/professional</strong>'
try "Search for 'graduateprofexyz' which should match a truncated word" "words=graduateprofexyz" \
    '1 matches' 'site4.html'
try "Search for 'graduateprofxyz' which should fail to match a truncated word" "words=graduateprofxyz" \
    'No matches'

# minimum_word_length is now "4".
try "Search for 'part' with minimum_word_length=4" "words=part" \
    '2 matches' 'bad_local.htm' 'script.html'
try "Search for 'now' with minimum_word_length=4" "words=now" \
    'No matches'

# Latin-1 handling (translate_latin1 is now "yes").
# NOTE(review): 'franais' and 'qubec' in the titles and queries look like
# words whose accented character was lost somewhere before this copy of the
# file -- compare with the upstream script.
try "Search for 'franais' with translate_latin1" "words=franais" \
    '1 matches' 'site4.html' '<strong>français</strong>'
try "Search for 'qubec' with translate_latin1" "words=qubec" \
    '1 matches' 'site4.html' '<strong>Québec</strong>'

# bad_word_list now points at ${testdir}/bad_word_list.
try "Search for 'with' with new bad_word_list" "words=with" \
    '4 matches' 'bad_local.htm' 'script.html' 'site4.html' 'site%201.html'
try "Search for 'technical' with new bad_word_list" "words=technical" \
    'No matches'
# Source the helper file a second time, now with the action flag set to
# --stop-apache.
# NOTE(review): presumably this shuts down the server started at the top of
# the script -- confirm in test_functions.
test_functions_action=--stop-apache
. ./test_functions
# End of t_validwords.