File: HtURLSeedScore.cc

package info (click to toggle)
htdig 1%3A3.2.0b6-3.1etch1
  • links: PTS
  • area: main
  • in suites: etch
  • size: 14,936 kB
  • ctags: 9,369
  • sloc: ansic: 49,626; cpp: 46,470; sh: 23,053; xml: 4,180; perl: 2,543; makefile: 868; php: 79; asm: 14
file content (215 lines) | stat: -rw-r--r-- 5,481 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
//
// HtURLSeedScore.cc
//
// URLSeedScore:
//	Holds a list of configured adjustments to be applied on a given
//	score and given URL.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 2000-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: HtURLSeedScore.cc,v 1.6 2004/05/28 13:15:24 lha Exp $

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "StringList.h"
#include "HtRegex.h"
#include "HtURLSeedScore.h"

#include <stdio.h>
#include <ctype.h>

// This class is only used in private members of URLSeedScore.
// The OO-right thing would be to nest this inside the private
// declaration of URLSeedScore, but that would cause portability
// problems according to
// <URL:http://www.mozilla.org/hacking/portable-cpp.html#inner_classes>.

class ScoreAdjustItem : public Object
{
public:
    // Construct from a string applicable to StringMatch, and a string to
    // parse for a formula.
    ScoreAdjustItem(String &, String &);

    ~ScoreAdjustItem();

    // Does this item match?
    inline bool Match(const String &s) { return match.match(s, 1, 0) != 0; }

    // Return the argument adjusted according to this item.
    double adjust_score(double orig)
    { return orig*my_mul_factor + my_add_constant; }

    // Error in parsing?  Message given here if non-empty string.
    String& ErrMsg() { return myErrMsg; }

private:
    double my_add_constant;
    double my_mul_factor;
    HtRegex match;

    static String myErrMsg;

    // These member functions are not supposed to be implemented, but
    // mentioned here as private so the compiler will not generate them if
    // someone puts in buggy code that would use them.
    ScoreAdjustItem();
    ScoreAdjustItem(const ScoreAdjustItem &);
    void operator= (const ScoreAdjustItem &);
};

// Definition of the static member myErrMsg, initially the empty string.
// Being static, this one String is shared by all ScoreAdjustItem objects
// rather than stored per instance; ErrMsg() hands out a reference to it.
String ScoreAdjustItem::myErrMsg("");

// Build an adjustment rule from a URL pattern and a formula string.
//
// url_regex: '|'-separated list of patterns; it is split on '|' and the
//            resulting list is handed to HtRegex::setEscaped.
// formula:   text of the form ([*]N[ ]*)?[+]?M, i.e. an optional "*N"
//            multiplier followed by an optional additive constant M.
//            At least one of the two parts must be present.
//
// On a malformed formula the shared error text returned by ErrMsg() is
// set and the item keeps the identity adjustment (factor 1, constant 0).
// On success ErrMsg() is left empty.
ScoreAdjustItem::ScoreAdjustItem(String &url_regex, String &formula)
    // Start from the identity adjustment so the members never hold
    // indeterminate values, not even after an early error return.
    : my_add_constant(0),
      my_mul_factor(1)
{
    double mul_factor = 1;
    double add_constant = 0;
    bool factor_found = false;
    bool constant_found = false;
    int chars_so_far;

    // myErrMsg is a static member: clear whatever an earlier item left
    // there, so that ErrMsg() reports on this construction only.
    myErrMsg = "";

    StringList l(url_regex.get(), '|');
    match.setEscaped(l);

    // FIXME: Missing method to check if the regex was in error.
    //	myErrMsg = form("%s is not a valid regex", url_regex.get());

    char *s = formula.get();

    // Parse the ([*]N[ ]*)?[+]?M format.
    if (s[0] == '*')
    {
	// Skip past the '*'.
	s++;

	// There is a mul_factor.  Let's parse it.  A failed '%lf' shows up
	// as a return value other than 1 and leaves chars_so_far at 0.
	chars_so_far = 0;
	if (sscanf(s, "%lf%n", &mul_factor, &chars_so_far) != 1
	    || chars_so_far == 0)
	{
	    // Report the whole formula, like the other error paths do,
	    // rather than the remainder after the '*'.
	    myErrMsg = form("%s is not a valid adjustment formula",
			    formula.get());
	    return;
	}

	// Skip past the number.
	s += chars_so_far;

	// Skip any whitespaces.  The cast keeps isspace() well-defined for
	// characters with a negative plain-char value.
	while (isspace(static_cast<unsigned char>(*s)))
	    s++;

	// Eat any plus-sign; it's redundant if alone, and may come before a
	// minus.
	if (*s == '+')
	    s++;

	factor_found = true;
    }

    // If there's anything here, it must be the additive constant.
    if (*s)
    {
	// Same failure detection as above; in addition the number must
	// extend to the end of the input.
	chars_so_far = 0;
	if (sscanf(s, "%lf%n", &add_constant, &chars_so_far) != 1
	    || chars_so_far == 0
	    || s[chars_so_far] != 0)
	{
	    myErrMsg = form("%s is not a valid adjustment formula",
			    formula.get());
	    return;
	}

	constant_found = true;
    }

    // Either part must be there.
    if (!factor_found && !constant_found)
    {
	myErrMsg = form("%s is not a valid formula", formula.get());
	return;
    }

    // Only a completely parsed formula replaces the identity adjustment.
    my_add_constant = add_constant;
    my_mul_factor = mul_factor;
}

// Nothing to release explicitly: the body is empty and the data members
// are cleaned up by their own destructors.
ScoreAdjustItem::~ScoreAdjustItem()
{
}

// Build the adjustment list from the "url_seed_score" configuration
// attribute.
//
// The attribute value is split on tabs, spaces, CR and LF and must then
// consist of (URL pattern, formula) pairs.  Each pair becomes one
// ScoreAdjustItem appended to myAdjustmentList, in configuration order.
//
// Errors are not thrown: on an odd number of items, or on the first
// formula that fails to parse, myErrMsg is set and construction stops.
// Items accepted before the error stay in the list.
URLSeedScore::URLSeedScore(Configuration &config)
{
    // String literals are const; do not bind them to a plain char *.
    const char *config_item = "url_seed_score";

    StringList sl(config[config_item], "\t \r\n");

    myAdjustmentList = new List();

    if (sl.Count() % 2)
    {
	myErrMsg = form("%s is not a list of pairs (odd number of items)",
			config_item);

	// We *could* continue, but that just means the error will be harder
	// to find, unless someone actually sees the error message.
	return;
    }

    // Parse each as in TemplateList::createFromString.
    for (int i = 0; i < sl.Count(); i += 2)
    {
	String url_regex = sl[i];
	String adjust_formula = sl[i+1];

	ScoreAdjustItem *adjust_item
	    = new ScoreAdjustItem(url_regex, adjust_formula);

	if (adjust_item->ErrMsg().length() != 0)
	{
	    // No point in continuing beyond the error; we might just
	    // overwrite the first error.
	    myErrMsg = form("While parsing %s: %s",
			    config_item,
			    adjust_item->ErrMsg().get());

	    // The rejected item is never handed to the list, so it has to
	    // be freed here or it would leak.  The message has already
	    // been copied into myErrMsg above.
	    delete adjust_item;
	    return;
	}

	myAdjustmentList->Add(adjust_item);
    }
}

// Releases the List allocated in the constructor.
// NOTE(review): presumably List's destructor also destroys the
// ScoreAdjustItem objects added to it -- confirm against List.
URLSeedScore::~URLSeedScore()
{
    delete myAdjustmentList;
}

double
URLSeedScore::noninline_adjust_score(double orig_score, const String &url)
{
    List *adjlist = myAdjustmentList;
    ScoreAdjustItem *adjust_item;

    adjlist->Start_Get();

    while ((adjust_item = (ScoreAdjustItem *) adjlist->Get_Next()))
    {
	// Use the first match only.
	if (adjust_item->Match(url))
	    return adjust_item->adjust_score(orig_score);
    }

    // We'll get here if no match was found.
    return orig_score;
}