Please note that I'll update and check in this patch to the main
trunk as soon as the merges from 3.2.0 are finished. If it is
requested, I could check it into the htdig-3-1-x branch too.
Not for 3.2.0.
I send it here as a heads-up, for the benefit of peer review, and
for bleeding-edge patch archives (hey, I *did* test it).
For prospective users: If you want to just order your search results in
areas, you may want to wait for a patch to implement an attribute
"results_order".
Thu Jan 6 10:20:15 2000 Hans-Peter Nilsson <[EMAIL PROTECTED]>
* htdoc/attrs.html (url_seed_score): New.
* htdoc/cf_byname.html: Added url_seed_score.
* htdoc/cf_byprog.html: Ditto.
* htcommon/defaults.cc (defaults): Add default for url_seed_score.
* htlib/HtURLSeedScore.cc, HtURLSeedScore.h: New.
* htsearch/Display.h (class Display: Add member minScore.
Change maxScore type to double.
* htsearch/Display.cc: Include math.h and HtURLSeedScore.h
(Display constructor): Initialize minScore, change init value for
maxScore to -DBL_MAX.
(displayMatch): Use minScore in calculation of score to adjust for
negative scores.
(buildMatchList): Use an URLSeedScore to adjust the score after
other calculations.
Calculate minScore.
Correct maxScore adjustment for change to double.
(sort): Calculation of maxScore moved to buildMatchList.
(Patch sent as an attachment)
brgds, H-P
*** /dev/null Tue Jan 1 05:00:00 1980
--- htlib/HtURLSeedScore.h Fri Jan 7 09:13:08 2000
***************
*** 0 ****
--- 1,55 ----
+ //
+ // HtURLSeedScore.h
+ //
+ // URLSeedScore: Constructed from a Configuration, see doc
+ // for format of config item "url_seed_score".
+ // Method "double adjust_score(double score, const String &url)"
+ // returns an adjusted score, given the original score, or returns the
+ // original score if there was no adjustment to do.
+ //
+ // $Id$
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ #ifndef __HtURLSeedScore_h
+ #define __HtURLSeedScore_h
+
+ #include "Configuration.h"
+ #include "List.h"
+
+ class URLSeedScore
+ {
+ public:
+ URLSeedScore(Configuration &);
+ ~URLSeedScore();
+
+ // Return the "adjusted" score. Use an inline method to avoid
+ // function-call overhead when this feature is unused.
+ double adjust_score(double score, const String& url)
+ {
+ return myAdjustmentList->Count() == 0
+ ? score : noninline_adjust_score(score, url);
+ }
+
+ // If an error was discovered during the parsing of
+ // the configuration, this member gives a
+ // nonempty String with an error message.
+ const String& ErrMsg() { return myErrMsg; }
+
+ private:
+ double noninline_adjust_score(double score, const String& url);
+
+ // These member functions are not supposed to be implemented.
+ URLSeedScore();
+ URLSeedScore(const URLSeedScore &);
+ void operator= (const URLSeedScore &);
+
+ List *myAdjustmentList;
+ String myErrMsg;
+ };
+
+ #endif /* __HtURLSeedScore_h */
*** /dev/null Tue Jan 1 05:00:00 1980
--- htlib/HtURLSeedScore.cc Fri Jan 7 09:13:08 2000
***************
*** 0 ****
--- 1,214 ----
+ //
+ // HtURLSeedScore.cc
+ //
+ // URLSeedScore:
+ // Holds a list of configured adjustments to be applied on a given
+ // score and given URL.
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id$
+
+ #include "StringList.h"
+ #include "StringMatch.h"
+ #include "HtURLSeedScore.h"
+ #include <stdio.h>
+ #include <ctype.h>
+
+ // This class is only used in private members of URLSeedScore.
+ // The OO-right thing would be to nest this inside the private
+ // declaration of HtURLSeedScore, but that would cause portability
+ // problems according to
+ // <URL:http://www.mozilla.org/hacking/portable-cpp.html#inner_classes>.
+
+ class ScoreAdjustItem : public Object
+ {
+ public:
+ // Construct from a string applicable to StringMatch, and a string to
+ // parse for a formula.
+ ScoreAdjustItem(String &, String &);
+
+ ~ScoreAdjustItem();
+
+ // Does this item match?
+ inline bool Match(const String &s) { return match.FindFirst(s.get()) != -1; }
+
+ // Return the argument adjusted according to this item.
+ double adjust_score(double orig)
+ { return orig*my_mul_factor + my_add_constant; }
+
+ // Error in parsing? Message given here if non-empty string.
+ String& ErrMsg() { return myErrMsg; }
+
+ private:
+ double my_add_constant;
+ double my_mul_factor;
+ StringMatch match;
+
+ static String myErrMsg;
+
+ // These member functions are not supposed to be implemented, but
+ // mentioned here as private so the compiler will not generate them if
+ // someone puts in buggy code that would use them.
+ ScoreAdjustItem();
+ ScoreAdjustItem(const ScoreAdjustItem &);
+ void operator= (const ScoreAdjustItem &);
+ };
+
+ // Definition of myErrMsg.
+ String ScoreAdjustItem::myErrMsg("");
+
+ ScoreAdjustItem::ScoreAdjustItem(String &url_regex, String &formula)
+ {
+ double mul_factor = 1;
+ double add_constant = 0;
+ bool factor_found = false;
+ bool constant_found = false;
+ int chars_so_far;
+ match.Pattern(url_regex);
+
+ // FIXME: Missing method to check if the regex was in error.
+ // We'll check hasPattern for the time being as a placeholder.
+ if (! match.hasPattern())
+ {
+ myErrMsg = form("%s is not a valid regex", url_regex.get());
+ return;
+ }
+
+ char *s = formula.get();
+
+ // Parse the ([*]N[ ]*)?[+]?M format.
+ if (s[0] == '*')
+ {
+ // Skip past the '*'.
+ s++;
+
+ // There is a mul_factor. Let's parse it.
+ chars_so_far = 0;
+ sscanf(s, "%lf%n", &mul_factor, &chars_so_far);
+
+ // If '%lf' failed to match, then it will show up as either no
+ // assignment to chars_so_far, or as writing 0 there.
+ if (chars_so_far == 0)
+ {
+ myErrMsg = form("%s is not a valid adjustment formula", s);
+ return;
+ }
+
+ // Skip past the number.
+ s += chars_so_far;
+
+ // Skip any whitespaces.
+ while (isspace(*s))
+ s++;
+
+ // Eat any plus-sign; it's redundant if alone, and may come before a
+ // minus.
+ if (*s == '+')
+ s++;
+
+ factor_found = true;
+ }
+
+ // If there's anything here, it must be the additive constant.
+ if (*s)
+ {
+ chars_so_far = 0;
+ sscanf(s, "%lf%n", &add_constant, &chars_so_far);
+
+ // If '%lf' failed to match, then it will show up as either no
+ // assignment to chars_so_far, or as writing 0 there.
+ // We also need to check that it was the end of the input.
+ if (chars_so_far == 0 || s[chars_so_far] != 0)
+ {
+ myErrMsg = form("%s is not a valid adjustment formula",
+ formula.get());
+ return;
+ }
+
+ constant_found = true;
+ }
+
+ // Either part must be there.
+ if (!factor_found && !constant_found)
+ {
+ myErrMsg = form("%s is not a valid formula", formula.get());
+ return;
+ }
+
+ my_add_constant = add_constant;
+ my_mul_factor = mul_factor;
+ }
+
+ ScoreAdjustItem::~ScoreAdjustItem()
+ {
+ }
+
+ URLSeedScore::URLSeedScore(Configuration &config)
+ {
+ char *config_item = "url_seed_score";
+
+ StringList sl(config[config_item], "\t \r\n");
+
+ myAdjustmentList = new List();
+
+ if (sl.Count() % 2)
+ {
+ myErrMsg = form("%s is not a list of pairs (odd number of items)",
+ config_item);
+
+ // We *could* continue, but that just means the error will be harder
+ // to find, unless someone actually sees the error message.
+ return;
+ }
+
+ // Parse each as in TemplateList::createFromString.
+ for (int i = 0; i < sl.Count(); i += 2)
+ {
+ String url_regex = sl[i];
+ String adjust_formula = sl[i+1];
+
+ ScoreAdjustItem *adjust_item
+ = new ScoreAdjustItem(url_regex, adjust_formula);
+
+ if (adjust_item->ErrMsg().length() != 0)
+ {
+ // No point in continuing beyond the error; we might just
+ // overwrite the first error.
+ myErrMsg = form("While parsing %s: %s",
+ config_item,
+ adjust_item->ErrMsg().get());
+ return;
+ }
+
+ myAdjustmentList->Add(adjust_item);
+ }
+ }
+
+ URLSeedScore::~URLSeedScore()
+ {
+ delete myAdjustmentList;
+ }
+
+ double
+ URLSeedScore::noninline_adjust_score(double orig_score, const String &url)
+ {
+ List *adjlist = myAdjustmentList;
+ ScoreAdjustItem *adjust_item;
+
+ adjlist->Start_Get();
+
+ while ((adjust_item = (ScoreAdjustItem *) adjlist->Get_Next()))
+ {
+ // Use the first match only.
+ if (adjust_item->Match(url))
+ return adjust_item->adjust_score(orig_score);
+ }
+
+ // We'll get here if no match was found.
+ return orig_score;
+ }
Index: htcommon/defaults.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htcommon/defaults.cc,v
retrieving revision 1.43.2.12
diff -p -c -r1.43.2.12 defaults.cc
*** htcommon/defaults.cc 1999/12/06 22:26:46 1.43.2.12
--- htcommon/defaults.cc 2000/01/07 09:32:39
*************** ConfigDefaults defaults[] =
*** 148,153 ****
--- 148,154 ----
{"translate_amp", "false"},
{"translate_lt_gt", "false"},
{"translate_quot", "false"},
+ {"url_seed_score", ""},
{"url_list", "${database_base}.urls"},
{"url_part_aliases", ""},
{"url_log", "${database_base}.log"},
Index: htdoc/attrs.html
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htdoc/attrs.html,v
retrieving revision 1.27.2.25
diff -p -c -r1.27.2.25 attrs.html
*** htdoc/attrs.html 1999/12/07 04:29:26 1.27.2.25
--- htdoc/attrs.html 2000/01/07 09:32:50
***************
*** 6816,6821 ****
--- 6816,6895 ----
<hr>
<dl>
<dt>
+ <strong><a name="url_seed_score">url_seed_score</a></strong>
+ </dt>
+ <dd>
+ <dl>
+ <dt>
+ <em>type:</em>
+ </dt>
+ <dd>
+ string list
+ </dd>
+ <dt>
+ <em>used by:</em>
+ </dt>
+ <dd>
+ <a href="htsearch.html">htsearch</a>
+ </dd>
+ <dt>
+ <em>default:</em>
+ </dt>
+ <dd>
+ <em><empty></em>
+ </dd>
+ <dt>
+ <em>description:</em>
+ </dt>
+ <dd>
+ This is a list of pairs, <em>pattern</em>
+ <em>formula</em>, used to weigh the score of
+ hits, depending on the URL of the document.<br>
+ The <em>pattern</em> part is a substring to match
+ against the URL. Pipe ('|') characters can be
+ used in the pattern to concatenate substrings for
+ web-areas that have the same formula.<br>
+ The formula describes a <em>factor</em> and a
+ <em>constant</em>, by which the hit score is
+ weighed. The <em>factor</em> part is multiplied
+ to the original score, then the <em>constant</em>
+ part is added.<br>
+ The format of the formula is the factor part:
+ "*<em>N</em>" optionally followed by comma and
+ spaces, followed by the constant part :
+ "+<em>M</em>", where the plus sign may be emitted
+ for negative numbers. Either part is optional,
+ but must come in this order.<br>
+ The numbers <em>N</em> and <em>M</em> are floating
+ point constants.<br>
+ More straightforward is to think of the format as
+ "newscore = oldscore*<em>N</em>+<em>M</em>",
+ but with the "newscore = oldscore" part left out.
+ </dd>
+ <dt>
+ <em>example:</em>
+ </dt>
+ <dd>
+ <table border="0">
+ <tr>
+ <td valign="top">
+ url_seed_score:
+ </td>
+ <td nowrap>
+ /mailinglist/ *.5-1e6 \<br>
+ /docs/|/news/ *1.5 \<br>
+ /testresults/ "*.7 -200" \<br>
+ /faq-area/ *2+10000
+ </td>
+ </tr>
+ </table>
+ </dd>
+ </dl>
+ </dd>
+ </dl>
+ <hr>
+ <dl>
+ <dt>
<strong><a name="use_meta_description">
use_meta_description</a></strong>
</dt>
Index: htdoc/cf_byname.html
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htdoc/cf_byname.html,v
retrieving revision 1.18.2.13
diff -p -c -r1.18.2.13 cf_byname.html
*** htdoc/cf_byname.html 1999/12/06 22:26:48 1.18.2.13
--- htdoc/cf_byname.html 2000/01/07 09:32:51
***************
*** 176,181 ****
--- 176,182 ----
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#url_list">url_list</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#url_log">url_log</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#url_part_aliases">url_part_aliases</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#url_seed_score">url_seed_score</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#use_meta_description">use_meta_description</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#use_star_image">use_star_image</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#user_agent">user_agent</a><br>
Index: htdoc/cf_byprog.html
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htdoc/cf_byprog.html,v
retrieving revision 1.17.2.13
diff -p -c -r1.17.2.13 cf_byprog.html
*** htdoc/cf_byprog.html 1999/12/06 22:26:48 1.17.2.13
--- htdoc/cf_byprog.html 2000/01/07 09:32:52
***************
*** 175,180 ****
--- 175,181 ----
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#syntax_error_file">syntax_error_file</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#uncoded_db_compatible">uncoded_db_compatible</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#url_part_aliases">url_part_aliases</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#url_seed_score">url_seed_score</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#use_meta_description">use_meta_description</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#use_star_image">use_star_image</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#valid_punctuation">valid_punctuation</a><br>
Index: htlib/Makefile.in
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htlib/Makefile.in,v
retrieving revision 1.13.2.2
diff -p -c -r1.13.2.2 Makefile.in
*** htlib/Makefile.in 1999/03/29 15:53:48 1.13.2.2
--- htlib/Makefile.in 2000/01/07 09:32:52
*************** OBJS= Configuration.o Connection.o Datab
*** 16,22 ****
URL.o URLTrans.o cgi.o \
good_strtok.o io.o strcasecmp.o \
strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
! HtURLCodec.o regex.o HtWordType.o
TARGET= libht.a
--- 16,22 ----
URL.o URLTrans.o cgi.o \
good_strtok.o io.o strcasecmp.o \
strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
! HtURLCodec.o regex.o HtWordType.o HtURLSeedScore.o
TARGET= libht.a
Index: htsearch/Display.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htsearch/Display.cc,v
retrieving revision 1.54.2.22
diff -p -c -r1.54.2.22 Display.cc
*** htsearch/Display.cc 1999/12/07 16:52:35 1.54.2.22
--- htsearch/Display.cc 2000/01/07 09:32:56
*************** static char RCSid[] = "$Id: Display.cc,v
*** 21,28 ****
--- 21,30 ----
#include <ctype.h>
#include <syslog.h>
#include <locale.h>
+ #include <math.h>
#include "HtURLCodec.h"
#include "HtWordType.h"
+ #include "HtURLSeedScore.h"
//*****************************************************************************
//
*************** Display::Display(char *indexFile, char *
*** 43,49 ****
templateError = 0;
maxStars = config.Value("max_stars");
! maxScore = 100;
setupImages();
setupTemplates();
--- 45,52 ----
templateError = 0;
maxStars = config.Value("max_stars");
! maxScore = -DBL_MAX;
! minScore = DBL_MAX;
setupImages();
setupTemplates();
*************** Display::displayMatch(ResultMatch *match
*** 304,310 ****
if (maxScore != 0)
{
! int percent = (int)(ref->DocScore() * 100 / (double)maxScore);
if (percent <= 0)
percent = 1;
vars.Add("PERCENT", new String(form("%d", percent)));
--- 307,314 ----
if (maxScore != 0)
{
! int percent = (int)((ref->DocScore() - minScore) * 100 /
! (maxScore - minScore));
if (percent <= 0)
percent = 1;
vars.Add("PERCENT", new String(form("%d", percent)));
*************** Display::generateStars(DocumentRef *ref,
*** 742,748 ****
if (maxScore != 0)
{
! score = ref->DocScore() / (double)maxScore;
}
else
{
--- 746,752 ----
if (maxScore != 0)
{
! score = (ref->DocScore() - minScore) / (maxScore - minScore);
}
else
{
*************** Display::buildMatchList()
*** 938,943 ****
--- 942,951 ----
double backlink_factor = config.Double("backlink_factor");
double date_factor = config.Double("date_factor");
SortType typ = sortType();
+ URLSeedScore adjustments(config);
+
+ // If we knew where to pass it, this would be a good place to pass
+ // on errors from adjustments.ErrMsg().
results->Start_Get();
while ((id = results->Get_Next()))
*************** Display::buildMatchList()
*** 1007,1012 ****
--- 1015,1023 ----
sortRef->DocTitle(thisRef->DocTitle());
thisMatch->setRef(sortRef);
}
+
+ score = adjustments.adjust_score(score, thisRef->DocURL());
+
}
// Get rid of it to free the memory!
delete thisRef;
*************** Display::buildMatchList()
*** 1019,1024 ****
--- 1030,1039 ----
// Append this match to our list of matches.
//
matches->Add(thisMatch);
+ if (matches->Count() == 1 || maxScore < score)
+ maxScore = score;
+ if (matches->Count() == 1 || minScore > score)
+ minScore = score;
}
//
*************** Display::sort(List *matches)
*** 1163,1170 ****
for (i = 0; i < numberOfMatches; i++)
{
array[i] = (ResultMatch *)(*matches)[i];
- if (i == 0 || maxScore < array[i]->getScore())
- maxScore = array[i]->getScore();
}
matches->Release();
--- 1178,1183 ----
Index: htsearch/Display.h
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htsearch/Display.h,v
retrieving revision 1.8.2.4
diff -p -c -r1.8.2.4 Display.h
*** htsearch/Display.h 1999/11/24 05:17:10 1.8.2.4
--- htsearch/Display.h 2000/01/07 09:32:57
*************** protected:
*** 125,131 ****
// Maximum number of stars to display
//
int maxStars;
! int maxScore;
//
// For display, we have different versions of the list of words.
--- 125,132 ----
// Maximum number of stars to display
//
int maxStars;
! double maxScore;
! double minScore;
//
// For display, we have different versions of the list of words.
Compilation exited abnormally with code 1 at Fri Jan 7 11:21:03
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.