On Tue, 2004-04-20 at 01:45, Joe R. Jah wrote:
> Attach exclude_perform.0 to an email to the list,

Cool, thanks very much for the pointer. That was easy. I've attached
the file. I also renamed the two variables from foo_checked to
foo_parsed, which seemed to make their purpose more obvious.

> or if you prefer ftp it to:
>
> ftp://ftp.ccsf.org/incoming

I can do that as well.

For the record, with these changes applied I indexed 51k documents
(including PDFs) across 61 different servers on campus; the run took 4
hours and 17 minutes. Before these changes, I would stop htdig after 8
hours, by which point it had gotten through only a little more than 8k
pages.
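
In case it saves anyone a read of the diff below: the whole change is
just "compile the exclude_urls/bad_querystr pattern lists once and
cache them in globals" instead of recompiling them for every URL the
retriever considers. Here's a throwaway sketch of the same idea, using
POSIX regcomp()/regexec() directly rather than htdig's HtRegexList;
all the names in it are made up for illustration:

#include <regex.h>
#include <stdio.h>

/* Compiled once on first use, then reused for every URL we check;
 * this mirrors the exclude_parsed/excludes globals in the patch. */
static int exclude_parsed = 0;
static regex_t excludes[16];
static int n_excludes = 0;

static int is_excluded(const char *url, const char **patterns, int n)
{
    int i;
    if (!exclude_parsed)
    {
        /* The expensive part: compiling the patterns. Doing this for
         * every candidate URL is what made the old code crawl. */
        for (i = 0; i < n && i < 16; i++)
            if (regcomp(&excludes[n_excludes], patterns[i],
                        REG_EXTENDED | REG_ICASE | REG_NOSUB) == 0)
                n_excludes++;
        exclude_parsed = 1;
    }
    for (i = 0; i < n_excludes; i++)
        if (regexec(&excludes[i], url, 0, NULL, 0) == 0)
            return 1;
    return 0;
}

int main(void)
{
    const char *patterns[] = { "/cgi-bin/", "\\.gif$" };
    printf("%d%d\n",
           is_excluded("http://example.com/cgi-bin/form", patterns, 2),
           is_excluded("http://example.com/page.html", patterns, 2));
    /* prints "10": patterns compiled once, matched twice */
    return 0;
}

The one caveat with caching in globals is that it assumes the pattern
lists are the same for every URL; if exclude_urls can vary per server
in the config, the cache would need to be keyed accordingly.
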
Cheers,
Chris
--
Christopher Murtagh
Enterprise Systems Administrator
ISR / Web Communications Group
McGill University
Montreal, Quebec
Canada
Tel.: (514) 398-3122
Fax: (514) 398-2017

--- htdig.h.original 2004-04-20 10:05:55.000000000 -0400
+++ htdig.h 2004-04-20 10:08:47.000000000 -0400
@@ -40,7 +40,11 @@
#include <stdio.h>

extern int debug;
+extern int exclude_parsed;
+extern int badquerystr_parsed;
extern DocumentDB docs;
+extern HtRegexList excludes;
+extern HtRegexList badquerystr;
extern HtRegexList limits;
extern HtRegexList limitsn;
extern HtRegexList excludes;
--- htdig.cc.original 2004-04-20 10:05:46.000000000 -0400
+++ htdig.cc 2004-04-20 10:08:35.000000000 -0400
@@ -57,7 +57,12 @@
//
int debug = 0;
int report_statistics = 0;
+int exclude_parsed = 0;
+int badquerystr_parsed = 0;
+
DocumentDB docs;
+HtRegexList excludes;
+HtRegexList badquerystr;
HtRegexList limits;
HtRegexList limitsn;
FILE *urls_seen = NULL;
--- Retriever.cc.original 2004-04-20 10:06:18.000000000 -0400
+++ Retriever.cc 2004-04-20 10:09:02.000000000 -0400
@@ -995,9 +995,15 @@
// If the URL contains any of the patterns in the exclude list,
// mark it as invalid
//
- tmpList.Create(config->Find(&aUrl, "exclude_urls"), " \t");
- HtRegexList excludes;
- excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+
+ if (!exclude_parsed) {
+ // Only parse this list once and store it in the global variable.
+ tmpList.Destroy();
+ tmpList.Create(config->Find(&aUrl, "exclude_urls"), " \t");
+ excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ exclude_parsed = 1;
+ }
+
if (excludes.match(url, 0, 0) != 0)
{
if (debug > 2)
@@ -1009,10 +1015,14 @@
// If the URL has a query string and it is in the bad query list
// mark it as invalid
//
- tmpList.Destroy();
- tmpList.Create(config->Find(&aUrl, "bad_querystr"), " \t");
- HtRegexList badquerystr;
- badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+
+ if (!badquerystr_parsed) {
+ // Only parse this list once and store it in the global variable.
+ tmpList.Destroy();
+ tmpList.Create(config->Find(&aUrl, "bad_querystr"), " \t");
+ badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ badquerystr_parsed = 1;
+ }
char *ext = strrchr((char *) url, '?');
if (ext && badquerystr.match(ext, 0, 0) != 0)
{
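
(If anyone wants to try this: save the diff above as exclude_perform.0
and run "patch < exclude_perform.0" from the directory containing
htdig.h, htdig.cc and Retriever.cc. The line offsets are from the tree
I'm running here, but patch should cope with small differences.)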