Greetings all,
(I tried to send this an hour ago, but I don't see any sign of it in
my outbox, so apologies if this is a duplicate...)
How does the following sound:
1. Target 3.2.0b6 for the end of May
2. This will basically be a bug-fix / optimisation release based on 3.2.0b5
3. It will include the following bug fixes from Joe's archive:
- config_parser.1
- cygwin.0
- DESTDIR.0
- exclude_perform.1
- extension_filter.0 (already committed)
- operator[].0 (already committed)
- robots.0
- TMPFILE.0
- without-zlib.0 (already committed)
4. We don't include fileSpace, since it isn't really a bug fix.
5. For those who want 3.1.6 functionality with minimal overhead,
I suggest the attached patch, which adds a "store_phrases"
attribute. If that is false, the DB size is cut to 1/3 by
storing only the first occurrence of each word in the text of
a document. (Keywords, links etc. can still have redundancy.)
See the example configuration line just after this list.
6. Before we put out 3.2.0b6, we run it past everybody who has
commented on the speed of 3.2.0b5, to make sure that there are
no configurations which cause it to be particularly slow (as
having very many 'exclude' patterns did).
7. After that, we resume testing for 3.2.0rc1
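
To spell out point 5: with the patch applied, the low-overhead mode
is selected by a single line in the configuration file (the attribute
name and its default of "true" are taken from the patch below):

  store_phrases: false

Leaving it at the default keeps the current, phrase-accurate behaviour.
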
Does that sound reasonable?
Cheers,
Lachlan
On Fri, 23 Apr 2004 10:07 pm, Robert Ribnitz wrote:
> Is there any way to push the development a bit so that we get a
> 3.2.0 usable for Sarge?
--
[EMAIL PROTECTED]
ht://Dig developer DownUnder (http://www.htdig.org)
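
In case it makes the patch easier to review, here is a rough,
self-contained sketch of the per-document bookkeeping it introduces.
It uses std::map in place of htdig's Dictionary, drops the stored
HtWordReference context, and the sample text and flag values are
invented; only the word_entry fields mirror the real patch.

#include <iostream>
#include <map>
#include <string>

// Cut-down version of the word_entry struct the patch adds to Retriever.h.
struct word_entry
{
    word_entry(int loc, int fl) : location(loc), flags(fl) {}
    int location;   // where the word first occurred in the document
    int flags;      // OR of the heading factors it has been seen under
};

int main()
{
    std::map<std::string, word_entry> words_to_add;

    // Pretend we are indexing the body text below, all at an invented
    // heading factor of 1.
    const char *doc[] = { "the", "quick", "fox", "and", "the", "lazy", "fox" };
    const int ndoc = sizeof(doc) / sizeof(doc[0]);

    for (int location = 0; location < ndoc; ++location)
    {
        std::string w = doc[location];
        std::map<std::string, word_entry>::iterator it = words_to_add.find(w);
        if (it == words_to_add.end())
            words_to_add.insert(std::make_pair(w, word_entry(location, 1)));
        else
            it->second.flags |= 1;   // word already seen: just merge the flags
    }

    // At the end of the document, flush one entry per distinct word --
    // its first location only.  This is what shrinks the word DB.
    for (std::map<std::string, word_entry>::iterator it = words_to_add.begin();
         it != words_to_add.end(); ++it)
        std::cout << it->first << " @ " << it->second.location << "\n";

    return 0;
}

The real patch follows.
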
--- ../apply/htdig/Retriever.h 2003-10-23 20:11:43.000000000 +1000
+++ htdig/Retriever.h 2004-04-23 22:52:07.000000000 +1000
@@ -49,6 +49,16 @@
Retriever_Restart
};
+struct word_entry : public Object
+{
+ word_entry (int loc, int fl, HtWordReference& ref) :
+ location (loc), flags (fl), context (ref)
+ {};
+ int location;
+ int flags;
+ HtWordReference context;
+};
+
class Retriever
{
public:
@@ -119,6 +129,8 @@
String credentials;
HtWordReference word_context;
HtWordList words;
+
+ Dictionary words_to_add;
int check_unique_md5;
int check_unique_date;
--- ../apply/htdig/Retriever.cc 2004-04-24 10:59:57.000000000 +1000
+++ htdig/Retriever.cc 2004-04-24 11:00:57.000000000 +1000
@@ -50,12 +50,16 @@
static int noSignal;
+// no_store_phrases:
+// If true, only store first occurrence of each word in a document
+static bool no_store_phrases;
//*****************************************************************************
// Retriever::Retriever()
//
Retriever::Retriever(RetrieverLog flags):
-words(*(HtConfiguration::config()))
+words(*(HtConfiguration::config())),
+words_to_add (100, 0.75)
{
HtConfiguration *config = HtConfiguration::config();
FILE *urls_parsed;
@@ -63,6 +67,8 @@
currenthopcount = 0;
max_hop_count = config->Value("max_hop_count", 999999);
+ no_store_phrases = !config->Boolean("store_phrases");
+
//
// Initialize the flags for the various HTML factors
//
@@ -910,6 +916,28 @@
return;
}
+ // If just storing the first occurrence of each word in a document,
+ // we must now flush the words we saw in that document
+ if (no_store_phrases)
+ {
+ DictionaryCursor cursor;
+ char *key;
+ HtWordReference wordRef;
+ for (words_to_add.Start_Get (cursor);
+ (key = words_to_add.Get_Next(cursor)); )
+ {
+ word_entry *entry = (word_entry*) (words_to_add [key]);
+
+ wordRef.Location(entry->location);
+ wordRef.Flags(entry->flags);
+ wordRef.Word(key);
+ words.Replace(WordReference::Merge(wordRef, entry->context));
+ // How do I clean up properly?
+ delete entry;
+ }
+ words_to_add.Release ();
+ }
+
//
// We don't need to dispose of the parsable object since it will
// automatically be reused.
@@ -1412,10 +1440,25 @@
String w = word;
HtWordReference wordRef;
- wordRef.Location(location);
- wordRef.Flags(factor[heading]);
- wordRef.Word(w);
- words.Replace(WordReference::Merge(wordRef, word_context));
+ if (no_store_phrases)
+ {
+ // Add new word, or mark existing word as also being at
+ // this heading level
+ word_entry *entry;
+ if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+ {
+ words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+ } else
+ {
+ entry->flags |= factor[heading];
+ }
+ } else
+ {
+ wordRef.Location(location);
+ wordRef.Flags(factor[heading]);
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context));
+ }
// Check for compound words...
String parts = word;
@@ -1453,8 +1496,23 @@
HtStripPunctuation(w);
if (w.length() >= minimumWordLength)
{
- wordRef.Word(w);
- words.Replace(WordReference::Merge(wordRef, word_context));
+ if (no_store_phrases)
+ {
+ // Add new word, or mark existing word as also being at
+ // this heading level
+ word_entry *entry;
+ if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+ {
+ words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+ } else
+ {
+ entry->flags |= factor[heading];
+ }
+ } else
+ {
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context));
+ }
if (debug > 3)
cout << "word part: " << start << '@' << location << endl;
}
--- ../apply/htcommon/defaults.cc 2004-02-19 21:43:04.000000000 +1100
+++ htcommon/defaults.cc 2004-04-24 11:06:46.000000000 +1000
@@ -2396,6 +2396,14 @@
-90 will select matching documents modified within \
the last 90 days. \
" }, \
+{ "store_phrases", "true", \
+ "boolean", "htdig", "", "3.2.0b5", "Indexing:How", "startyear: false", " \
+ Causes htdig to record all occurrences of each word in a document, \
+ to allow accurate phrase searches. If this is false, only the first \
+ occurrence of each word will be stored, causing many phrases to be \
+ missed. Setting this false increases indexing speed by about 20%, \
+ and reduces disk requirements by about 60%.\
+" }, \
{ "substring_max_words", "25", \
"integer", "htsearch", "", "3.0.8b1", "Searching:Method", "substring_max_words: 100", " \
The Substring <a href=\"#search_algorithm\">fuzzy algorithm</a> \