Greetings all,

(I tried to send this an hour ago, but don't see any sign in my 
outbox, so apologies if this is a duplicate...)

How does the following sound:
1. Target 3.2.0b6 for the end of May
2. This will basically be a bug-fix / optimisation release over 3.2.0b5
3. It will include the following bug fixes from Joe's archive
   - config_parser.1
   - cygwin.0
   - DESTDIR.0
   - exclude_perform.1
   - extension_filter.0  (already committed)
   - operator[].0        (already committed)
   - robots.0
   - TMPFILE.0
   - without-zlib.0      (already committed)
4. We don't include fileSpace, since it isn't really a bug fix.
5. For those who want 3.1.6 functionality with minimal overhead,
   I suggest the attached patch, which adds a "store_phrases"
   attribute (a sample setting is shown after this list).  If that
   is false, the DB size is cut to 1/3 by storing only the first
   occurrence of each word in the text of a document.  (Keywords,
   links etc. can still have redundancy.)
6. Before we put out 3.2.0b6, we run it past everybody who has
   commented on the speed of 3.2.0b5, to make sure that there are
   no configurations which make it particularly slow (as having
   very many 'exclude' patterns did).
7. After that, we resume testing for 3.2.0rc1
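
To illustrate point 5: with the attached patch applied, the only
configuration change needed to get the smaller, faster index would be
a single line in htdig.conf (just a sketch; "store_phrases" defaults
to true, i.e. the current behaviour of storing every occurrence):

    store_phrases: false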

Does that sound reasonable?

Cheers,
Lachlan

On Fri, 23 Apr 2004 10:07 pm, Robert Ribnitz wrote:
> Is there any way to push the development a bit so that we get a
> 3.2.0 usable for Sarge?

-- 
[EMAIL PROTECTED]
ht://Dig developer DownUnder  (http://www.htdig.org)
--- ../apply/htdig/Retriever.h	2003-10-23 20:11:43.000000000 +1000
+++ htdig/Retriever.h	2004-04-23 22:52:07.000000000 +1000
@@ -49,6 +49,16 @@
     Retriever_Restart
 };
 
+struct word_entry : public Object
+{
+    		word_entry (int loc, int fl, HtWordReference& ref) :
+		    	location (loc), flags (fl), context (ref)
+			{};
+    int		location;
+    int		flags;
+    HtWordReference context;
+};
+
 class Retriever
 {
 public:
@@ -119,6 +129,8 @@
     String		credentials;
     HtWordReference	word_context;
     HtWordList		words;
+
+    Dictionary		words_to_add;
 	
     int			check_unique_md5;
     int			check_unique_date;
--- ../apply/htdig/Retriever.cc	2004-04-24 10:59:57.000000000 +1000
+++ htdig/Retriever.cc	2004-04-24 11:00:57.000000000 +1000
@@ -50,12 +50,16 @@
 
 static int noSignal;
 
+// no_store_phrases:
+// If true, only store first occurrence of each word in a document
+static bool no_store_phrases;
 
 //*****************************************************************************
 // Retriever::Retriever()
 //
 Retriever::Retriever(RetrieverLog flags):
-words(*(HtConfiguration::config()))
+words(*(HtConfiguration::config())),
+words_to_add (100, 0.75)
 {
 	HtConfiguration *config = HtConfiguration::config();
 	FILE *urls_parsed;
@@ -63,6 +67,8 @@
 	currenthopcount = 0;
 	max_hop_count = config->Value("max_hop_count", 999999);
 
+	no_store_phrases = !config->Boolean("store_phrases");
+
 	//
 	// Initialize the flags for the various HTML factors
 	//
@@ -910,6 +916,28 @@
 		return;
 	}
 
+	// If just storing the first occurrence of each word in a document,
+	// we must now flush the words we saw in that document
+	if (no_store_phrases)
+	{
+	    DictionaryCursor cursor;
+	    char *key;
+	    HtWordReference wordRef;
+	    for (words_to_add.Start_Get (cursor);
+		    (key = words_to_add.Get_Next(cursor)); )
+	    {
+		word_entry *entry = (word_entry*) (words_to_add [key]);
+
+		wordRef.Location(entry->location);
+		wordRef.Flags(entry->flags);
+		wordRef.Word(key);
+		words.Replace(WordReference::Merge(wordRef, entry->context));
+		// How do I clean up properly?
+		delete entry;
+	    }
+	    words_to_add.Release ();
+	}
+
 	//
 	// We don't need to dispose of the parsable object since it will
 	// automatically be reused.
@@ -1412,10 +1440,25 @@
 		String w = word;
 		HtWordReference wordRef;
 
-		wordRef.Location(location);
-		wordRef.Flags(factor[heading]);
-		wordRef.Word(w);
-		words.Replace(WordReference::Merge(wordRef, word_context));
+		if (no_store_phrases)
+		{
+		    // Add new word, or mark existing word as also being at
+		    // this heading level
+		    word_entry *entry;
+		    if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+		    {
+			words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+		    } else
+		    {
+			entry->flags |= factor[heading];
+		    }
+		} else
+		{
+		    wordRef.Location(location);
+		    wordRef.Flags(factor[heading]);
+		    wordRef.Word(w);
+		    words.Replace(WordReference::Merge(wordRef, word_context));
+		}
 
 		// Check for compound words...
 		String parts = word;
@@ -1453,8 +1496,23 @@
 					HtStripPunctuation(w);
 					if (w.length() >= minimumWordLength)
 					{
-						wordRef.Word(w);
-						words.Replace(WordReference::Merge(wordRef, word_context));
+					        if (no_store_phrases)
+						{
+						    // Add new word, or mark existing word as also being at
+						    // this heading level
+						    word_entry *entry;
+						    if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+						    {
+							words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+						    } else
+						    {
+							entry->flags |= factor[heading];
+						    }
+						} else
+						{
+						    wordRef.Word(w);
+						    words.Replace(WordReference::Merge(wordRef, word_context));
+						}
 						if (debug > 3)
 							cout << "word part: " << start << '@' << location << endl;
 					}
--- ../apply/htcommon/defaults.cc	2004-02-19 21:43:04.000000000 +1100
+++ htcommon/defaults.cc	2004-04-24 11:06:46.000000000 +1000
@@ -2396,6 +2396,14 @@
 	-90 will select matching documents modified within \
 	the last 90 days. \
 " }, \
+{ "store_phrases", "true",  \
+	"boolean", "htdig", "", "3.2.0b5", "Indexing:How", "startyear: false", " \
+	Causes htdig to record all occurrences of each word in a document, \
+	to allow accurate phrase searches.  If this is false, only the first \
+	occurrence of each word will be stored, causing many phrases to be \
+	missed. Setting this false increases indexing speed by about 20%, \
+	and reduces disk requirements by about 60%.\
+" }, \
 { "substring_max_words", "25",  \
 	"integer", "htsearch", "", "3.0.8b1", "Searching:Method", "substring_max_words: 100", " \
 	The Substring <a href=\"#search_algorithm\">fuzzy algorithm</a> \
