Awakening the dead...

On 6/29/07, jamie <[EMAIL PROTECTED]> wrote:
> we only index the first 10,000 unique words in any one doc and/or only
> the first 1 MB of text

> I believe the maximum text size limit can be adjusted in the config file,
> but the word limit is hardcoded (which needs to be changed to use a
> config var). Note these settings greatly affect memory usage of trackerd
> when indexing.

So, here's a patch to add support for the config key MaxWordsToIndex.
Its value is stored in tracker->max_words_to_index and defaults to the
macro MAX_WORDS_TO_INDEX defined in tracker-utils.h.

This key will also be added to the default config written by trackerd.

> I will look at adding support for these in the tracker-preferences UI in
> the near future.

So, as before, no tracker-preferences support from me...

As with max_index_text_length, max_words_to_index is now also sanitized
to 0 if insane ;)

I hope it is OK to add yet another field to the Tracker structure.

Kind regards, Marcus
diff --git a/src/trackerd/tracker-parser.c b/src/trackerd/tracker-parser.c
index 802e824..56eb9c5 100644
--- a/src/trackerd/tracker-parser.c
+++ b/src/trackerd/tracker-parser.c
@@ -554,7 +554,7 @@ tracker_parse_text (GHashTable *word_table, const char *txt, int weight, gboolea
 					
 					total_words++;
 					
-					if (total_words < 10000) { 
+					if (total_words < tracker->max_words_to_index) { 
 
 						count = GPOINTER_TO_INT (g_hash_table_lookup (word_table, index_word));
 						g_hash_table_insert (word_table, index_word, GINT_TO_POINTER (count + weight));	
@@ -589,7 +589,7 @@ tracker_parse_text (GHashTable *word_table, const char *txt, int weight, gboolea
 
 				total_words++;
 
-				if (total_words < 10000) { 
+				if (total_words < tracker->max_words_to_index) { 
 
 			
 					count = GPOINTER_TO_INT (g_hash_table_lookup (word_table, word));
diff --git a/src/trackerd/tracker-utils.c b/src/trackerd/tracker-utils.c
index b5f56d5..37779a9 100644
--- a/src/trackerd/tracker-utils.c
+++ b/src/trackerd/tracker-utils.c
@@ -2303,6 +2303,8 @@ tracker_load_config_file ()
 					 "[Performance]\n",
 					 "# Maximum size of text in bytes to index from a file's text contents\n",
 					 "MaxTextToIndex=1048576\n",
+					 "# Maximum number of unique words to index from a file's text contents\n",
+					 "MaxWordsToIndex=10000\n",
 					 "# Specifies the no of entities to index before determining whether to perform index optimization\n",
 					 "OptimizationSweepCount=10000\n",
 					 "# Sets the maximum bucket count for the indexer\n",
@@ -2488,6 +2490,10 @@ tracker_load_config_file ()
 
 	}
 
+	if (g_key_file_has_key (key_file, "Performance", "MaxWordsToIndex", NULL)) {
+		tracker->max_words_to_index = g_key_file_get_integer (key_file, "Performance", "MaxWordsToIndex", NULL);
+	}
+
 
 	g_free (filename);
 
diff --git a/src/trackerd/tracker-utils.h b/src/trackerd/tracker-utils.h
index 83e4142..f20350e 100644
--- a/src/trackerd/tracker-utils.h
+++ b/src/trackerd/tracker-utils.h
@@ -50,7 +50,8 @@ extern char *tracker_actions[];
 #define MAX_INDEX_TEXT_LENGTH		1048576
 #define MAX_PROCESS_QUEUE_SIZE		100
 #define MAX_EXTRACT_QUEUE_SIZE		500
-#define	OPTIMIZATION_COUNT		10000
+#define OPTIMIZATION_COUNT		10000
+#define MAX_WORDS_TO_INDEX		10000
 
 /* default indexer options */
 #define MIN_INDEX_BUCKET_COUNT		65536    /* minimum bucket number of word index per division (total buckets = INDEXBNUM * INDEXDIV) */
@@ -215,6 +216,7 @@ typedef struct {
 	int		battery_throttle;
 	gboolean	use_extra_memory;
 	int		initial_sleep;
+	int		max_words_to_index;
 
 	/* indexing options */
 	int	 	max_index_bucket_count;
diff --git a/src/trackerd/trackerd.c b/src/trackerd/trackerd.c
index e7232e4..3138ff0 100644
--- a/src/trackerd/trackerd.c
+++ b/src/trackerd/trackerd.c
@@ -1794,6 +1794,7 @@ set_defaults ()
 	tracker->max_process_queue_size = MAX_PROCESS_QUEUE_SIZE;
 	tracker->max_extract_queue_size = MAX_EXTRACT_QUEUE_SIZE;
 	tracker->optimization_count = OPTIMIZATION_COUNT;
+	tracker->max_words_to_index = MAX_WORDS_TO_INDEX;
 
 	tracker->max_index_bucket_count = MAX_INDEX_BUCKET_COUNT;
 	tracker->min_index_bucket_count = MIN_INDEX_BUCKET_COUNT;
@@ -1860,6 +1861,7 @@ sanity_check_option_values ()
 
 
 	if (tracker->max_index_text_length < 0) tracker->max_index_text_length = 0;
+	if (tracker->max_words_to_index < 0) tracker->max_words_to_index = 0;
 	if (tracker->optimization_count < 1000) tracker->optimization_count = 1000;
 	if (tracker->max_index_bucket_count < 1000) tracker->max_index_bucket_count= 1000;
 	if (tracker->min_index_bucket_count < 1000) tracker->min_index_bucket_count= 1000;
_______________________________________________
tracker-list mailing list
tracker-list@gnome.org
http://mail.gnome.org/mailman/listinfo/tracker-list

Reply via email to