Merge authors:
  Michal Hruby (mhr3)
Related merge proposals:
  https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
  proposed by: Michal Hruby (mhr3)
  review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 423 [merge]
committer: Michal Hruby <michal....@gmail.com>
branch nick: zeitgeist
timestamp: Wed 2012-03-14 13:34:10 +0100
message:
  Merge lp:~mhr3/zeitgeist/fts-secondary-sorting
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/test/test-indexer.cpp
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird
Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp' --- extensions/fts++/indexer.cpp 2012-03-12 14:22:16 +0000 +++ extensions/fts++/indexer.cpp 2012-03-14 12:31:51 +0000 @@ -824,7 +824,6 @@ if (event_templates->len > 0) { - ZeitgeistTimeRange *time_range = zeitgeist_time_range_new_anytime (); results = zeitgeist_db_reader_find_events (zg_reader, time_range, event_templates, @@ -833,8 +832,6 @@ result_type, NULL, error); - - g_object_unref (time_range); } else { @@ -861,6 +858,208 @@ return results; } +static guint32* +find_event_ids_for_combined_template (ZeitgeistDbReader *zg_reader, + ZeitgeistWhereClause *query_clause, // steals + GPtrArray *event_templates, // steals + guint count, + ZeitgeistResultType result_type, + gint *event_ids_length, + GError **error) +{ + g_return_val_if_fail (error == NULL || (error && *error == NULL), NULL); + + ZeitgeistWhereClause *uri_where; + uri_where = zeitgeist_db_reader_get_where_clause_from_event_templates ( + zg_reader, event_templates, error); + g_ptr_array_unref (event_templates); + + zeitgeist_where_clause_extend (query_clause, uri_where); + g_object_unref (G_OBJECT (uri_where)); + + guint32 *event_ids; + event_ids = zeitgeist_db_reader_find_event_ids_for_clause (zg_reader, + query_clause, count, result_type, event_ids_length, error); + + g_object_unref (query_clause); + + return event_ids; +} + +static GPtrArray* +find_events_for_result_type_and_ids (ZeitgeistDbReader *zg_reader, + ZeitgeistTimeRange *time_range, + GPtrArray *templates, + ZeitgeistStorageState storage_state, + unsigned count, + ZeitgeistResultType result_type, + std::vector<unsigned> const& event_ids, + std::map<unsigned, gdouble> &relevancy_map, + GError **error) +{ + GPtrArray *results = NULL; + results = zeitgeist_db_reader_get_events (zg_reader, + const_cast<unsigned*>(&event_ids[0]), + event_ids.size (), + NULL, + error); + + if (error && *error) return NULL; + + if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || + result_type == 
ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS) + return results; + + if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS) + { + // need to get the uris from the events and do another find_events call + GPtrArray *event_templates; + event_templates = g_ptr_array_new_with_free_func (g_object_unref); + std::map<std::string, unsigned> remapper; + + for (unsigned i = 0; i < results->len; i++) + { + ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i]; + unsigned event_id = zeitgeist_event_get_id (original_event); + GPtrArray *subjects = zeitgeist_event_get_subjects (original_event); + if (subjects == NULL) continue; + for (unsigned j = 0; j < subjects->len; j++) + { + const gchar *subj_uri = zeitgeist_subject_get_uri ((ZeitgeistSubject*) subjects->pdata[j]); + if (subj_uri == NULL) continue; + remapper[subj_uri] = event_id; + ZeitgeistEvent *event = zeitgeist_event_new (); + ZeitgeistSubject *subject = zeitgeist_subject_new (); + zeitgeist_subject_set_uri (subject, subj_uri); + zeitgeist_event_add_subject (event, subject); // FIXME: leaks? 
+ g_ptr_array_add (event_templates, event); + } + } + + g_ptr_array_unref (results); + + // construct custom where clause which combines the original template + // with the uris we found + ZeitgeistWhereClause *where; + where = zeitgeist_db_reader_get_where_clause_for_query (zg_reader, + time_range, templates, storage_state, error); + + guint32 *real_event_ids; + gint real_event_ids_length; + + real_event_ids = find_event_ids_for_combined_template (zg_reader, + where, event_templates, count, result_type, &real_event_ids_length, + error); + + if (error && *error) return NULL; + + results = zeitgeist_db_reader_get_events (zg_reader, + real_event_ids, + real_event_ids_length, + NULL, + error); + + g_free (real_event_ids); + real_event_ids = NULL; + + if (error && *error) return NULL; + + // the event ids might have changed, we need to update the relevancy_map + for (unsigned i = 0; i < results->len; i++) + { + ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i]; + unsigned event_id = zeitgeist_event_get_id (original_event); + GPtrArray *subjects = zeitgeist_event_get_subjects (original_event); + if (subjects == NULL) continue; + for (unsigned j = 0; j < subjects->len; j++) + { + const gchar *subj_uri = zeitgeist_subject_get_uri ((ZeitgeistSubject*) subjects->pdata[j]); + if (subj_uri == NULL) continue; + relevancy_map[event_id] = relevancy_map[remapper[subj_uri]]; + } + } + + } + else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN) + { + // need to get the origins from the events and do another find_events call + GPtrArray *event_templates; + event_templates = g_ptr_array_new_with_free_func (g_object_unref); + std::map<std::string, unsigned> remapper; + + for (unsigned i = 0; i < results->len; i++) + { + ZeitgeistEvent* original_event = (ZeitgeistEvent*) 
results->pdata[i]; + unsigned event_id = zeitgeist_event_get_id (original_event); + GPtrArray *subjects = zeitgeist_event_get_subjects (original_event); + if (subjects == NULL) continue; + for (unsigned j = 0; j < subjects->len; j++) + { + const gchar *subj_origin = zeitgeist_subject_get_origin ((ZeitgeistSubject*) subjects->pdata[j]); + if (subj_origin == NULL) continue; + remapper[subj_origin] = event_id; + ZeitgeistEvent *event = zeitgeist_event_new (); + ZeitgeistSubject *subject = zeitgeist_subject_new (); + zeitgeist_subject_set_origin (subject, subj_origin); + zeitgeist_event_add_subject (event, subject); // FIXME: leaks? + g_ptr_array_add (event_templates, event); + } + } + + g_ptr_array_unref (results); + + // construct custom where clause which combines the original template + // with the uris we found + ZeitgeistWhereClause *where; + where = zeitgeist_db_reader_get_where_clause_for_query (zg_reader, + time_range, templates, storage_state, error); + + guint32 *real_event_ids; + gint real_event_ids_length; + + real_event_ids = find_event_ids_for_combined_template (zg_reader, + where, event_templates, count, result_type, &real_event_ids_length, + error); + + if (error && *error) return NULL; + + results = zeitgeist_db_reader_get_events (zg_reader, + real_event_ids, + real_event_ids_length, + NULL, + error); + + if (error && *error) return NULL; + + g_free (real_event_ids); + real_event_ids = NULL; + + // the event ids might have changed, we need to update the relevancy_map + for (unsigned i = 0; i < results->len; i++) + { + ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i]; + unsigned event_id = zeitgeist_event_get_id (original_event); + GPtrArray *subjects = zeitgeist_event_get_subjects (original_event); + if (subjects == NULL) continue; + for (unsigned j = 0; j < subjects->len; j++) + { + const gchar *subj_origin = zeitgeist_subject_get_origin ((ZeitgeistSubject*) subjects->pdata[j]); + if (subj_origin == NULL) continue; + 
relevancy_map[event_id] = relevancy_map[remapper[subj_origin]]; + } + } + + } + + return results; +} + GPtrArray* Indexer::SearchWithRelevancies (const gchar *search, ZeitgeistTimeRange *time_range, GPtrArray *templates, @@ -880,21 +1079,58 @@ guint maxhits = count; - if (result_type == RELEVANCY_RESULT_TYPE) - { - enquire->set_sort_by_relevance (); - } - else - { - enquire->set_sort_by_value (VALUE_TIMESTAMP, true); - } - if (storage_state != ZEITGEIST_STORAGE_STATE_ANY) { - g_set_error_literal (error, - ZEITGEIST_ENGINE_ERROR, - ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT, - "Only ANY stogate state is supported"); + // FIXME: add support for this by grabing (un)available storages + // from the storage table and appending them to the query + g_set_error_literal (error, + ZEITGEIST_ENGINE_ERROR, + ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT, + "Only ANY storage state is supported"); + return NULL; + } + + bool reversed_sort = + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN; + + if (result_type == RELEVANCY_RESULT_TYPE) + { + enquire->set_sort_by_relevance (); + } + else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS) + { + enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort); + enquire->set_collapse_key (VALUE_EVENT_ID); + } + else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS) + { + enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort); + enquire->set_collapse_key (VALUE_URI_HASH); + } + else if 
(result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN) + { + // FIXME: not really correct but close :) + enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort); + enquire->set_collapse_key (VALUE_URI_HASH); + maxhits *= 3; + } + else + { + g_set_error_literal (error, + ZEITGEIST_ENGINE_ERROR, + ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT, + "Requested result type is not supported"); return NULL; } @@ -926,6 +1162,8 @@ NULL, error); + if (error && *error) return NULL; + if (results->len != relevancy_arr.size ()) { g_warning ("Results don't match relevancies!"); @@ -948,22 +1186,56 @@ } else { - g_set_error_literal (error, - ZEITGEIST_ENGINE_ERROR, - ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT, - "Only RELEVANCY result type is supported"); - /* - * perhaps something like this could be used here? + std::vector<unsigned> event_ids; std::map<unsigned, gdouble> relevancy_map; - foreach (...) 
+ Xapian::MSetIterator iter, end; + for (iter = hits.begin (), end = hits.end (); iter != end; ++iter) { + Xapian::Document doc(iter.get_document ()); + double unserialized = + Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID)); + unsigned event_id = static_cast<unsigned>(unserialized); + + event_ids.push_back (event_id); + double rank = iter.get_percent () / 100.; if (rank > relevancy_map[event_id]) { relevancy_map[event_id] = rank; } } - */ + + results = find_events_for_result_type_and_ids (zg_reader, time_range, + templates, storage_state, + count, result_type, + event_ids, + relevancy_map, error); + + if (error && *error) return NULL; + + if (results == NULL) + { + results = g_ptr_array_new (); + if (relevancies) *relevancies = NULL; + if (relevancies_size) *relevancies_size = 0; + } + else + { + if (relevancies) + { + *relevancies = g_new (gdouble, results->len); + for (unsigned i = 0; i < results->len; i++) + { + ZeitgeistEvent *event = (ZeitgeistEvent*) g_ptr_array_index (results, i); + (*relevancies)[i] = relevancy_map[zeitgeist_event_get_id (event)]; + } + } + + if (relevancies_size) + { + *relevancies_size = results->len; + } + } } if (matches) === modified file 'extensions/fts++/test/test-indexer.cpp' --- extensions/fts++/test/test-indexer.cpp 2012-02-14 16:56:04 +0000 +++ extensions/fts++/test/test-indexer.cpp 2012-03-11 18:58:01 +0000 @@ -163,6 +163,26 @@ return event; } +static ZeitgeistEvent* create_test_event6 (void) +{ + ZeitgeistEvent *event = zeitgeist_event_new (); + ZeitgeistSubject *subject = zeitgeist_subject_new (); + + zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_PRESENTATION); + zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT); + zeitgeist_subject_set_uri (subject, "file:///home/username/Documents/CamelCasePresentation.pdf"); + zeitgeist_subject_set_text (subject, NULL); + zeitgeist_subject_set_mimetype (subject, "application/pdf"); + + zeitgeist_event_set_interpretation (event, 
ZEITGEIST_ZG_MODIFY_EVENT); + zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY); + zeitgeist_event_set_actor (event, "application://libreoffice-impress.desktop"); + zeitgeist_event_add_subject (event, subject); + + g_object_unref (subject); + return event; +} + // Steals the event, ref it if you want to keep it static guint index_event (Fixture *fix, ZeitgeistEvent *event) @@ -172,6 +192,7 @@ guint *event_ids; int num_events_inserted; + zeitgeist_event_set_timestamp (event, zeitgeist_timestamp_now ()); // add event to DBs events = g_ptr_array_new (); g_ptr_array_add (events, event); @@ -586,6 +607,88 @@ g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "IDNwiki"); } +static void +test_simple_relevancies_query (Fixture *fix, gconstpointer data) +{ + guint matches; + guint event_id; + gdouble *relevancies; + gint relevancies_size; + ZeitgeistEvent* event; + + // add test events to DBs + event_id = index_event (fix, create_test_event1 ()); + index_event (fix, create_test_event2 ()); + index_event (fix, create_test_event3 ()); + index_event (fix, create_test_event4 ()); + + GPtrArray *results = + zeitgeist_indexer_search_with_relevancies (fix->indexer, + "text", + zeitgeist_time_range_new_anytime (), + g_ptr_array_new (), + ZEITGEIST_STORAGE_STATE_ANY, + 0, + 10, + (ZeitgeistResultType) 100, + &relevancies, &relevancies_size, + &matches, + NULL); + + g_assert_cmpuint (matches, >, 0); + g_assert_cmpuint (results->len, ==, 1); + g_assert_cmpint (relevancies_size, ==, 1); + g_assert_cmpfloat (relevancies[0], >=, 1.0); + + event = (ZeitgeistEvent*) results->pdata[0]; + g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id); + + ZeitgeistSubject *subject = (ZeitgeistSubject*) + g_ptr_array_index (zeitgeist_event_get_subjects (event), 0); + g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "text"); +} + +static void +test_simple_relevancies_subject_query (Fixture *fix, gconstpointer data) +{ + guint matches; + gdouble 
*relevancies; + gint relevancies_size; + guint event_id4, event_id5, event_id6; + + // add test events to DBs + index_event (fix, create_test_event1 ()); + index_event (fix, create_test_event2 ()); + index_event (fix, create_test_event3 ()); + event_id4 = index_event (fix, create_test_event4 ()); + usleep (50000); + event_id5 = index_event (fix, create_test_event5 ()); + usleep (50000); + event_id6 = index_event (fix, create_test_event6 ()); + + GPtrArray *results = + zeitgeist_indexer_search_with_relevancies (fix->indexer, + "user*", + zeitgeist_time_range_new_anytime (), + g_ptr_array_new (), + ZEITGEIST_STORAGE_STATE_ANY, + 0, + 10, + ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS, + &relevancies, &relevancies_size, + &matches, + NULL); + + g_assert_cmpuint (matches, >, 0); + g_assert_cmpuint (results->len, ==, 3); + g_assert_cmpint (relevancies_size, ==, 3); + + // we're creating event 6 after 5 and 4, so it has to be more recent (but it seems + // that number of terms indexed matters as well, so careful with the relevancies) + g_assert_cmpuint (event_id6, ==, + zeitgeist_event_get_id ((ZeitgeistEvent*) results->pdata[0])); +} + G_BEGIN_DECLS static void discard_message (const gchar *domain, @@ -619,6 +722,10 @@ setup, test_simple_idn_support, teardown); g_test_add ("/Zeitgeist/FTS/Indexer/CJK", Fixture, 0, setup, test_simple_cjk, teardown); + g_test_add ("/Zeitgeist/FTS/Indexer/Relevancies", Fixture, 0, + setup, test_simple_relevancies_query, teardown); + g_test_add ("/Zeitgeist/FTS/Indexer/RelevanciesSubject", Fixture, 0, + setup, test_simple_relevancies_subject_query, teardown); // get rid of the "rebuilding index..." messages g_log_set_handler (NULL, G_LOG_LEVEL_MESSAGE, discard_message, NULL);
_______________________________________________
Mailing list: https://launchpad.net/~zeitgeist
Post to : zeitgeist@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zeitgeist
More help : https://help.launchpad.net/ListHelp