Mikkel Kamstrup Erlandsen has proposed merging lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.
Requested reviews: Zeitgeist Extensions (zeitgeist-extensions) Related bugs: Bug #843668 in Zeitgeist Extensions: "Blowing Xapian max term length corrupts index" https://bugs.launchpad.net/zeitgeist-extensions/+bug/843668 For more details, see: https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362 See attached bug -- https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362 Your team Zeitgeist Extensions is requested to review the proposed merge of lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.
=== modified file 'fts/_tests.py' --- fts/_tests.py 2011-09-01 13:46:30 +0000 +++ fts/_tests.py 2011-09-07 08:45:23 +0000 @@ -104,3 +104,4 @@ assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri + === modified file 'fts/fts.py' --- fts/fts.py 2011-09-06 10:03:23 +0000 +++ fts/fts.py 2011-09-07 08:45:23 +0000 @@ -50,6 +50,7 @@ import threading from urllib import quote as url_escape, unquote as url_unescape import gobject, gio +from cStringIO import StringIO from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR from _zeitgeist.engine.datamodel import Event, Subject @@ -93,6 +94,10 @@ ResultType.LeastPopularActor, ] +# Xapian has a maximum term length of 245 bytes and Bad Things(TM) happen +# if you bust that. We use the cap_string() function to control this. +MAX_TERM_LENGTH = 245 + def synchronized(lock): """ Synchronization decorator. """ @@ -197,6 +202,31 @@ result += c return result +def cap_string (s, nbytes=MAX_TERM_LENGTH): + """ + If s has more than nbytes bytes (not characters) then cap it off + after nbytes bytes in a way still producing a valid utf-8 string. + + Assumes that s is a utf-8 string. + + This function useful for working with Xapian terms because Xapian has + a max term length of 245 (which is not very well documented, but see + http://xapian.org/docs/omega/termprefixes.html). + """ + # Check if we can fast-path this string + if (len(s.encode("utf-8")) <= nbytes): + return s + + # We use a StringIO here to avoid mem thrashing via naiive + # string concatenation. See fx. 
http://www.skymind.com/~ocrow/python_string/ + buf = StringIO() + for char in s : + if buf.tell() >= nbytes - 1 : + return buf.getvalue() + buf.write(char.encode("utf-8")) + + return unicode(buf.getvalue().decode("utf-8")) + def expand_type (type_prefix, uri): """ Return a string with a Xapian query matching all child types of 'uri' @@ -564,7 +594,7 @@ doc = self._tokenizer.get_document() for cat in desktop.getCategories(): - doc.add_boolean_term(FILTER_PREFIX_XDG_CATEGORY+cat.lower()) + doc.add_boolean_term(cap_string(FILTER_PREFIX_XDG_CATEGORY+cat.lower())) else: log.debug("Unable to look up app info for %s" % actor) @@ -649,25 +679,25 @@ """Adds the filtering rules to the doc. Filtering rules will not affect the relevancy ranking of the event/doc""" if event.interpretation: - doc.add_boolean_term (FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation) + doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation)) if event.manifestation: - doc.add_boolean_term (FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation) + doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation)) if event.actor: - doc.add_boolean_term (FILTER_PREFIX_ACTOR+mangle_uri(event.actor)) + doc.add_boolean_term (cap_string(FILTER_PREFIX_ACTOR+mangle_uri(event.actor))) for su in event.subjects: if su.uri: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri)) + doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri))) if su.interpretation: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation) + doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation)) if su.manifestation: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation) + doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation)) if su.origin: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin)) + 
doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin))) if su.mimetype: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype) + doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype)) if su.storage: - doc.add_boolean_term (FILTER_PREFIX_SUBJECT_STORAGE+su.storage) + doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_STORAGE+su.storage)) @synchronized (INDEX_LOCK) def _index_event_real (self, event): @@ -766,6 +796,3 @@ return "%s..%sms" % (time_range.begin, time_range.end) -if __name__ == "__main__": - indexer = Indexer(None) - print indexer._compile_filter_query([Event.new_for_values(subject_interpretation="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document")])
_______________________________________________ Mailing list: https://launchpad.net/~zeitgeist Post to : zeitgeist@lists.launchpad.net Unsubscribe : https://launchpad.net/~zeitgeist More help : https://help.launchpad.net/ListHelp