shiyao.ma added the comment:

This patch splits text in the following form,

suppose TEXT = "aa bb ee.ff.gg"

the split words will be: aa, bb, ee, ff, gg, ee.ff, ff.gg, ee.ff.gg

IOW, new words are connected by the dot.

When searching through the web interface, for example if the text is "kk hh.pp", 
then the split words are "kk" and "hh.pp".
IOW, when searching, we take dot-separated words as a whole.


The handling for the CSV interface and the Xapian-based indexer is not modified. 
If the above form is okay, I will do the remaining stuff.

----------
nosy: +ezio.melotti

_______________________________________________________
PSF Meta Tracker <metatrac...@psf.upfronthosting.co.za>
<http://psf.upfronthosting.co.za/roundup/meta/issue515>
_______________________________________________________
diff --git a/roundup/backends/indexer_common.py b/roundup/backends/indexer_common.py
--- a/roundup/backends/indexer_common.py
+++ b/roundup/backends/indexer_common.py
@@ -23,6 +23,28 @@
         # gibberish (encoded text or somesuch) or shorter than 2 characters
         self.minlength = 2
         self.maxlength = 25
+        self.dot_maxlength = 10
+        self.dot_maxrepeat = 2
+        self.pattern_word = re.compile(
+            r'\b\w{%d,%d}\b'
+                % (self.minlength, self.maxlength),
+            re.UNICODE)
+        self.pattern_dot = re.compile(
+            r'\b(\w{1,%d}\.){1,%d}\w{1,%d}\b' %
+                (self.dot_maxlength, self.dot_maxrepeat, self.dot_maxlength),
+            re.UNICODE)
+
+    def segment_text(self, text):
+        wordlist = [w for w in re.findall(self.pattern_word, text)]
+        match = re.search(self.pattern_dot, text)
+        while match:
+            words = match.group().split('.')
+            for length in range(2, len(words)+1):
+                for idx in range(len(words)-length+1):
+                    wordlist.append('.'.join(words[idx:idx+length]))
+            text = text[match.end():]
+            match = re.search(self.pattern_dot, text)
+        return set(w.encode('utf8') for w in wordlist)
 
     def is_stopword(self, word):
         return word in self.stopwords
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
--- a/roundup/backends/indexer_rdbms.py
+++ b/roundup/backends/indexer_rdbms.py
@@ -64,9 +64,7 @@
         if not isinstance(text, unicode):
             text = unicode(text, "utf-8", "replace")
         text = text.upper()
-        wordlist = [w.encode("utf-8")
-                    for w in re.findall(r'(?u)\b\w{%d,%d}\b'
-                                        % (self.minlength, self.maxlength), text)]
+        wordlist = self.segment_text(text)
         words = set()
         for word in wordlist:
             if self.is_stopword(word): continue
diff --git a/roundup/cgi/templating.py b/roundup/cgi/templating.py
--- a/roundup/cgi/templating.py
+++ b/roundup/cgi/templating.py
@@ -2800,7 +2800,7 @@
         if self.search_text:
             matches = self.client.db.indexer.search(
                 [w.upper().encode("utf-8", "replace") for w in re.findall(
-                    r'(?u)\b\w{2,25}\b',
+                    r'(?u)\b(?:\w|\.){2,40}\b',
                     unicode(self.search_text, "utf-8", "replace")
                 )], klass, ignore)
         else:
_______________________________________________
Tracker-discuss mailing list
Tracker-discuss@python.org
https://mail.python.org/mailman/listinfo/tracker-discuss

Reply via email to