Hello !

After previously reporting this issue here, I now have a working implementation of a "min_word_size" option for Unicode61Tokenizer; see the patch below.

With it applied, here is the result of a simple test:

====

./sqlite3
SQLite version 3.26.0 2018-09-20 20:43:28
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
sqlite> create virtual table tfts using fts5(data, tokenize = 'unicode61 min_word_size 3');
sqlite> create virtual table if not exists tfts_vocab_row USING fts5vocab('tfts', 'row');
sqlite> insert into tfts(data) values('A new way to tokenize using fts5 from sqlite, we can discard n letters word');
sqlite> select * from tfts_vocab_row;
discard|1|1
from|1|1
fts5|1|1
letters|1|1
sqlite|1|1
tokenize|1|1
using|1|1
word|1|1

====

====

fossil diff fts5_tokenize.c
Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -233,10 +233,11 @@
 struct Unicode61Tokenizer {
   unsigned char aTokenChar[128];  /* ASCII range token characters */
   char *aFold;                    /* Buffer to fold text into */
   int nFold;                      /* Size of aFold[] in bytes */
   int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int nMinWordSize;           /* Min size of a word to be indexed */
   int nException;
   int *aiException;

   unsigned char aCategory[32];    /* True for token char categories */
 };
@@ -360,10 +361,11 @@
       const char *zCat = "L* N* Co";
       int i;
       memset(p, 0, sizeof(Unicode61Tokenizer));

       p->bRemoveDiacritic = 1;
+      p->nMinWordSize = 0;
       p->nFold = 64;
       p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
       if( p->aFold==0 ){
         rc = SQLITE_NOMEM;
       }
@@ -393,10 +395,14 @@
         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
           rc = fts5UnicodeAddExceptions(p, zArg, 0);
         }else
         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
           /* no-op */
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
+          int mwsz;
+          if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz;
         }else{
           rc = SQLITE_ERROR;
         }
       }

@@ -450,10 +456,11 @@
   while( rc==SQLITE_OK ){
     int iCode;                    /* non-ASCII codepoint read from input */
     char *zOut = aFold;
     int is;
     int ie;
+    int wsz;

     /* Skip any separator characters. */
     while( 1 ){
       if( zCsr>=zTerm ) goto tokenize_done;
       if( *zCsr & 0x80 ) {
@@ -517,12 +524,15 @@
         zCsr++;
       }
       ie = zCsr - (unsigned char*)pText;
     }

+    wsz = zOut-aFold;
+    /* Check min word size */
+    if(p->nMinWordSize && p->nMinWordSize >= wsz) continue;
     /* Invoke the token callback */
-    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
+    rc = xToken(pCtx, 0, aFold, wsz, is, ie);
   }

  tokenize_done:
   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
   return rc;

====

_______________________________________________
sqlite-users mailing list
sqlite-users@mailinglists.sqlite.org
http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users

Reply via email to