Hello!
After previously reporting this issue here, I now have a working
implementation of a "min_word_size" option for Unicode61Tokenizer; see the
patch below.
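With it, here is the result of a simple test (note that, as the patch is
written, tokens whose folded length is 3 bytes or fewer are discarded, so
"new", "way" and "can" do not appear in the vocabulary):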
====
./sqlite3
SQLite version 3.26.0 2018-09-20 20:43:28
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
sqlite> create virtual table tfts using fts5(data, tokenize = 'unicode61
min_word_size 3');
sqlite> create virtual table if not exists tfts_vocab_row USING
fts5vocab('tfts', 'row');
sqlite> insert into tfts(data) values('A new way to tokenize using fts5
from sqlite, we can discard n letters word');
sqlite> select * from tfts_vocab_row;
discard|1|1
from|1|1
fts5|1|1
letters|1|1
sqlite|1|1
tokenize|1|1
using|1|1
word|1|1
====
====
fossil diff fts5_tokenize.c
Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -233,10 +233,11 @@
struct Unicode61Tokenizer {
unsigned char aTokenChar[128]; /* ASCII range token characters */
char *aFold; /* Buffer to fold text into */
int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
+ int nMinWordSize; /* Min size of a word to be indexed */
int nException;
int *aiException;
unsigned char aCategory[32]; /* True for token char categories */
};
@@ -360,10 +361,11 @@
const char *zCat = "L* N* Co";
int i;
memset(p, 0, sizeof(Unicode61Tokenizer));
p->bRemoveDiacritic = 1;
+ p->nMinWordSize = 0;
p->nFold = 64;
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
if( p->aFold==0 ){
rc = SQLITE_NOMEM;
}
@@ -393,10 +395,14 @@
if( 0==sqlite3_stricmp(azArg[i], "separators") ){
rc = fts5UnicodeAddExceptions(p, zArg, 0);
}else
if( 0==sqlite3_stricmp(azArg[i], "categories") ){
/* no-op */
+ }else
+ if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
+ int mwsz;
+ if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz;
}else{
rc = SQLITE_ERROR;
}
}
@@ -450,10 +456,11 @@
while( rc==SQLITE_OK ){
int iCode; /* non-ASCII codepoint read from
input */
char *zOut = aFold;
int is;
int ie;
+ int wsz;
/* Skip any separator characters. */
while( 1 ){
if( zCsr>=zTerm ) goto tokenize_done;
if( *zCsr & 0x80 ) {
@@ -517,12 +524,15 @@
zCsr++;
}
ie = zCsr - (unsigned char*)pText;
}
+ wsz = zOut-aFold;
+ /* Check min word size */
+ if(p->nMinWordSize && p->nMinWordSize >= wsz) continue;
/* Invoke the token callback */
- rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
+ rc = xToken(pCtx, 0, aFold, wsz, is, ie);
}
tokenize_done:
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc;
====
_______________________________________________
sqlite-users mailing list
sqlite-users@mailinglists.sqlite.org
http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users