Hi David, I overlooked your email at first. Thanks for your analysis.
I thought that doing the quick fix wasn't enough for my purpose. So I replaced the tokenization part icuNext(). I hope backwards compatibility will not be an issue for this fix. Regards Ben static int icuNext( sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, int *piStartOffset, int *piEndOffset, int *piPosition ){ IcuCursor *pCsr = (IcuCursor *)pCursor; int iStart = ubrk_current(pCsr->pIter); int iEnd = iStart; int breakType = 0; UChar32 c = 0; /* Search for token, skipping punctuation "boundary" tokens */ while (iStart != UBRK_DONE) { iEnd = ubrk_next(pCsr->pIter); breakType = ubrk_getRuleStatus(pCsr->pIter); if (breakType != UBRK_WORD_NONE) { break; } iStart = iEnd; } if( iStart==UBRK_DONE ){ return SQLITE_DONE; } /* Now we have a token. But it still may contain word boundary characters that we don't like, e.g: ( ) */ int isBoundary = 0; do { isBoundary = pCsr->aChar[iStart] == '(' || pCsr->aChar[iStart] == ')' || pCsr->aChar[iStart] == ' ' || pCsr->aChar[iStart] == ':'; if (!isBoundary) { break; } U16_NEXT(pCsr->aChar, iStart, pCsr->nChar, c); } while (iStart < iEnd); /* We couldn't find any character that is not a boundary up to the end of the text. Done. 
*/ if( isBoundary ){ return SQLITE_DONE; } int limit = iEnd; iEnd = iStart; do { isBoundary = pCsr->aChar[iEnd] == '(' || pCsr->aChar[iEnd] == ')' || pCsr->aChar[iEnd] == ' ' || pCsr->aChar[iEnd] == ':'; if (isBoundary) { break; } U16_NEXT(pCsr->aChar, iEnd, pCsr->nChar, c); // Note: U16_PREV not working here } while (iEnd < limit); assert(iStart<=iEnd); int nByte = 0; do { UErrorCode status = U_ZERO_ERROR; if( nByte ){ char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); if( !zNew ){ return SQLITE_NOMEM; } pCsr->zBuffer = zNew; pCsr->nBuffer = nByte; } u_strToUTF8( pCsr->zBuffer, pCsr->nBuffer, &nByte, &pCsr->aChar[iStart], iEnd-iStart, &status ); } while( nByte>pCsr->nBuffer ); *ppToken = pCsr->zBuffer; *pnBytes = nByte; *piStartOffset = pCsr->aOffset[iStart]; *piEndOffset = pCsr->aOffset[iEnd]; *piPosition = pCsr->iToken++; return SQLITE_OK; } Am 13.04.14 22:07 schrieb "David Hedley" unter <david.hed...@vistair.com>: >This is definitely a bug in sqlite. I have experienced it too. > >The problem stems from "getNextToken(...)" expecting to find the >parentheses in the token delimiters (rather than the tokens themselves). >The ICU tokenizer returns the parentheses as tokens, rather than ignoring >them as delimiters as the simple tokenizer does. > >Two possible fixes: >1. Fix getNextToken(...) to look in tokens as well as delimiters for >parentheses >2. Fix icuNext to not return parentheses as tokens. > >To me, option 1. seemed easier to do a quick hack to, until there is an >official fix. > >In getNextToken, I changed: > if (rc == SQLITE_DONE) iStart = n; > for (i = 0; i < iStart i++) { > if (z[i] == '(') { > >to: > > if (rc == SQLITE_DONE) iStart = n; > for (i = 0; i < iEnd; i++) { // 2014-04-12 DCRH: >Tweak to make parens work with ICU tokenizer > if (z[i] == '(') { > >That way, it now searches the token text in addition to the preceding >delimiters, and parentheses now work correctly with the ICU tokenizer. 
> >Hope this helps, > >David >-- >David Hedley >CTO >Vistair Systems Ltd >Mobile: +44 (0)7971 681088 >Tex: 0845 VISTAIR (8478247) / +44 1454 616531 >Fax: 0870 1350992 >-- >Information in this electronic mail message is confidential and may be >legally privileged. It is intended solely for the addressee. Access to >this message by anyone else is unauthorised. If you are not the intended >recipient any use, disclosure, copying or distribution of this message is >prohibited and may be unlawful. When addressed to our customers, any >information contained in this message is subject to Vistair Systems Ltd >Terms and Conditions. > >Vistair Systems Ltd is registered in England and Wales #5418081 > > > >_______________________________________________ >sqlite-users mailing list >sqlite-users@sqlite.org >http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users _______________________________________________ sqlite-users mailing list sqlite-users@sqlite.org http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users