Hello
Please see the source code at the bottom.
It represents custom NOCASE collation to be used with iOS, i.e. it is
registered as
int rc = sqlite3_create_collation(db, "NOCASE", SQLITE_UTF8, 0,
sqlite3_collate);
Main reason for this collation is the performance. It uses ascii comparison
if possible and falls bask to system compare function in other cases.
Here are my questions:
a) What kind of risks do you see?
b) How would you construct a unit test for this collation?
Ad b)
I was thinking of evaluating SQL commands such as "SELECT ? < ? COLLATE
NOCASE" for selected strings. This way one could verify relations such as
reflexivity/symmetry/transitivity for specific sets of strings. The question
is, however, where to find good test sets.
Best regards
Jan Slodicka
static char g_bUseAscii=-1;SQLITE_EXTERN int sqlite3_collate(void *NotUsed,
int nKey1, const void *pKey1, int nKey2, const void *pKey2){ // Test on
arg1==arg2 // Benchmark: When processing words extracted from a novelle,
this brings up 3-4% speed increase. if( nKey1==nKey2 &&
((UInt8*)pKey1)[0]==((UInt8*)pKey2)[0] && memcmp(pKey1,pKey2,nKey1)==0 )
return 0; // Try ascii comparison. // Benchmarks: English text - 6x
faster if( g_bUseAscii == -1 ) { // At first exclude languages,
where ascii
sorting produces bad results because of digraphs: // - Albanian:
DH, GJ,
LL, RR, SH, TH, XH, ZH // - Croatia: LJ, NJ // -
Czech/Slovak: CH // DZ
- unimportant, it sorts identically to D+Z // - Hungary: CS, DZ,
DZS, GY,
LY, NY, SZ, TY, ZS // Other european languages with latin-based
alphabet do
not use digraphs CFArrayRef langs = CFLocaleCopyPreferredLanguages();
CFStringRef langCode = (CFStringRef)CFArrayGetValueAtIndex(langs, 0);
CFStringRef sk = CFSTR("sk"); CFStringRef cz = CFSTR("cz");
CFStringRef hr = CFSTR("hr"); CFStringRef hu = CFSTR("hu");
CFStringRef sq = CFSTR("sq"); if (CFStringCompare(langCode,
sk, 0) == kCFCompareEqualTo || //Slovak
CFStringCompare(langCode, cz, 0) == kCFCompareEqualTo || //Czech
CFStringCompare(langCode, hr, 0) == kCFCompareEqualTo || //Croatia
CFStringCompare(langCode, hu, 0) == kCFCompareEqualTo || //Hungary
CFStringCompare(langCode, sq, 0) == kCFCompareEqualTo) //Albania
g_bUseAscii = 0; else g_bUseAscii = 1;
CFRelease(langs); }
if( g_bUseAscii ) { // Utf8 codes smaller than 0x80 are
identical with
ASCII. // A..Z = 0x41-0x5A, a..z = 0x61-0x7A int nn =
nKey1<nKey2 ? nKey1
: nKey2; int i; for( i=0 ; i < nn ; ++i )
{ uchar c1 =
((uchar*)pKey1)[i]; uchar c2 = ((uchar*)pKey2)[i];
uchar l1 = c1 | 0x20;
// lowercase uchar l2 = c2 | 0x20; //
lowercase //
Alternative A: Dangerous - special chars may sort differently than ASCII
#define SAFE_CHAR(x) ( (0x20<=(x) && (x)<=0x7A)) //so
far use
this till we find better solution (similar as it was before)
//
Alternative B: Safer, but slower //#define SAFE_CHAR(x)
( (0x61<=(x)
&& (x)<=0x7A) || (0x30<=(x) && (x)<=0x39) ||
(x)==32 ) if( !SAFE_CHAR(l1) || !SAFE_CHAR(l2) )
goto
labelFullAnalysis; if( l1!=l2 )
return l1<l2 ? -1 : 1;
} // Test string leftover if( nKey1 == nKey2 )
{ return 0; }
else { // Next command is the simplest (and
most logical) choice.
However, it sorts differently some special cases, for example
// Babe,
Babe's //return nKey1 < nKey2 ? -1 : 1;
// Hence, if the
leftover starts with a symbol, we'll go for standard procedure.
uchar next
= nKey1<nKey2 ? ((uchar*)pKey2)[nn] : ((uchar*)pKey1)[nn];
next |=
0x20; if( next<0x61 || next>0x7A )
goto labelFullAnalysis;
// Now, we can reliably test the leftover. return nKey1 <
nKey2 ? -1 : 1;
} }labelFullAnalysis: { CFStringRef s1 =
CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey1,
nKey1, kCFStringEncodingUTF8, NO, kCFAllocatorNull); CFStringRef s2 =
CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey2,
nKey2, kCFStringEncodingUTF8, NO, kCFAllocatorNull); int result =
CFStringCompare(s1, s2,
kCFCompareCaseInsensitive|kCFCompareLocalized|kCFCompareDiacriticInsensitive|kCFCompareNonliteral);
CFRelease(s2), s2 = nil; CFRelease(s1), s1 = nil; return
result; }}
--
View this message in context:
http://sqlite.1065341.n5.nabble.com/Collation-advice-tp70668.html
Sent from the SQLite mailing list archive at Nabble.com.
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users