Hello Please see the source code at the bottom. It represents custom NOCASE collation to be used with iOS, i.e. it is registered as int rc = sqlite3_create_collation(db, "NOCASE", SQLITE_UTF8, 0, sqlite3_collate); Main reason for this collation is the performance. It uses ascii comparison if possible and falls bask to system compare function in other cases. Here are my questions: a) What kind of risks do you see? b) How would you construct a unit test for this collation? Ad b) I was thinking of evaluating SQL commands such as "SELECT ? < ? COLLATE NOCASE" for selected strings. This way one could verify relations such as reflexivity/symmetry/transitivity for specific sets of strings. The question is, however, where to find good test sets. Best regards Jan Slodicka static char g_bUseAscii=-1;SQLITE_EXTERN int sqlite3_collate(void *NotUsed, int nKey1, const void *pKey1, int nKey2, const void *pKey2){ // Test on arg1==arg2 // Benchmark: When processing words extracted from a novelle, this brings up 3-4% speed increase. if( nKey1==nKey2 && ((UInt8*)pKey1)[0]==((UInt8*)pKey2)[0] && memcmp(pKey1,pKey2,nKey1)==0 ) return 0; // Try ascii comparison. // Benchmarks: English text - 6x faster if( g_bUseAscii == -1 ) { // At first exclude languages, where ascii sorting produces bad results because of digraphs: // - Albanian: DH, GJ, LL, RR, SH, TH, XH, ZH // - Croatia: LJ, NJ // - Czech/Slovak: CH // DZ - unimportant, it sorts identically to D+Z // - Hungary: CS, DZ, DZS, GY, LY, NY, SZ, TY, ZS // Other european languages with latin-based alphabet do not use digraphs CFArrayRef langs = CFLocaleCopyPreferredLanguages(); CFStringRef langCode = (CFStringRef)CFArrayGetValueAtIndex(langs, 0); CFStringRef sk = CFSTR("sk"); CFStringRef cz = CFSTR("cz"); CFStringRef hr = CFSTR("hr"); CFStringRef hu = CFSTR("hu"); CFStringRef sq = CFSTR("sq"); if (CFStringCompare(langCode, sk, 0) == kCFCompareEqualTo || //Slovak CFStringCompare(langCode, cz, 0) == kCFCompareEqualTo || //Czech CFStringCompare(langCode, hr, 0) == kCFCompareEqualTo || //Croatia CFStringCompare(langCode, hu, 0) == kCFCompareEqualTo || //Hungary CFStringCompare(langCode, sq, 0) == kCFCompareEqualTo) //Albania g_bUseAscii = 0; else g_bUseAscii = 1; CFRelease(langs); } if( g_bUseAscii ) { // Utf8 codes smaller than 0x80 are identical with ASCII. // A..Z = 0x41-0x5A, a..z = 0x61-0x7A int nn = nKey1<nKey2 ? nKey1 : nKey2; int i; for( i=0 ; i < nn ; ++i ) { uchar c1 = ((uchar*)pKey1)[i]; uchar c2 = ((uchar*)pKey2)[i]; uchar l1 = c1 | 0x20; // lowercase uchar l2 = c2 | 0x20; // lowercase // Alternative A: Dangerous - special chars may sort differently than ASCII #define SAFE_CHAR(x) ( (0x20<=(x) && (x)<=0x7A)) //so far use this till we find better solution (similar as it was before) // Alternative B: Safer, but slower //#define SAFE_CHAR(x) ( (0x61<=(x) && (x)<=0x7A) || (0x30<=(x) && (x)<=0x39) || (x)==32 ) if( !SAFE_CHAR(l1) || !SAFE_CHAR(l2) ) goto labelFullAnalysis; if( l1!=l2 ) return l1<l2 ? -1 : 1; } // Test string leftover if( nKey1 == nKey2 ) { return 0; } else { // Next command is the simplest (and most logical) choice. However, it sorts differently some special cases, for example // Babe, Babe's //return nKey1 < nKey2 ? -1 : 1; // Hence, if the leftover starts with a symbol, we'll go for standard procedure. uchar next = nKey1<nKey2 ? ((uchar*)pKey2)[nn] : ((uchar*)pKey1)[nn]; next |= 0x20; if( next<0x61 || next>0x7A ) goto labelFullAnalysis; // Now, we can reliably test the leftover. return nKey1 < nKey2 ? -1 : 1; } }labelFullAnalysis: { CFStringRef s1 = CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey1, nKey1, kCFStringEncodingUTF8, NO, kCFAllocatorNull); CFStringRef s2 = CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey2, nKey2, kCFStringEncodingUTF8, NO, kCFAllocatorNull); int result = CFStringCompare(s1, s2, kCFCompareCaseInsensitive|kCFCompareLocalized|kCFCompareDiacriticInsensitive|kCFCompareNonliteral); CFRelease(s2), s2 = nil; CFRelease(s1), s1 = nil; return result; }}
-- View this message in context: http://sqlite.1065341.n5.nabble.com/Collation-advice-tp70668.html Sent from the SQLite mailing list archive at Nabble.com. _______________________________________________ sqlite-users mailing list sqlite-users@sqlite.org http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users