[sqlite] Collation advice

Jan Slodicka Mon, 26 Aug 2013 08:45:44 -0700

Hello
Please see the source code at the bottom.
It represents custom NOCASE collation to be used with iOS, i.e. it is
registered as
    int rc = sqlite3_create_collation(db, "NOCASE", SQLITE_UTF8, 0,
sqlite3_collate);
Main reason for this collation is the performance. It uses ascii comparison
if possible and falls bask to system compare function in other cases.
Here are my questions:
a) What kind of risks do you see?
b) How would you construct a unit test for this collation?
Ad b)
I was thinking of evaluating SQL commands such as "SELECT ? < ? COLLATE
NOCASE" for selected strings. This way one could verify relations such as
reflexivity/symmetry/transitivity for specific sets of strings. The question
is, however, where to find good test sets.
Best regards
    Jan Slodicka
static char g_bUseAscii=-1;SQLITE_EXTERN int sqlite3_collate(void *NotUsed,
int nKey1, const void *pKey1, int nKey2, const void *pKey2){    // Test on
arg1==arg2      // Benchmark: When processing words extracted from a novelle,
this brings up 3-4% speed increase.     if( nKey1==nKey2 &&
((UInt8*)pKey1)[0]==((UInt8*)pKey2)[0] && memcmp(pKey1,pKey2,nKey1)==0 )        
return 0;        // Try ascii comparison.       // Benchmarks: English text - 6x
faster  if( g_bUseAscii == -1 ) {               // At first exclude languages, 
where ascii
sorting produces bad results because of digraphs:               // - Albanian: 
DH, GJ,
LL, RR, SH, TH, XH, ZH          // - Croatia: LJ, NJ            // - 
Czech/Slovak: CH                   // DZ
- unimportant, it sorts identically to D+Z              // - Hungary: CS, DZ, 
DZS, GY,
LY, NY, SZ, TY, ZS              // Other european languages with latin-based 
alphabet do
not use digraphs        CFArrayRef langs = CFLocaleCopyPreferredLanguages();    
   
CFStringRef langCode = (CFStringRef)CFArrayGetValueAtIndex(langs, 0);       
CFStringRef sk = CFSTR("sk");        CFStringRef cz = CFSTR("cz");       
CFStringRef hr = CFSTR("hr");        CFStringRef hu = CFSTR("hu");       
CFStringRef sq = CFSTR("sq");                if (CFStringCompare(langCode,
sk, 0) == kCFCompareEqualTo ||     //Slovak           
CFStringCompare(langCode, cz, 0) == kCFCompareEqualTo ||     //Czech           
CFStringCompare(langCode, hr, 0) == kCFCompareEqualTo ||     //Croatia          
 
CFStringCompare(langCode, hu, 0) == kCFCompareEqualTo ||     //Hungary          
 
CFStringCompare(langCode, sq, 0) == kCFCompareEqualTo)       //Albania          
g_bUseAscii = 0;                else                    g_bUseAscii = 1;        
        CFRelease(langs);       }
if( g_bUseAscii )       {               // Utf8 codes smaller than 0x80 are 
identical with
ASCII.          // A..Z = 0x41-0x5A, a..z = 0x61-0x7A           int nn = 
nKey1<nKey2 ? nKey1
: nKey2;                int i;          for( i=0 ; i &lt; nn ; ++i )            
{                       uchar c1 =
((uchar*)pKey1)[i];                     uchar c2 = ((uchar*)pKey2)[i];          
        uchar l1 = c1 | 0x20;           
// lowercase                    uchar l2 = c2 | 0x20;                   // 
lowercase                                    //
Alternative A: Dangerous - special chars may sort differently than ASCII        
        
#define SAFE_CHAR(x)    ( (0x20&lt;=(x) &amp;&amp; (x)&lt;=0x7A))       //so 
far use
this till we find better solution (similar as it was before)                    
//
Alternative B: Safer, but slower                        //#define SAFE_CHAR(x)  
( (0x61&lt;=(x)
&amp;&amp; (x)&lt;=0x7A)  ||  (0x30&lt;=(x) &amp;&amp; (x)&lt;=0x39)  || 
(x)==32 )                               if( !SAFE_CHAR(l1) || !SAFE_CHAR(l2) )  
                        goto
labelFullAnalysis;                              if( l1!=l2 )                    
        return l1&lt;l2 ? -1 : 1;       
}                       // Test string leftover         if( nKey1 == nKey2 )    
        {                       return 0;               }       
else            {                       // Next command is the simplest (and 
most logical) choice.
However, it sorts differently some special cases, for example                   
//              Babe,
Babe's                  //return nKey1 &lt; nKey2 ? -1 : 1;                     
        // Hence, if the
leftover starts with a symbol, we'll go for standard procedure.                 
uchar next
= nKey1&lt;nKey2 ? ((uchar*)pKey2)[nn] : ((uchar*)pKey1)[nn];                   
next |=
0x20;                   if( next&lt;0x61 || next>0x7A )                         
goto labelFullAnalysis;                         
// Now, we can reliably test the leftover.                      return nKey1 < 
nKey2 ? -1 : 1;  
}       }labelFullAnalysis:    {        CFStringRef s1 =
CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey1,
nKey1, kCFStringEncodingUTF8, NO, kCFAllocatorNull);        CFStringRef s2 =
CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, (const UInt8*)pKey2,
nKey2, kCFStringEncodingUTF8, NO, kCFAllocatorNull);        int result =
CFStringCompare(s1, s2,
kCFCompareCaseInsensitive|kCFCompareLocalized|kCFCompareDiacriticInsensitive|kCFCompareNonliteral);
       
CFRelease(s2), s2 = nil;        CFRelease(s1), s1 = nil;        return
result;    }}





--
View this message in context: 
http://sqlite.1065341.n5.nabble.com/Collation-advice-tp70668.html
Sent from the SQLite mailing list archive at Nabble.com.
_______________________________________________
sqlite-users mailing list
sqlite-users@sqlite.org
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users

[sqlite] Collation advice

Reply via email to