commit a2a5117f770e0c2bbecd6d33d6ea3b536a5107b2
Author: dprunier <dominique.prunier@watch4net.com>
Date:   Thu Mar 8 13:31:04 2012

    new CS_PATTERN_MATCH define added to match LIKE patterns case-sensitively and perform specific optimizations

diff --git a/src/category.cpp b/src/category.cpp
index cb178dc..bff0e4e 100644
--- a/src/category.cpp
+++ b/src/category.cpp
@@ -590,13 +590,12 @@ long ibis::category::patternSearch(const char *pat) const {
 	<< '.' << m_name << "]::patternSearch starting to match pattern "
 	<< pat;
     long est = 0;
-    const uint32_t nd = dic.size();
-    for (uint32_t j = 1; j <= nd; ++ j) {
-	if (ibis::util::strMatch(dic[j], pat)) {
-	    const ibis::bitvector *bv = rlc->getBitvector(j);
-	    if (bv != 0)
-		est += bv->cnt();
-	}
+    std::auto_ptr< ibis::array_t<uint32_t> > tmp(new ibis::array_t<uint32_t>);
+    dic.patternSearch(pat, *tmp);
+    // tmp holds dictionary codes; a code, not its position in tmp, selects the bitmap
+    for (uint32_t j = 0; j < tmp->size(); ++ j) {
+	const ibis::bitvector *bv = rlc->getBitvector((*tmp)[j]);
+	if (bv != 0) est += bv->cnt();
     }
     return est;
 } // ibis::category::patternSearch
@@ -631,21 +630,25 @@ long ibis::category::patternSearch(const char *pat,
 	<< pat;
     long est = 0;
     uint32_t cnt = 0;
-    const uint32_t nd = dic.size();
-    for (uint32_t j = 1; j <= nd; ++ j) {
-	if (ibis::util::strMatch(dic[j], pat)) {
-	    const ibis::bitvector *bv = rlc->getBitvector(j);
-	    if (bv != 0) {
-		++ cnt;
-		est += bv->cnt();
-		if (hits.empty()) {
-		    hits.copy(*bv);
-		}
-		else {
-		    if (cnt > 32 || (j > 3 && cnt*16 > j))
-			hits.decompress();
-		    hits |= *bv;
-		}
+    std::auto_ptr< ibis::array_t<uint32_t> > tmp(new ibis::array_t<uint32_t>);
+    dic.patternSearch(pat, *tmp);
+    // tmp holds the dictionary codes of the matching keys; each code (not
+    // its position in tmp) identifies the bitmap to OR into hits
+    for (uint32_t j = 0; j < tmp->size(); ++ j) {
+	const ibis::bitvector *bv = rlc->getBitvector((*tmp)[j]);
+	if (bv != 0) {
+	    ++ cnt;
+	    est += bv->cnt();
+	    if (hits.empty()) {
+		hits.copy(*bv);
+	    }
+	    else {
+		// NOTE(review): j now counts matches rather than dictionary
+		// entries scanned, so this density test fires more readily
+		// than it did before -- confirm the threshold is still wanted
+		if (cnt > 32 || (j > 3 && cnt*16 > j))
+		    hits.decompress();
+		hits |= *bv;
 	    }
 	}
     }
diff --git a/src/dictionary.cpp b/src/dictionary.cpp
index aca4ec8..8632964 100755
--- a/src/dictionary.cpp
+++ b/src/dictionary.cpp
@@ -368,6 +368,133 @@ void ibis::dictionary::clear() {
     raw_[0] = 0;
 } // ibis::dictionary::clear
 
+/// Find the codes of all dictionary entries matching a SQL LIKE pattern.
+/// The code of each matching entry is appended to matches; the existing
+/// content of matches is kept.  If the pattern is null or empty, or the
+/// dictionary is empty or its member arrays have inconsistent sizes,
+/// matches is not changed.
+///
+/// When CS_PATTERN_MATCH is defined, the literal characters preceding the
+/// first meta character (STRMATCH_META_*) are used to restrict the keys
+/// that are tested.  This assumes key_ is sorted in strcmp order.
+void ibis::dictionary::patternSearch(const char* pat, array_t<uint32_t>& matches) const {
+    if (pat == 0) return;
+    if (*pat == 0) return;
+    if (key_.size() == 0) return;
+    if (! (code_.size() == key_.size() &&
+	   key_.size()+1 == raw_.size())) {
+	LOGGER(ibis::gVerbose > 0)
+	    << "Warning -- dictionary::patternSearch(" << pat
+	    << ") can not proceed because the member variables have "
+	    "inconsistent sizes: raw_.size(" << raw_.size() << "), key_.size("
+	    << key_.size() << "), and code_.size(" << code_.size() << ')';
+	return;
+    }
+
+#ifndef CS_PATTERN_MATCH
+    // case insensitive, test all values
+    const uint32_t nd = key_.size();
+    for (uint32_t j = 0; j < nd; ++ j) {
+	if (ibis::util::strMatch(key_[j], pat)) {
+	    matches.push_back(code_[j]);
+	}
+    }
+#else
+    // case sensitive, extract longest constant prefix to restrict range
+    size_t pos = 0;
+    bool esc = false;
+    bool meta = false;
+    std::string prefix;
+    const size_t len = strlen(pat);
+    for (; pos < len; ++ pos) {
+	const char c = pat[pos];
+	if (esc) { // c follows the escape character, take it literally
+	    prefix.append(1, c);
+	    esc = false;
+	}
+	else if (c == STRMATCH_META_ESCAPE) {
+	    esc = true;
+	}
+	else if (c == STRMATCH_META_CSH_ANY || c == STRMATCH_META_CSH_ONE ||
+		 c == STRMATCH_META_SQL_ANY || c == STRMATCH_META_SQL_ONE) {
+	    meta = true; // leave pos on the first meta character
+	    break;
+	}
+	else {
+	    prefix.append(1, c);
+	}
+    }
+
+    // if there is no meta char, find the string directly
+    if (!meta) {
+	// operator[] returns 0 for an empty string and size()+1 for an
+	// unknown one; neither value is the code of a dictionary entry
+	const uint32_t code = operator[](prefix.c_str());
+	if (code != 0 && code != size() + 1) {
+	    matches.push_back(code);
+	}
+	return;
+    }
+
+    // locate the range [first, last) of keys that start with the prefix;
+    // an empty prefix leaves every key as a candidate
+    const char* const head = prefix.c_str();
+    const size_t nhead = prefix.size();
+    const uint32_t nkeys = key_.size();
+    uint32_t first = 0;
+    uint32_t last = nkeys;
+    if (nhead > 0 && nkeys < 16) {
+	// use linear search
+	uint32_t m = 0;
+	while (m < nkeys && strncmp(key_[m], head, nhead) < 0)
+	    ++ m;
+	if (m >= nkeys || strncmp(key_[m], head, nhead) != 0)
+	    return; // no key starts with the prefix
+	first = m;
+	for (++ m; m < nkeys; ++ m) {
+	    if (strncmp(key_[m], head, nhead) != 0)
+		break;
+	}
+	last = m;
+    }
+    else if (nhead > 0) {
+	// find lower bound using binary search: first key >= prefix
+	uint32_t b = 0;
+	uint32_t e = nkeys;
+	while (b < e) {
+	    const uint32_t m = b + (e - b) / 2;
+	    if (strncmp(key_[m], head, nhead) < 0)
+		b = m + 1;
+	    else
+		e = m;
+	}
+	if (b >= nkeys || strncmp(key_[b], head, nhead) != 0)
+	    return; // no key starts with the prefix
+	first = b;
+
+	// find upper bound using binary search: first key > prefix
+	e = nkeys;
+	while (b < e) {
+	    const uint32_t m = b + (e - b) / 2;
+	    if (strncmp(key_[m], head, nhead) > 0)
+		e = m;
+	    else
+		b = m + 1;
+	}
+	last = b;
+    }
+
+    // match the rest of each candidate key against the rest of the
+    // pattern, which starts at the first meta character
+    const char* const tail = pat + pos;
+    for (uint32_t j = first; j < last; ++ j) {
+	if (ibis::util::strMatch(key_[j] + nhead, tail)) {
+	    matches.push_back(code_[j]);
+	}
+    }
+#endif
+} // ibis::dictionary::patternSearch
+
 /// Convert a string to its integer code.  Returns 0 for empty (null)
 /// strings, 1:size() for strings in the dictionary, and
 /// dictionary::size()+1 for unknown values.
diff --git a/src/dictionary.h b/src/dictionary.h
index c5cd18a..de48918 100755
--- a/src/dictionary.h
+++ b/src/dictionary.h
@@ -23,6 +23,7 @@ public:
     const char* operator[](uint32_t i) const;
     uint32_t operator[](const char* str) const;
     const char* find(const char* str) const;
+    void patternSearch(const char* pat, array_t<uint32_t>& matches) const; // appends the codes of entries matching LIKE pattern pat
     uint32_t insert(const char* str);
     uint32_t insertRaw(char* str);
 
 
diff --git a/src/util.cpp b/src/util.cpp
index 1f3349a..40266cb 100755
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -1788,13 +1788,6 @@ ibis::util::timer::~timer() {
     }
 } // ibis::util::timer::~timer
 
-// The meta characters used in ibis::util::strMatch.  
-#define STRMATCH_META_CSH_ANY '*'
-#define STRMATCH_META_CSH_ONE '?'
-#define STRMATCH_META_SQL_ANY '%'
-#define STRMATCH_META_SQL_ONE '_'
-#define STRMATCH_META_ESCAPE '\\'
-
 /// If the whole string matches the pattern, this function returns true,
 /// otherwise, it returns false.  The special cases are (1) if the two
 /// pointers are the same, it returns true; (2) if both arguments point to
@@ -1845,9 +1838,17 @@ bool ibis::util::strMatch(const char *str, const char *pat) {
     const char *s1 = strpbrk(pat, metaList);
     const long int nhead = s1 - pat;
     if (s1 < pat) { // no meta character
+#ifdef CS_PATTERN_MATCH // compare the whole string case sensitively
+	return (0 == strcmp(str, pat));
+#else // CS_PATTERN_MATCH not defined: ignore case
 	return (0 == stricmp(str, pat));
+#endif // CS_PATTERN_MATCH
     }
+#ifdef CS_PATTERN_MATCH // compare the first nhead characters case sensitively
+    else if (s1 > pat && 0 != strncmp(str, pat, nhead)) {
+#else // CS_PATTERN_MATCH not defined: ignore case
     else if (s1 > pat && 0 != strnicmp(str, pat, nhead)) {
+#endif // CS_PATTERN_MATCH
 	// characters before the first meta character do not match
 	return false;
     }
@@ -1927,7 +1928,11 @@ bool ibis::util::strMatch(const char *str, const char *pat) {
 	if (nstr < ntail)
 	    return false;
 	else
+#ifdef CS_PATTERN_MATCH // compare s1 with the last ntail characters of s0 case sensitively
+	    return (0 == strcmp(s1, s0+(nstr-ntail)));
+#else // CS_PATTERN_MATCH not defined: ignore case
 	    return (0 == stricmp(s1, s0+(nstr-ntail)));
+#endif // CS_PATTERN_MATCH
     }
 
     const std::string anchor(s1, s2);
diff --git a/src/util.h b/src/util.h
index bcb8b9a..ae534ae 100755
--- a/src/util.h
+++ b/src/util.h
@@ -263,6 +263,16 @@ int truncate(const char*, uint32_t);
 #endif
 #endif
 
+// When defined, LIKE patterns are matched case sensitively, which speeds up the match.
+#define CS_PATTERN_MATCH // NOTE(review): defined unconditionally, so the case-insensitive branches are never compiled
+
+// The meta characters used in ibis::util::strMatch and ibis::dictionary::patternSearch.
+#define STRMATCH_META_CSH_ANY '*'
+#define STRMATCH_META_CSH_ONE '?'
+#define STRMATCH_META_SQL_ANY '%'
+#define STRMATCH_META_SQL_ONE '_'
+#define STRMATCH_META_ESCAPE '\\'
+
 // // The function isfinite is a macro defined in math.h according to
 // // opengroup.org.  As of 2011, only MS visual studio does not have a
 // // definition for isfinite, but it has _finite in float,h.
