commit 618312b3698fe5ee41edb567817a2cc7cf5f30fd
Author: dprunier <dominique.prunier@watch4net.com>
Date:   Thu Mar 8 13:31:04 2012

    new CS_PATTERN_MATCH define added to match LIKE patterns case-sensitively and perform specific optimizations

diff --git a/src/category.cpp b/src/category.cpp
index cb178dc..5542135 100644
--- a/src/category.cpp
+++ b/src/category.cpp
@@ -590,6 +590,7 @@ long ibis::category::patternSearch(const char *pat) const {
 	<< '.' << m_name << "]::patternSearch starting to match pattern "
 	<< pat;
     long est = 0;
+#ifndef CS_PATTERN_MATCH
     const uint32_t nd = dic.size();
     for (uint32_t j = 1; j <= nd; ++ j) {
 	if (ibis::util::strMatch(dic[j], pat)) {
@@ -598,6 +599,15 @@ long ibis::category::patternSearch(const char *pat) const {
 		est += bv->cnt();
 	}
     }
+#else
+    // tmp receives the dictionary codes of the matching keys, not positions
+    std::auto_ptr< ibis::array_t<uint32_t> > tmp(new ibis::array_t<uint32_t>);
+    dic.patternSearch(pat, *tmp);
+    for (uint32_t j = 0; j < tmp->size(); ++ j) {
+	const ibis::bitvector *bv = rlc->getBitvector((*tmp)[j]);
+	if (bv != 0) est += bv->cnt();
+    }
+#endif
     return est;
 } // ibis::category::patternSearch
 
@@ -631,6 +641,7 @@ long ibis::category::patternSearch(const char *pat,
 	<< pat;
     long est = 0;
     uint32_t cnt = 0;
+#ifndef CS_PATTERN_MATCH
     const uint32_t nd = dic.size();
     for (uint32_t j = 1; j <= nd; ++ j) {
 	if (ibis::util::strMatch(dic[j], pat)) {
@@ -649,6 +660,25 @@ long ibis::category::patternSearch(const char *pat,
 	    }
 	}
     }
+#else
+    // tmp holds the dictionary codes of the matching keys, not positions
+    std::auto_ptr< ibis::array_t<uint32_t> > tmp(new ibis::array_t<uint32_t>);
+    dic.patternSearch(pat, *tmp);
+    for (uint32_t j = 0; j < tmp->size(); ++ j) {
+	const ibis::bitvector *bv = rlc->getBitvector((*tmp)[j]);
+	if (bv != 0) {
+	    ++ cnt;
+	    est += bv->cnt();
+	    if (hits.empty()) {
+		hits.copy(*bv);
+	    } else {
+		if (cnt > 32 || (j > 3 && cnt*16 > j))
+		    hits.decompress();
+		hits |= *bv;
+	    }
+	}
+    }
+#endif
     if (est > static_cast<long>(hits.size() >> 7))
 	hits.compress();
     return est;
diff --git a/src/dictionary.cpp b/src/dictionary.cpp
index aca4ec8..f6d02c6 100755
--- a/src/dictionary.cpp
+++ b/src/dictionary.cpp
@@ -368,6 +368,126 @@ void ibis::dictionary::clear() {
     raw_[0] = 0;
 } // ibis::dictionary::clear
 
+/// Find the codes of all dictionary entries matching a SQL LIKE pattern.
+/// The prefix comparison is case sensitive (strncmp).  The literal characters
+/// preceding the first unescaped meta character (%, _, * or ?) form a constant
+/// prefix that narrows the range of keys handed to ibis::util::strMatch.
+/// @param pat pattern to match; nothing is done if it is null or empty.
+/// @param matches output array; the code of each matching entry is appended.
+/// @note The range search presumes key_ is in strcmp order -- TODO confirm.
+void ibis::dictionary::patternSearch(const char* pat, array_t<uint32_t>& matches) const {
+    if (pat == 0 || *pat == 0 || key_.size() == 0) return;
+    if (! (code_.size() == key_.size() &&
+	   key_.size()+1 == raw_.size())) {
+	LOGGER(ibis::gVerbose > 0)
+	    << "Warning -- dictionary::patternSearch(" << pat
+	    << ") can not proceed because the member variables have "
+	    "inconsistent sizes: raw_.size(" << raw_.size() << "), key_.size("
+	    << key_.size() << "), and code_.size(" << code_.size() << ')';
+	return;
+    }
+
+    // extract the constant prefix, resolving backslash escapes on the way
+    std::string prefix;
+    bool esc = false;
+    bool meta = false;
+    const size_t len = strlen(pat);
+    size_t pos;
+    for (pos = 0; pos < len && !meta; ++pos) {
+	const char c = *(pat + pos);
+	if (esc) { // an escaped character is taken literally
+	    prefix.append(1, c);
+	    esc = false;
+	} else {
+	    switch (c) {
+		case '\\':
+		    esc = true;
+		    break;
+		case '%': case '_': case '*': case '?':
+		    meta = true; // the loop ends with pos one past this char
+		    break;
+		default:
+		    prefix.append(1, c);
+		    break;
+	    }
+	}
+    }
+
+    // if there is no meta char, the pattern is a plain string: look it up
+    if (!meta) {
+	uint32_t code = operator[](prefix.c_str());
+	if ( code != size() + 1 ) { // size()+1 is the code of unknown strings
+	    matches.push_back(code);
+	}
+	return;
+    }
+
+    // locate the half-open range [min, max) of keys starting with prefix
+    int32_t min = -1, max = -1;
+    if ( prefix.size() == 0 ) {
+	min = 0;
+	max = key_.size();
+    }
+    else if (key_.size() < 16 ) {
+	// few keys: use linear search
+	for (uint32_t m = 0; m < key_.size(); ++ m) {
+	    if ( min < 0 ) {
+		int comp = strncmp(key_[m], prefix.c_str(), prefix.length());
+		if ( comp == 0 ) {
+		    min = m;
+		} else if ( comp > 0 ) {
+		    break;
+		}
+	    } else if ( max < 0 ) {
+		if ( strncmp(key_[m], prefix.c_str(), prefix.length()) != 0 ) {
+		    max = m;
+		    break;
+		}
+	    }
+	}
+	if ( min < 0 ) return;
+	if ( max < 0 ) max = key_.size();
+    }
+    else {
+	// find lower bound using binary search
+	int32_t b = 0;
+	int32_t e = key_.size() - 1;
+	while ( b <= e ) {
+	    int32_t m = (b + e) / 2;
+	    if ( strncmp(key_[m], prefix.c_str(), prefix.size()) >= 0 ) {
+		e = m - 1;
+	    } else {
+		b = m + 1;
+	    }
+	}
+
+	if ( static_cast<uint32_t>(b) < key_.size() && strncmp(key_[b], prefix.c_str(), prefix.size()) == 0 ) {
+	    min = b;
+	} else {
+	    return; // no key starts with the prefix
+	}
+
+	// find upper bound using binary search, starting from the lower bound
+	e = key_.size() - 1;
+	while ( b <= e) {
+	    int32_t m = (b + e) / 2;
+	    if ( strncmp(key_[m], prefix.c_str(), prefix.size()) > 0 ) {
+		e = m - 1;
+	    } else {
+		b = m + 1;
+	    }
+	}
+	max = b;
+    }
+
+    // match the rest of each key against the pattern from the first meta char
+    for (uint32_t j = min; j < static_cast<uint32_t>(max); ++ j) {
+	if (ibis::util::strMatch(key_[j] + prefix.size(), pat + pos - 1)) {
+	    matches.push_back(code_[j]);
+	}
+    }
+}
+
 /// Convert a string to its integer code.  Returns 0 for empty (null)
 /// strings, 1:size() for strings in the dictionary, and
 /// dictionary::size()+1 for unknown values.
diff --git a/src/dictionary.h b/src/dictionary.h
index c5cd18a..de48918 100755
--- a/src/dictionary.h
+++ b/src/dictionary.h
@@ -23,6 +23,7 @@ public:
     const char* operator[](uint32_t i) const;
     uint32_t operator[](const char* str) const;
     const char* find(const char* str) const;
+    void patternSearch(const char* pat, array_t<uint32_t>& matches) const;
     uint32_t insert(const char* str);
     uint32_t insertRaw(char* str);
 
diff --git a/src/util.cpp b/src/util.cpp
index 1f3349a..97b2e35 100755
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -1845,9 +1845,17 @@ bool ibis::util::strMatch(const char *str, const char *pat) {
     const char *s1 = strpbrk(pat, metaList);
     const long int nhead = s1 - pat;
     if (s1 < pat) { // no meta character
+#ifdef CS_PATTERN_MATCH
+	return (0 == strcmp(str, pat));
+#else
 	return (0 == stricmp(str, pat));
+#endif
     }
+#ifdef CS_PATTERN_MATCH
+    else if (s1 > pat && 0 != strncmp(str, pat, nhead)) {
+#else
     else if (s1 > pat && 0 != strnicmp(str, pat, nhead)) {
+#endif
 	// characters before the first meta character do not match
 	return false;
     }
@@ -1927,7 +1935,11 @@ bool ibis::util::strMatch(const char *str, const char *pat) {
 	if (nstr < ntail)
 	    return false;
 	else
+#ifdef CS_PATTERN_MATCH
+	    return (0 == strcmp(s1, s0+(nstr-ntail)));
+#else
 	    return (0 == stricmp(s1, s0+(nstr-ntail)));
+#endif
     }
 
     const std::string anchor(s1, s2);
diff --git a/src/util.h b/src/util.h
index bcb8b9a..527ce97 100755
--- a/src/util.h
+++ b/src/util.h
@@ -263,6 +263,9 @@ int truncate(const char*, uint32_t);
 #endif
 #endif
 
+// When defined, SQL LIKE patterns are matched case sensitively, which speeds up the search.
+#define CS_PATTERN_MATCH
+
 // // The function isfinite is a macro defined in math.h according to
 // // opengroup.org.  As of 2011, only MS visual studio does not have a
 // // definition for isfinite, but it has _finite in float,h.
