[1/3] asterixdb git commit: Full-text implementation step 1

wangsaeu Sat, 10 Dec 2016 00:15:07 -0800

Repository: asterixdb
Updated Branches:
  refs/heads/master fce8e2925 -> 44cef249a



http://git-wip-us.apache.org/repos/asf/asterixdb/blob/44cef249/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
----------------------------------------------------------------------
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index e867ecc..cd654d7 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -249,6 +249,13 @@ public class UTF8StringUtil {
         return compareTo(thisBytes, thisStart, thatBytes, thatStart, true, 
false);
     }
 
+    // Certain types of strings do not include the length bytes at the beginning;
+    // the length of the string is passed explicitly as a parameter (e.g., a token in a string).
+    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, int 
thisLength, byte[] thatBytes,
+            int thatStart, int thatLength) {
+        return compareTo(thisBytes, thisStart, thisLength, thatBytes, 
thatStart, thatLength, true, false);
+    }
+
     public static int hash(byte[] bytes, int start, int coefficient, int r) {
         return hash(bytes, start, false, false, coefficient, r);
     }
@@ -257,6 +264,12 @@ public class UTF8StringUtil {
         return hash(bytes, start, false, false, 31, Integer.MAX_VALUE);
     }
 
+    private static int hash(byte[] bytes, int start, boolean useLowerCase, 
boolean useRawByte, int coefficient, int r) {
+        int utflen = getUTFLength(bytes, start);
+        int sStart = start + getNumBytesToStoreLength(utflen);
+        return hash(bytes, sStart, utflen, useLowerCase, useRawByte, 
coefficient, r);
+    }
+
     /**
      * This function provides the raw bytes-based hash function for UTF8 strings.
      * Note that the hash values may not deliver the correct ordering for certain languages that include 2 or 3 bytes characters.
@@ -270,6 +283,12 @@ public class UTF8StringUtil {
         return hash(bytes, start, true, false, 31, Integer.MAX_VALUE);
     }
 
+    // Certain types of strings do not include the length bytes at the beginning;
+    // the length of the string is passed explicitly as a parameter.
+    public static int lowerCaseHash(byte[] bytes, int start, int length) {
+        return hash(bytes, start, length, true, false, 31, Integer.MAX_VALUE);
+    }
+
     public static StringBuilder toString(StringBuilder builder, byte[] bytes, 
int start) {
         int utfLen = getUTFLength(bytes, start);
         int offset = getNumBytesToStoreLength(utfLen);
@@ -352,23 +371,27 @@ public class UTF8StringUtil {
 
     private static int compareTo(byte[] thisBytes, int thisStart, byte[] 
thatBytes, int thatStart, boolean useLowerCase,
             boolean useRawByte) {
-        int utflen1 = getUTFLength(thisBytes, thisStart);
-        int utflen2 = getUTFLength(thatBytes, thatStart);
+        int thisLength = getUTFLength(thisBytes, thisStart);;
+        int thatLength = getUTFLength(thatBytes, thatStart);
+        int thisActualStart = thisStart + getNumBytesToStoreLength(thisLength);
+        int thatActualStart = thatStart + getNumBytesToStoreLength(thatLength);
+        return compareTo(thisBytes, thisActualStart, thisLength, thatBytes, 
thatActualStart, thatLength, useLowerCase,
+                useRawByte);
+    }
 
+    private static int compareTo(byte[] thisBytes, int thisActualStart, int 
thisLength, byte[] thatBytes,
+            int thatActualStart, int thatLength, boolean useLowerCase, boolean 
useRawByte) {
         int c1 = 0;
         int c2 = 0;
 
-        int s1Start = thisStart + getNumBytesToStoreLength(utflen1);
-        int s2Start = thatStart + getNumBytesToStoreLength(utflen2);
-
-        while (c1 < utflen1 && c2 < utflen2) {
+        while (c1 < thisLength && c2 < thatLength) {
             char ch1, ch2;
             if (useRawByte) {
-                ch1 = (char) thisBytes[s1Start + c1];
-                ch2 = (char) thatBytes[s2Start + c2];
+                ch1 = (char) thisBytes[thisActualStart + c1];
+                ch2 = (char) thatBytes[thatActualStart + c2];
             } else {
-                ch1 = (charAt(thisBytes, s1Start + c1));
-                ch2 = (charAt(thatBytes, s2Start + c2));
+                ch1 = charAt(thisBytes, thisActualStart + c1);
+                ch2 = charAt(thatBytes, thatActualStart + c2);
 
                 if (useLowerCase) {
                     ch1 = Character.toLowerCase(ch1);
@@ -379,30 +402,29 @@ public class UTF8StringUtil {
             if (ch1 != ch2) {
                 return ch1 - ch2;
             }
-            c1 += charSize(thisBytes, s1Start + c1);
-            c2 += charSize(thatBytes, s2Start + c2);
+            c1 += charSize(thisBytes, thisActualStart + c1);
+            c2 += charSize(thatBytes, thatActualStart + c2);
         }
-        return utflen1 - utflen2;
+        return thisLength - thatLength;
     }
 
-    private static int hash(byte[] bytes, int start, boolean useLowerCase, 
boolean useRawByte, int coefficient, int r) {
+    private static int hash(byte[] bytes, int start, int length, boolean 
useLowerCase, boolean useRawByte,
+            int coefficient, int r) {
         int h = 0;
-        int utflen = getUTFLength(bytes, start);
-        int sStart = start + getNumBytesToStoreLength(utflen);
         int c = 0;
 
-        while (c < utflen) {
+        while (c < length) {
             char ch;
             if (useRawByte) {
-                ch = (char) bytes[sStart + c];
+                ch = (char) bytes[start + c];
             } else {
-                ch = charAt(bytes, sStart + c);
+                ch = charAt(bytes, start + c);
                 if (useLowerCase) {
                     ch = Character.toLowerCase(ch);
                 }
             }
             h = (coefficient * h + ch) % r;
-            c += charSize(bytes, sStart + c);
+            c += charSize(bytes, start + c);
         }
         return h;
     }

[1/3] asterixdb git commit: Full-text implementation step 1

Reply via email to