Repository: spark
Updated Branches:
  refs/heads/master 9514d874f -> 07fd7d364


[SPARK-9460] Avoid byte array allocation in StringPrefixComparator.

As of today, StringPrefixComparator converts the long values back to byte 
arrays in order to compare them. This patch optimizes this to compare the longs 
directly, rather than turning the longs into byte arrays and comparing them 
byte by byte (unsigned).

This only works on little-endian architecture right now.

Author: Reynold Xin <r...@databricks.com>

Closes #7765 from rxin/SPARK-9460 and squashes the following commits:

e4908cc [Reynold Xin] Stricter randomized tests.
4c8d094 [Reynold Xin] [SPARK-9460] Avoid byte array allocation in 
StringPrefixComparator.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/07fd7d36
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/07fd7d36
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/07fd7d36

Branch: refs/heads/master
Commit: 07fd7d36471dfb823c1ce3e3a18464043affde18
Parents: 9514d87
Author: Reynold Xin <r...@databricks.com>
Authored: Wed Jul 29 21:18:43 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Wed Jul 29 21:18:43 2015 -0700

----------------------------------------------------------------------
 .../unsafe/sort/PrefixComparators.java          | 29 ++------------------
 .../unsafe/sort/PrefixComparatorsSuite.scala    | 19 +++++++++----
 .../apache/spark/unsafe/types/UTF8String.java   |  9 ++++++
 .../spark/unsafe/types/UTF8StringSuite.java     | 11 ++++++++
 4 files changed, 36 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/07fd7d36/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index 5624e06..a9ee604 100644
--- 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -17,9 +17,7 @@
 
 package org.apache.spark.util.collection.unsafe.sort;
 
-import com.google.common.base.Charsets;
-import com.google.common.primitives.Longs;
-import com.google.common.primitives.UnsignedBytes;
+import com.google.common.primitives.UnsignedLongs;
 
 import org.apache.spark.annotation.Private;
 import org.apache.spark.unsafe.types.UTF8String;
@@ -36,32 +34,11 @@ public class PrefixComparators {
   public static final class StringPrefixComparator extends PrefixComparator {
     @Override
     public int compare(long aPrefix, long bPrefix) {
-      // TODO: can done more efficiently
-      byte[] a = Longs.toByteArray(aPrefix);
-      byte[] b = Longs.toByteArray(bPrefix);
-      for (int i = 0; i < 8; i++) {
-        int c = UnsignedBytes.compare(a[i], b[i]);
-        if (c != 0) return c;
-      }
-      return 0;
-    }
-
-    public long computePrefix(byte[] bytes) {
-      if (bytes == null) {
-        return 0L;
-      } else {
-        byte[] padded = new byte[8];
-        System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
-        return Longs.fromByteArray(padded);
-      }
-    }
-
-    public long computePrefix(String value) {
-      return value == null ? 0L : 
computePrefix(value.getBytes(Charsets.UTF_8));
+      return UnsignedLongs.compare(aPrefix, bPrefix);
     }
 
     public long computePrefix(UTF8String value) {
-      return value == null ? 0L : computePrefix(value.getBytes());
+      return value == null ? 0L : value.getPrefix();
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/07fd7d36/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
----------------------------------------------------------------------
diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
 
b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index 28fe925..26b7a9e 100644
--- 
a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -17,22 +17,29 @@
 
 package org.apache.spark.util.collection.unsafe.sort
 
+import com.google.common.primitives.UnsignedBytes
 import org.scalatest.prop.PropertyChecks
-
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.unsafe.types.UTF8String
 
 class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
 
   test("String prefix comparator") {
 
     def testPrefixComparison(s1: String, s2: String): Unit = {
-      val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
-      val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
+      val utf8string1 = UTF8String.fromString(s1)
+      val utf8string2 = UTF8String.fromString(s2)
+      val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
+      val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
       val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, 
s2Prefix)
+
+      val cmp = UnsignedBytes.lexicographicalComparator().compare(
+        utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))
+
       assert(
-        (prefixComparisonResult == 0) ||
-        (prefixComparisonResult < 0 && s1 < s2) ||
-        (prefixComparisonResult > 0 && s1 > s2))
+        (prefixComparisonResult == 0 && cmp == 0) ||
+        (prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
+        (prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
     }
 
     // scalastyle:off

http://git-wip-us.apache.org/repos/asf/spark/blob/07fd7d36/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 3e1cc67..5752200 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -138,6 +138,15 @@ public final class UTF8String implements 
Comparable<UTF8String>, Serializable {
   }
 
   /**
+   * Returns a 64-bit integer that can be used as the prefix used in sorting.
+   */
+  public long getPrefix() {
+    long p = PlatformDependent.UNSAFE.getLong(base, offset);
+    p = java.lang.Long.reverseBytes(p);
+    return p;
+  }
+
+  /**
    * Returns the underline bytes, will be a copy of it if it's part of another 
array.
    */
   public byte[] getBytes() {

http://git-wip-us.apache.org/repos/asf/spark/blob/07fd7d36/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
----------------------------------------------------------------------
diff --git 
a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java 
b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index e2a5628..42e09e4 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -64,7 +64,18 @@ public class UTF8StringSuite {
   }
 
   @Test
+  public void prefix() {
+    assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(
+      fromString("abbbbbbbbbbbasdf").getPrefix() - 
fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
+    assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
+    assertTrue(fromString("你好").getPrefix() - 
fromString("世界").getPrefix() > 0);
+  }
+
+  @Test
   public void compareTo() {
+    assertTrue(fromString("").compareTo(fromString("a")) < 0);
     assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
     assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
     assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 
0);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to