HIVE-11544: Improve LazySimpleSerDe null data handling for Byte, Short, Integer, Float, Long and Double. (Gopal V, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/98049182 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/98049182 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/98049182 Branch: refs/heads/spark Commit: 980491821e6c33d595e8a0e3abcfbc6c207aa436 Parents: 0bc9677 Author: Gopal V <gop...@apache.org> Authored: Mon Sep 14 18:13:42 2015 -0700 Committer: Gopal V <gop...@apache.org> Committed: Mon Sep 14 18:13:42 2015 -0700 ---------------------------------------------------------------------- .../benchmark/serde/LazySimpleSerDeBench.java | 453 +++++++++++++++++++ .../hadoop/hive/serde2/lazy/LazyByte.java | 4 + .../hadoop/hive/serde2/lazy/LazyDouble.java | 4 + .../hadoop/hive/serde2/lazy/LazyFloat.java | 4 + .../hadoop/hive/serde2/lazy/LazyInteger.java | 4 + .../hadoop/hive/serde2/lazy/LazyLong.java | 4 + .../hadoop/hive/serde2/lazy/LazyShort.java | 4 + .../hadoop/hive/serde2/lazy/LazyUtils.java | 28 ++ 8 files changed, 505 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java new file mode 100644 index 0000000..a1b63d5 --- /dev/null +++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java @@ -0,0 +1,453 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hive.benchmark.serde; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; +import org.apache.hadoop.hive.serde2.lazy.LazyByte; +import org.apache.hadoop.hive.serde2.lazy.LazyDouble; +import org.apache.hadoop.hive.serde2.lazy.LazyFloat; +import org.apache.hadoop.hive.serde2.lazy.LazyInteger; +import org.apache.hadoop.hive.serde2.lazy.LazyLong; +import org.apache.hadoop.hive.serde2.lazy.LazyShort; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +@State(Scope.Benchmark) +public class LazySimpleSerDeBench { + /** + * This test measures the performance for LazySimpleSerDe. + * <p/> + * This test uses JMH framework for benchmarking. You may execute this + * benchmark tool using JMH command line in different ways: + * <p/> + * To run using default settings, use: + * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.serde.LazySimpleSerDeBench + * <p/> + */ + + @BenchmarkMode(Mode.AverageTime) + @Fork(1) + @State(Scope.Thread) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public static abstract class AbstractDeserializer { + public static final int DEFAULT_ITER_TIME = 1000000; + + public static final int DEFAULT_DATA_SIZE = 4096; + + public int[] offsets = new int[DEFAULT_DATA_SIZE]; + public int[] sizes = new int[DEFAULT_DATA_SIZE]; + protected final ByteArrayRef ref = new ByteArrayRef(); + + @Setup + public abstract void setup(); + + @Benchmark + @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS) + @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS) + public void bench() { + + } + } + + public static abstract class RandomDataInitializer extends + AbstractDeserializer { + + final int width; + + public RandomDataInitializer(final int width) { + this.width = width; + } + + @Override + public void setup() { + int len = 0; + Random r = new Random(); + for (int i = 0; i < sizes.length; i++) { + sizes[i] = (int) (r.nextInt(width)); + offsets[i] = len; + len += sizes[i]; + } + byte[] data = new byte[len + 1]; + r.nextBytes(data); + ref.setData(data); + } + } + + public static abstract class GoodDataInitializer extends AbstractDeserializer { + + public final int max; + + public GoodDataInitializer(final int max) { + this.max = max; + } + + @Override + public void setup() { + sizes = new int[1024]; + offsets = new int[sizes.length]; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Random r = new Random(); + int len = 0; + for (int i = 0; i < sizes.length / 2; i++) { + int p = r.nextInt(max); + int n = -1 * (p - 1); + byte[] ps = String.format("%d", p).getBytes(); + byte[] ns = String.format("%d", n).getBytes(); + sizes[2 * i] = ps.length; + sizes[2 * i + 1] = ns.length; + offsets[2 * i] = len; + offsets[2 * i + 1] = len + ps.length; + len += ps.length + ns.length; + try { + bos.write(ns); + bos.write(ps); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + ref.setData(bos.toByteArray()); + } + } + + public static class RandomLazyByte extends RandomDataInitializer { + + public RandomLazyByte() { + super(2); + } + + final LazyByte obj = new LazyByte( + LazyPrimitiveObjectInspectorFactory.LAZY_BYTE_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyByte extends RandomDataInitializer { + + public WorstLazyByte() { + super(8); + } + + final LazyByte obj = new LazyByte( + LazyPrimitiveObjectInspectorFactory.LAZY_BYTE_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyByte extends GoodDataInitializer { + + final LazyByte obj = new LazyByte( + LazyPrimitiveObjectInspectorFactory.LAZY_BYTE_OBJECT_INSPECTOR); + + public GoodLazyByte() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyShort extends RandomDataInitializer { + + public RandomLazyShort() { + super(2); + } + + final LazyShort obj = new LazyShort( + LazyPrimitiveObjectInspectorFactory.LAZY_SHORT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyShort extends RandomDataInitializer { + + public WorstLazyShort() { + super(8); + } + + final LazyShort obj = new LazyShort( + LazyPrimitiveObjectInspectorFactory.LAZY_SHORT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyShort extends GoodDataInitializer { + + final LazyShort obj = new LazyShort( + LazyPrimitiveObjectInspectorFactory.LAZY_SHORT_OBJECT_INSPECTOR); + + public GoodLazyShort() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyInteger extends RandomDataInitializer { + + public RandomLazyInteger() { + super(2); + } + + final LazyInteger obj = new LazyInteger( + LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyInteger extends RandomDataInitializer { + + public WorstLazyInteger() { + super(8); + } + + final LazyInteger obj = new LazyInteger( + LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyInteger extends GoodDataInitializer { + + final LazyInteger obj = new LazyInteger( + LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR); + + public GoodLazyInteger() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyFloat extends RandomDataInitializer { + + public RandomLazyFloat() { + super(2); + } + + final LazyFloat obj = new LazyFloat( + LazyPrimitiveObjectInspectorFactory.LAZY_FLOAT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyFloat extends RandomDataInitializer { + + public WorstLazyFloat() { + super(8); + } + + final LazyFloat obj = new LazyFloat( + LazyPrimitiveObjectInspectorFactory.LAZY_FLOAT_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyFloat extends GoodDataInitializer { + + final LazyFloat obj = new LazyFloat( + LazyPrimitiveObjectInspectorFactory.LAZY_FLOAT_OBJECT_INSPECTOR); + + public GoodLazyFloat() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyLong extends RandomDataInitializer { + + public RandomLazyLong() { + super(2); + } + + final LazyLong obj = new LazyLong( + LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyLong extends RandomDataInitializer { + + public WorstLazyLong() { + super(8); + } + + final LazyLong obj = new LazyLong( + LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyLong extends GoodDataInitializer { + + final LazyLong obj = new LazyLong( + LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR); + + public GoodLazyLong() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyDouble extends RandomDataInitializer { + + public RandomLazyDouble() { + super(2); + } + + final LazyDouble obj = new LazyDouble( + LazyPrimitiveObjectInspectorFactory.LAZY_DOUBLE_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyDouble extends RandomDataInitializer { + + public WorstLazyDouble() { + super(8); + } + + final LazyDouble obj = new LazyDouble( + LazyPrimitiveObjectInspectorFactory.LAZY_DOUBLE_OBJECT_INSPECTOR); + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class GoodLazyDouble extends GoodDataInitializer { + + final LazyDouble obj = new LazyDouble( + LazyPrimitiveObjectInspectorFactory.LAZY_DOUBLE_OBJECT_INSPECTOR); + + public GoodLazyDouble() { + super(Integer.MAX_VALUE); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static void main(String[] args) throws RunnerException { + Options opt = new OptionsBuilder().include( + ".*" + LazySimpleSerDeBench.class.getSimpleName() + ".*").build(); + new Runner(opt).run(); + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java index a3b8f76..1f9cead 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java @@ -48,6 +48,10 @@ public class LazyByte extends @Override public void init(ByteArrayRef bytes, int start, int length) { + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { data.set(parseByte(bytes.getData(), start, length, 10)); isNull = false; http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDouble.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDouble.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDouble.java index 05ca4e9..35c2141 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDouble.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDouble.java @@ -46,6 +46,10 @@ public class LazyDouble extends @Override public void init(ByteArrayRef bytes, int start, int length) { String byteData = null; + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { byteData = Text.decode(bytes.getData(), start, length); data.set(Double.parseDouble(byteData)); http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFloat.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFloat.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFloat.java index 37676d1..6e132c7 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFloat.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFloat.java @@ -46,6 +46,10 @@ public class LazyFloat extends @Override public void init(ByteArrayRef bytes, int start, int length) { String byteData = null; + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { byteData = Text.decode(bytes.getData(), start, length); data.set(Float.parseFloat(byteData)); http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java index ad82ebf..22742aa 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java @@ -51,6 +51,10 @@ public class LazyInteger extends @Override public void init(ByteArrayRef bytes, int start, int length) { + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { data.set(parseInt(bytes.getData(), start, length, 10)); isNull = false; http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java index a9779a0..c0d52b9 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java @@ -51,6 +51,10 @@ public class LazyLong extends @Override public void init(ByteArrayRef bytes, int start, int length) { + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { data.set(parseLong(bytes.getData(), start, length, 10)); isNull = false; http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java index f04e131..b8b9488 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java @@ -48,6 +48,10 @@ public class LazyShort extends @Override public void init(ByteArrayRef bytes, int start, int length) { + if (!LazyUtils.isNumberMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { data.set(parseShort(bytes.getData(), start, length)); isNull = false; http://git-wip-us.apache.org/repos/asf/hive/blob/98049182/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java index 5c58f6b..a5e4be4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java @@ -81,6 +81,34 @@ public final class LazyUtils { } /** + * returns false, when the bytes definitely cannot be parsed into a base-10 + * Number (Long or a Double) + * + * If it returns true, the bytes might still be invalid, but not obviously. + */ + + public static boolean isNumberMaybe(byte[] buf, int offset, int len) { + switch (len) { + case 0: + return false; + case 1: + // space usually + return Character.isDigit(buf[offset]); + case 2: + // \N or -1 (allow latter) + return Character.isDigit(buf[offset + 1]) + || Character.isDigit(buf[offset + 0]); + case 4: + // null or NULL + if (buf[offset] == 'N' || buf[offset] == 'n') { + return false; + } + } + // maybe valid - too expensive to check without a parse + return true; + } + + /** * Returns -1 if the first byte sequence is lexicographically less than the * second; returns +1 if the second byte sequence is lexicographically less * than the first; otherwise return 0.