spark git commit: [SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector
Repository: spark Updated Branches: refs/heads/branch-2.3 79ccd0cad -> 6e509fde3 [SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector ## What changes were proposed in this pull request? This PR changes usage of `MapVector` in Spark codebase to use `NullableMapVector`. `MapVector` is an internal Arrow class that is not supposed to be used directly. We should use `NullableMapVector` instead. ## How was this patch tested? Existing test. Author: Li Jin Closes #20239 from icexelloss/arrow-map-vector. (cherry picked from commit 4e6f8fb150ae09c7d1de6beecb2b98e5afa5da19) Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e509fde Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e509fde Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e509fde Branch: refs/heads/branch-2.3 Commit: 6e509fde3f056316f46c71b672a7d69adb1b4f8e Parents: 79ccd0c Author: Li Jin Authored: Thu Jan 18 07:26:43 2018 +0900 Committer: hyukjinkwon Committed: Thu Jan 18 07:26:57 2018 +0900 -- .../spark/sql/vectorized/ArrowColumnVector.java | 13 ++++++++++--- .../vectorized/ArrowColumnVectorSuite.scala | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e509fde/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 7083332..eb69001 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -247,8 +247,8 @@ public final class ArrowColumnVector extends ColumnVector { childColumns = new ArrowColumnVector[1]; childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); -} else if (vector instanceof 
MapVector) { - MapVector mapVector = (MapVector) vector; +} else if (vector instanceof NullableMapVector) { + NullableMapVector mapVector = (NullableMapVector) vector; accessor = new StructAccessor(mapVector); childColumns = new ArrowColumnVector[mapVector.size()]; @@ -553,9 +553,16 @@ public final class ArrowColumnVector extends ColumnVector { } } + /** + * Any call to "get" method will throw UnsupportedOperationException. + * + * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined + * in the parent class. Any call to "get" method in this class is a bug in the code. + * + */ private static class StructAccessor extends ArrowVectorAccessor { -StructAccessor(MapVector vector) { +StructAccessor(NullableMapVector vector) { super(vector); } } http://git-wip-us.apache.org/repos/asf/spark/blob/6e509fde/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 7304803..5343266 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -322,6 +322,42 @@ class ArrowColumnVectorSuite extends SparkFunSuite { allocator.close() } + test("non nullable struct") { +val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) +val schema = new StructType().add("int", IntegerType).add("long", LongType) +val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null) + .createVector(allocator).asInstanceOf[NullableMapVector] + +vector.allocateNew() +val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector] +val longVector = 
vector.getChildByOrdinal(1).asInstanceOf[BigIntVector] + +vector.setIndexDefined(0) +intVector.setSafe(0, 1) +longVector.setSafe(0, 1L) + +vector.setIndexDefined(1) +intVector.setSafe(1, 2) +longVector.setNull(1) + +vector.setValueCount(2) + +val columnVector = new ArrowColumnVector(vector) +assert(columnVector.dataType === schema) +assert(columnVector.numNulls === 0) + +val row0 = columnVector.getStruct(0, 2) +assert(row0.getInt(0) === 1) +assert(row0.getLong(1) === 1L) + +val row1 = columnVector.getStruct(1, 2)
spark git commit: [SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector
Repository: spark Updated Branches: refs/heads/master e946c63dd -> 4e6f8fb15 [SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector ## What changes were proposed in this pull request? This PR changes usage of `MapVector` in Spark codebase to use `NullableMapVector`. `MapVector` is an internal Arrow class that is not supposed to be used directly. We should use `NullableMapVector` instead. ## How was this patch tested? Existing test. Author: Li Jin Closes #20239 from icexelloss/arrow-map-vector. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e6f8fb1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e6f8fb1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e6f8fb1 Branch: refs/heads/master Commit: 4e6f8fb150ae09c7d1de6beecb2b98e5afa5da19 Parents: e946c63 Author: Li Jin Authored: Thu Jan 18 07:26:43 2018 +0900 Committer: hyukjinkwon Committed: Thu Jan 18 07:26:43 2018 +0900 -- .../spark/sql/vectorized/ArrowColumnVector.java | 13 ++++++++++--- .../vectorized/ArrowColumnVectorSuite.scala | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e6f8fb1/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 7083332..eb69001 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -247,8 +247,8 @@ public final class ArrowColumnVector extends ColumnVector { childColumns = new ArrowColumnVector[1]; childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); -} else if (vector instanceof MapVector) { - MapVector mapVector = (MapVector) vector; +} else if (vector instanceof NullableMapVector) { 
+ NullableMapVector mapVector = (NullableMapVector) vector; accessor = new StructAccessor(mapVector); childColumns = new ArrowColumnVector[mapVector.size()]; @@ -553,9 +553,16 @@ public final class ArrowColumnVector extends ColumnVector { } } + /** + * Any call to "get" method will throw UnsupportedOperationException. + * + * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined + * in the parent class. Any call to "get" method in this class is a bug in the code. + * + */ private static class StructAccessor extends ArrowVectorAccessor { -StructAccessor(MapVector vector) { +StructAccessor(NullableMapVector vector) { super(vector); } } http://git-wip-us.apache.org/repos/asf/spark/blob/4e6f8fb1/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 7304803..5343266 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -322,6 +322,42 @@ class ArrowColumnVectorSuite extends SparkFunSuite { allocator.close() } + test("non nullable struct") { +val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) +val schema = new StructType().add("int", IntegerType).add("long", LongType) +val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null) + .createVector(allocator).asInstanceOf[NullableMapVector] + +vector.allocateNew() +val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector] +val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector] + +vector.setIndexDefined(0) +intVector.setSafe(0, 1) +longVector.setSafe(0, 1L) + 
+vector.setIndexDefined(1) +intVector.setSafe(1, 2) +longVector.setNull(1) + +vector.setValueCount(2) + +val columnVector = new ArrowColumnVector(vector) +assert(columnVector.dataType === schema) +assert(columnVector.numNulls === 0) + +val row0 = columnVector.getStruct(0, 2) +assert(row0.getInt(0) === 1) +assert(row0.getLong(1) === 1L) + +val row1 = columnVector.getStruct(1, 2) +assert(row1.getInt(0) === 2) +assert(row1.isNullAt(1)) + +columnVector.close() +allocato