spark git commit: [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics
Repository: spark Updated Branches: refs/heads/branch-2.0 1d5c7f452 -> 2b18bd4b9 [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics This patch reduces the default number element estimation for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of map), 100 * 100 would be used as the default size. This sounds like just an overestimation which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we assume the size output for Project (new estimated column size / old estimated column size), this overestimation can become underestimation. It is actually in general in this case safer to assume 1 default element. This should be covered by existing tests. Author: Reynold Xin Closes #16274 from rxin/SPARK-18853. (cherry picked from commit 5d799473696a15fddd54ec71a93b6f8cb169810c) Signed-off-by: Herman van Hovell (cherry picked from commit e8866f9fc62095b78421d461549f7eaf8e9070b3) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b18bd4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b18bd4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b18bd4b Branch: refs/heads/branch-2.0 Commit: 2b18bd4b99c0c85f28bd174751f52f940f78d2e8 Parents: 1d5c7f4 Author: Reynold Xin Authored: Wed Dec 14 21:22:49 2016 +0100 Committer: Herman van Hovell Committed: Wed Dec 14 21:25:07 2016 +0100 -- .../scala/org/apache/spark/sql/types/ArrayType.scala | 6 +++--- .../scala/org/apache/spark/sql/types/MapType.scala | 6 +++--- .../org/apache/spark/sql/types/DataTypeSuite.scala | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index 82a03b0..8c4df58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -70,10 +70,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT ("containsNull" -> containsNull) /** - * The default size of a value of the ArrayType is 100 * the default size of the element type. - * (We assume that there are 100 elements). + * The default size of a value of the ArrayType is the default size of the element type. + * We assume that there is only 1 element on average in an array. See SPARK-18853. */ - override def defaultSize: Int = 100 * elementType.defaultSize + override def defaultSize: Int = 1 * elementType.defaultSize override def simpleString: String = s"array<${elementType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index 1789609..4d59920 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -57,10 +57,10 @@ case class MapType( /** * The default size of a value of the MapType is - * 100 * (the default size of the key type + the default size of the value type). - * (We assume that there are 100 elements). + * (the default size of the key type + the default size of the value type). + * We assume that there is only 1 element on average in a map. See SPARK-18853. 
*/ - override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize) + override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize) override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 569230a..8c947ed 100644 ---
spark git commit: [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics
Repository: spark Updated Branches: refs/heads/branch-2.1 af12a21ca -> e8866f9fc [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics ## What changes were proposed in this pull request? This patch reduces the default number element estimation for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of map), 100 * 100 would be used as the default size. This sounds like just an overestimation which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we assume the size output for Project (new estimated column size / old estimated column size), this overestimation can become underestimation. It is actually in general in this case safer to assume 1 default element. ## How was this patch tested? This should be covered by existing tests. Author: Reynold Xin Closes #16274 from rxin/SPARK-18853. (cherry picked from commit 5d799473696a15fddd54ec71a93b6f8cb169810c) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8866f9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8866f9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8866f9f Branch: refs/heads/branch-2.1 Commit: e8866f9fc62095b78421d461549f7eaf8e9070b3 Parents: af12a21 Author: Reynold Xin Authored: Wed Dec 14 21:22:49 2016 +0100 Committer: Herman van Hovell Committed: Wed Dec 14 21:23:01 2016 +0100 -- .../scala/org/apache/spark/sql/types/ArrayType.scala | 6 +++--- .../scala/org/apache/spark/sql/types/MapType.scala | 6 +++--- .../org/apache/spark/sql/types/DataTypeSuite.scala | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8866f9f/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index d409271..98efba1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -78,10 +78,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT ("containsNull" -> containsNull) /** - * The default size of a value of the ArrayType is 100 * the default size of the element type. - * (We assume that there are 100 elements). + * The default size of a value of the ArrayType is the default size of the element type. + * We assume that there is only 1 element on average in an array. See SPARK-18853. */ - override def defaultSize: Int = 100 * elementType.defaultSize + override def defaultSize: Int = 1 * elementType.defaultSize override def simpleString: String = s"array<${elementType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/e8866f9f/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index fbf3a61..6691b81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -56,10 +56,10 @@ case class MapType( /** * The default size of a value of the MapType is - * 100 * (the default size of the key type + the default size of the value type). - * (We assume that there are 100 elements). + * (the default size of the key type + the default size of the value type). + * We assume that there is only 1 element on average in a map. See SPARK-18853. 
*/ - override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize) + override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize) override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/e8866f9f/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index b8ab9a9..12d2c00 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++
spark git commit: [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics
Repository: spark Updated Branches: refs/heads/master 89ae26dcd -> 5d7994736 [SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics ## What changes were proposed in this pull request? This patch reduces the default number element estimation for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of map), 100 * 100 would be used as the default size. This sounds like just an overestimation which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we assume the size output for Project (new estimated column size / old estimated column size), this overestimation can become underestimation. It is actually in general in this case safer to assume 1 default element. ## How was this patch tested? This should be covered by existing tests. Author: Reynold Xin Closes #16274 from rxin/SPARK-18853. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d799473 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d799473 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d799473 Branch: refs/heads/master Commit: 5d799473696a15fddd54ec71a93b6f8cb169810c Parents: 89ae26d Author: Reynold Xin Authored: Wed Dec 14 21:22:49 2016 +0100 Committer: Herman van Hovell Committed: Wed Dec 14 21:22:49 2016 +0100 -- .../scala/org/apache/spark/sql/types/ArrayType.scala | 6 +++--- .../scala/org/apache/spark/sql/types/MapType.scala | 6 +++--- .../org/apache/spark/sql/types/DataTypeSuite.scala | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5d799473/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index d409271..98efba1 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -78,10 +78,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT ("containsNull" -> containsNull) /** - * The default size of a value of the ArrayType is 100 * the default size of the element type. - * (We assume that there are 100 elements). + * The default size of a value of the ArrayType is the default size of the element type. + * We assume that there is only 1 element on average in an array. See SPARK-18853. */ - override def defaultSize: Int = 100 * elementType.defaultSize + override def defaultSize: Int = 1 * elementType.defaultSize override def simpleString: String = s"array<${elementType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/5d799473/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index fbf3a61..6691b81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -56,10 +56,10 @@ case class MapType( /** * The default size of a value of the MapType is - * 100 * (the default size of the key type + the default size of the value type). - * (We assume that there are 100 elements). + * (the default size of the key type + the default size of the value type). + * We assume that there is only 1 element on average in a map. See SPARK-18853. 
*/ - override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize) + override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize) override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/5d799473/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index b8ab9a9..12d2c00 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -253,7 +253,7 @@ class DataTypeSuite extends SparkFunSuite {