svn commit: r36656 - /release/spark/KEYS
Author: lian Date: Wed Nov 6 23:21:34 2019 New Revision: 36656 Log: Update KEYS Modified: release/spark/KEYS Modified: release/spark/KEYS == --- release/spark/KEYS (original) +++ release/spark/KEYS Wed Nov 6 23:21:34 2019 @@ -1050,3 +1050,60 @@ XJ5Dp1pqv9DC6cl9vLSHctRrM2kG =mQLW -END PGP PUBLIC KEY BLOCK- +pub rsa4096 2019-10-22 [SC] [expires: 2029-10-19] + 26959C22EE5D4682049D6526E1B7E0F25E4BF56B +uid [ultimate] Xingbo Jiang (CODE SIGNING KEY) +sub rsa4096 2019-10-22 [E] [expires: 2029-10-19] + +-BEGIN PGP PUBLIC KEY BLOCK- + +mQINBF2uvsQBEAC2Nv1IChLuRTlq/fADcp2Q0dJ4dwsdEZuuGqtius6hiQuJX/6b +1EoVkns+8EVF88mBLGyw5RtYLqVmfPjuObhw0yHX5wO9ilSjrhyVPf1oSAg7TdZg +Lm2Sr4Lc1tzMAS+smtkrsNdARzTHYE5s4gFh5Kq65Y4+aNjbNOWomlNIAJumXO62 +5RGYn0cMaqx2DyFmavUYp2ofAf1OnBlTOhZnVFxkl+EIbbZfuynRcWG2dV1IYbPk ++felLi9EDtps0H0MWxZrwqOdHC5gTBoA10XTGmF0mYiTaT60+FclnBTiGipzhrP8 +O4lIWfzokYe9UjE5MX454tsPPNAHQ1b24AYqZJfBOzOn62yVayVzSy2LvRUinbPp +O7fMmfmMhxQw+TpG5M9wlrxyfNrOwc7kZP8xn0g2MIEPtCEBkkgEB3vrGwvPnIdi +ZikgUi3zCTWRXhODzdSt/spbtQrAJRaB63KbfdIYRO/wJOQ/I7+ytg/UO+SX1+X1 +nbjjoCUkbShaUPCdxUWM/f9D60Kz9yWG8ue0n911H0s2kBSOUNmerhMt9EJbvW04 +YuLZ3Vtt7ju1b+Ol0Nar2y4H8mVh8DAelc9ZXE+T4CaGWaC0ynqAZbg4y4bvdMvm +hsdPP0jGHJYh8fxjGbiNGx513i5kshZ+HX9KxX09lsehfVbgPgayk3XQoQARAQAB +tDhYaW5nYm8gSmlhbmcgKENPREUgU0lHTklORyBLRVkpIDxqaWFuZ3hiMTk4N0Bh +cGFjaGUub3JnPokCVAQTAQgAPhYhBCaVnCLuXUaCBJ1lJuG34PJeS/VrBQJdrr7E +AhsDBQkSzAMABQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAAAoJEOG34PJeS/VraCsQ +AIKYFXrprDCsl/rIc4MhvzLMvibLPWzGgjmkVZ5uJVS9zhGGU6gwatzllbpFeKkV +uXQrPt8kn2ygx/jhGvRr8ku0eKK2xAoEii89Ta2QVxkDMszKUB31OUZKJDN0ytmM +i1Y3NeWFwiKWNh9bCh1voamKHtiVHeB3LgxoKPYfYxTodr+lZFZBjo8ZAZEcNAzQ +iP46XPXCTu0J0jJhdfiP+haofOR49nP7VcD+ALS2AkEwD+2zpkcJLDB/CTEgBxco +xCCAHL7WKSXqU/526R84JXO8zLVOLIDG3//m4cagQknPvux8eTyu+OZTqwFz1gLc +Vtvv5tIH3gotmohbTNzzn+9LQhd18YawW+D+1ie29qqwq/K48K8EXpy9JAnBTJv6 +z1VcQ5HS2Va7U/fqQf9TbHLlZiK1u7XVmhvuVzqgLruAAl5zz0AybjNo9iEa+pSB +7L1MFHx7oEI3aoIiuI39dyrE0JFwUelbfHrpuBgfI/VfFW5vqkBbJrNxb4tSgzcY 
++q37nNp72pZkeL5Kn3m9R4M5GDU6saaGsCpqr802f3AGdDfSI4rKfPFA7Y62S/gI +rs6da1i15sCprQvuYKMIPeUq0YIUoJ0WuHuBX4gF8XfmtGAxriYU0moaGzoCPMuT +dT4Nc4uYWaTVx5SvIo40w88h0rHg9g6ty65ur4hn7G0iuQINBF2uvsQBEADj0Cb4 +EJKqvV2SxysiW2mdDnRN0c+/aX1zB64U5VMJJl2h4crlYd71hDTXwu0Zt0MMAg+8 +HqkjAAp8szsYTSr0kz/9xSnJzuMuTWnaNe9eCii/HTzHSbkN25JJTThNX5hZ0And +wmtFfhelgZczIt9EnblCUqsyPiwL1w4vF1x7a2ftbS+k1n2Q+UIYAMZfsoYIlZzZ +cPKNJdHI0itBPaKBEoOQ3KebCyl91xdXb2elzS1NiID4Rx0dHB6C1w3v8wnBmgaC +IEv7Z7sS1Fr6NHf3pTnsBiyu/xcwE8eznhH0Bqb/i4qSCA8H6DMzwj5JiZ7yMvGd +S2PbUCLKSFaeZOcXY9LN9K9rqztxgCSVBqSZ6IB41HnjKd97hO4yMJqvrelg+KUN +31F3Jp0VBQB1a+j2LLNMJoFcrfG2Tv6R8sx1pwJWySRkdHKX6bB42Dzpd66ReZ0e +3RrcoOWe0jtXr0WwuxP94EZoyRN/iRQ6k3jR6B3n6RBg8CL0qNc2yRGsuXR9MPfE +GHUW/cYvZZbfkORYXSPt8pfdziWnr18a6oagcTGdeNyjVWWsQE39psYNM6HnhBSs +IxkMb7lXTz43MfFk55nrJqE/3LIH3AkJxKr3HOzGbUM832Iiqvg0o0nBRd9B4vLz +mkdtD+K9hCW6G7g+1hzx1Tqaf6n2Q69gv/zq+wARAQABiQI8BBgBCAAmFiEEJpWc +Iu5dRoIEnWUm4bfg8l5L9WsFAl2uvsQCGwwFCRLMAwAACgkQ4bfg8l5L9Wv7fw/+ +LDXns0Bph1X/ph2+kUBWL6KC6Fq+f636HvZfaRQMxs9vPXEBWpcTNFAmjPckmNJ+ +y/VbgeUSKR5ylol5jyTIamAR3J9xI0kQdaN7URnBmn2WLIxiRR3houLBMaJFWOnr +5/4+LJ42R4c1Le3GJOXVfiCustJf0eZTMoAyN4bQHPd2vXb9p/BwQfctTNrvv+ZT +v5D0rUAZO+kmIO9hzfVnQ/RCDPzMZRXImxWFOKczbwUq3By+ZsoPtG0IHjtbSErt +/Rl+CK0vzD008Mq2vEP/OJ1IW32+QrokoWRAam87RAe7YATOvWYsMib3m8Hnh0JY +AwDlgPzhvvymA9eVOscs608ZOtuNzDRTCKTRTv2T6Ktsh7ZBUSU0aXN2nafC+8xA +/SBAeUqH+C9+aHiP8gdAG9xq2MXs8V5oQRVoYriQZuSlM2r/u0DNq8BGcXbguHjn +opBNCzchUWR5X19V0KDN8cW9f22RcrenREJhrrn2WfVT/lvBF1E0X70H8ziMHrrf +fWOysS0CT3qM2hNCa15MH+xsu6Lq56lqzCC5UhR+vTo4SNXSScv2Z5Verf30xCnN +/JuPHYL8Y48/pX+9dYSAali4oDNQ0mI38j+A44ik8C8o6qOjk7qQwUlluCmV62ut +kR7loYvuYi9fxvlaW0kc3Bd10JCHCEmobHno5Gflr9I= +=l4ux +-END PGP PUBLIC KEY BLOCK- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r36655 - /dev/spark/v3.0.0-preview-rc2-bin/ /release/spark/spark-3.0.0-preview/
Author: lian Date: Wed Nov 6 23:15:10 2019 New Revision: 36655 Log: Move 3.0.0-preview artifacts to the release folder Added: release/spark/spark-3.0.0-preview/ - copied from r36654, dev/spark/v3.0.0-preview-rc2-bin/ Removed: dev/spark/v3.0.0-preview-rc2-bin/ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames
Repository: spark Updated Branches: refs/heads/master 344e3aab8 -> 9b33dfc40 [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames ## What changes were proposed in this pull request? (courtesy of liancheng) Spark SQL supports both global aggregation and grouping aggregation. Global aggregation always return a single row with the initial aggregation state as the output, even there are zero input rows. Spark implements this by simply checking the number of grouping keys and treats an aggregation as a global aggregation if it has zero grouping keys. However, this simple principle drops the ball in the following case: ```scala spark.emptyDataFrame.dropDuplicates().agg(count($"*") as "c").show() // +---+ // | c | // +---+ // | 1 | // +---+ ``` The reason is that: 1. `df.dropDuplicates()` is roughly translated into something equivalent to: ```scala val allColumns = df.columns.map { col } df.groupBy(allColumns: _*).agg(allColumns.head, allColumns.tail: _*) ``` This translation is implemented in the rule `ReplaceDeduplicateWithAggregate`. 2. `spark.emptyDataFrame` contains zero columns and zero rows. Therefore, rule `ReplaceDeduplicateWithAggregate` makes a confusing transformation roughly equivalent to the following one: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy().agg(Map.empty[String, String]) ``` The above transformation is confusing because the resulting aggregate operator contains no grouping keys (because `emptyDataFrame` contains no columns), and gets recognized as a global aggregation. As a result, Spark SQL allocates a single row filled by the initial aggregation state and uses it as the output, and returns a wrong result. To fix this issue, this PR tweaks `ReplaceDeduplicateWithAggregate` by appending a literal `1` to the grouping key list of the resulting `Aggregate` operator when the input plan contains zero output columns. 
In this way, `spark.emptyDataFrame.dropDuplicates()` is now translated into a grouping aggregation, roughly depicted as: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy(lit(1)).agg(Map.empty[String, String]) ``` Which is now properly treated as a grouping aggregation and returns the correct answer. ## How was this patch tested? New unit tests added Author: Feng Liu <feng...@databricks.com> Closes #20174 from liufengdb/fix-duplicate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b33dfc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b33dfc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b33dfc4 Branch: refs/heads/master Commit: 9b33dfc408de986f4203bb0ac0c3f5c56effd69d Parents: 344e3aa Author: Feng Liu <feng...@databricks.com> Authored: Wed Jan 10 14:25:04 2018 -0800 Committer: Cheng Lian <lian.cs@gmail.com> Committed: Wed Jan 10 14:25:04 2018 -0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 8 ++- .../optimizer/ReplaceOperatorSuite.scala| 10 +++- .../spark/sql/DataFrameAggregateSuite.scala | 24 ++-- 3 files changed, 38 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9b33dfc4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index df0af82..c794ba8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1222,7 +1222,13 @@ object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] { Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId) } } - Aggregate(keys, aggCols, child) + // SPARK-22951: Physical aggregate operators distinguishes 
global aggregation and grouping + // aggregations by checking the number of grouping keys. The key difference here is that a + // global aggregation always returns at least one row even if there are no input rows. Here + // we append a literal when the grouping key list is empty so that the result aggregate + // operator is properly treated as a grouping aggregation. + val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys + Aggregate(nonemptyKeys, aggCols, child) } } http://git-wip-us.apache.org/repos/asf/spark/blob/9b33dfc4/sql/catalyst/src/test/scala/org/apache/spark/sq
spark git commit: [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames
Repository: spark Updated Branches: refs/heads/branch-2.3 5b5851cb6 -> eb4fa551e [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames ## What changes were proposed in this pull request? (courtesy of liancheng) Spark SQL supports both global aggregation and grouping aggregation. Global aggregation always return a single row with the initial aggregation state as the output, even there are zero input rows. Spark implements this by simply checking the number of grouping keys and treats an aggregation as a global aggregation if it has zero grouping keys. However, this simple principle drops the ball in the following case: ```scala spark.emptyDataFrame.dropDuplicates().agg(count($"*") as "c").show() // +---+ // | c | // +---+ // | 1 | // +---+ ``` The reason is that: 1. `df.dropDuplicates()` is roughly translated into something equivalent to: ```scala val allColumns = df.columns.map { col } df.groupBy(allColumns: _*).agg(allColumns.head, allColumns.tail: _*) ``` This translation is implemented in the rule `ReplaceDeduplicateWithAggregate`. 2. `spark.emptyDataFrame` contains zero columns and zero rows. Therefore, rule `ReplaceDeduplicateWithAggregate` makes a confusing transformation roughly equivalent to the following one: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy().agg(Map.empty[String, String]) ``` The above transformation is confusing because the resulting aggregate operator contains no grouping keys (because `emptyDataFrame` contains no columns), and gets recognized as a global aggregation. As a result, Spark SQL allocates a single row filled by the initial aggregation state and uses it as the output, and returns a wrong result. To fix this issue, this PR tweaks `ReplaceDeduplicateWithAggregate` by appending a literal `1` to the grouping key list of the resulting `Aggregate` operator when the input plan contains zero output columns. 
In this way, `spark.emptyDataFrame.dropDuplicates()` is now translated into a grouping aggregation, roughly depicted as: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy(lit(1)).agg(Map.empty[String, String]) ``` Which is now properly treated as a grouping aggregation and returns the correct answer. ## How was this patch tested? New unit tests added Author: Feng Liu <feng...@databricks.com> Closes #20174 from liufengdb/fix-duplicate. (cherry picked from commit 9b33dfc408de986f4203bb0ac0c3f5c56effd69d) Signed-off-by: Cheng Lian <lian.cs@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb4fa551 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb4fa551 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb4fa551 Branch: refs/heads/branch-2.3 Commit: eb4fa551e60800269a939b2c1c0ad69e3a801264 Parents: 5b5851c Author: Feng Liu <feng...@databricks.com> Authored: Wed Jan 10 14:25:04 2018 -0800 Committer: Cheng Lian <lian.cs@gmail.com> Committed: Wed Jan 10 14:25:33 2018 -0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 8 ++- .../optimizer/ReplaceOperatorSuite.scala| 10 +++- .../spark/sql/DataFrameAggregateSuite.scala | 24 ++-- 3 files changed, 38 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eb4fa551/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index df0af82..c794ba8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1222,7 +1222,13 @@ object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] { Alias(new First(attr).toAggregateExpression(), 
attr.name)(attr.exprId) } } - Aggregate(keys, aggCols, child) + // SPARK-22951: Physical aggregate operators distinguishes global aggregation and grouping + // aggregations by checking the number of grouping keys. The key difference here is that a + // global aggregation always returns at least one row even if there are no input rows. Here + // we append a literal when the grouping key list is empty so that the result aggregate + // operator is properly treated as a grouping aggregation. + val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys + Aggregate(nonemptyKeys, aggCols, child) } } http://git-wip-us.apache.org/repo
spark git commit: [SPARK-17528][SQL][FOLLOWUP] remove unnecessary data copy in object hash aggregate
Repository: spark Updated Branches: refs/heads/master 481f07929 -> 86664338f [SPARK-17528][SQL][FOLLOWUP] remove unnecessary data copy in object hash aggregate ## What changes were proposed in this pull request? In #18483 , we fixed the data copy bug when saving into `InternalRow`, and removed all workarounds for this bug in the aggregate code path. However, the object hash aggregate was missed, this PR fixes it. This patch is also a requirement for #17419 , which shows that DataFrame version is slower than RDD version because of this issue. ## How was this patch tested? existing tests Author: Wenchen Fan <wenc...@databricks.com> Closes #18712 from cloud-fan/minor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/86664338 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/86664338 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/86664338 Branch: refs/heads/master Commit: 86664338f25f58b2f59db93b68cd57de671a4c0b Parents: 481f079 Author: Wenchen Fan <wenc...@databricks.com> Authored: Mon Jul 24 10:18:28 2017 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 24 10:18:28 2017 -0700 -- .../aggregate/ObjectAggregationIterator.scala | 20 1 file changed, 4 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/86664338/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala index 6e47f9d..eef2c4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala @@ -70,10 +70,6 @@ class ObjectAggregationIterator( generateProcessRow(newExpressions, 
newFunctions, newInputAttributes) } - // A safe projection used to do deep clone of input rows to prevent false sharing. - private[this] val safeProjection: Projection = -FromUnsafeProjection(outputAttributes.map(_.dataType)) - /** * Start processing input rows. */ @@ -151,12 +147,11 @@ class ObjectAggregationIterator( val groupingKey = groupingProjection.apply(null) val buffer: InternalRow = getAggregationBufferByKey(hashMap, groupingKey) while (inputRows.hasNext) { -val newInput = safeProjection(inputRows.next()) -processRow(buffer, newInput) +processRow(buffer, inputRows.next()) } } else { while (inputRows.hasNext && !sortBased) { -val newInput = safeProjection(inputRows.next()) +val newInput = inputRows.next() val groupingKey = groupingProjection.apply(newInput) val buffer: InternalRow = getAggregationBufferByKey(hashMap, groupingKey) processRow(buffer, newInput) @@ -266,9 +261,7 @@ class SortBasedAggregator( // Firstly, update the aggregation buffer with input rows. while (hasNextInput && groupingKeyOrdering.compare(inputIterator.getKey, groupingKey) == 0) { -// Since `inputIterator.getValue` is an `UnsafeRow` whose underlying buffer will be -// overwritten when `inputIterator` steps forward, we need to do a deep copy here. -processRow(result.aggregationBuffer, inputIterator.getValue.copy()) +processRow(result.aggregationBuffer, inputIterator.getValue) hasNextInput = inputIterator.next() } @@ -277,12 +270,7 @@ class SortBasedAggregator( // be called after calling processRow. while (hasNextAggBuffer && groupingKeyOrdering.compare(initialAggBufferIterator.getKey, groupingKey) == 0) { -mergeAggregationBuffers( - result.aggregationBuffer, - // Since `inputIterator.getValue` is an `UnsafeRow` whose underlying buffer will be - // overwritten when `inputIterator` steps forward, we need to do a deep copy here. 
- initialAggBufferIterator.getValue.copy() -) +mergeAggregationBuffers(result.aggregationBuffer, initialAggBufferIterator.getValue) hasNextAggBuffer = initialAggBufferIterator.next() } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20132][DOCS] Add documentation for column string functions
Repository: spark Updated Branches: refs/heads/branch-2.2 24fffacad -> f59c74a94 [SPARK-20132][DOCS] Add documentation for column string functions ## What changes were proposed in this pull request? Add docstrings to column.py for the Column functions `rlike`, `like`, `startswith`, and `endswith`. Pass these docstrings through `_bin_op` There may be a better place to put the docstrings. I put them immediately above the Column class. ## How was this patch tested? I ran `make html` on my local computer to remake the documentation, and verified that the html pages were displaying the docstrings correctly. I tried running `dev-tests`, and the formatting tests passed. However, my mvn build didn't work I think due to issues on my computer. These docstrings are my original work and free license. davies has done the most recent work reorganizing `_bin_op` Author: Michael Patterson <map...@gmail.com> Closes #17469 from map222/patterson-documentation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f59c74a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f59c74a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f59c74a9 Branch: refs/heads/branch-2.2 Commit: f59c74a9460b0db4e6c3ecbe872e2eaadc43e2cc Parents: 24fffac Author: Michael Patterson <map...@gmail.com> Authored: Sat Apr 22 19:58:54 2017 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri May 5 13:26:49 2017 -0700 -- python/pyspark/sql/column.py | 70 +++ 1 file changed, 64 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f59c74a9/python/pyspark/sql/column.py -- diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index ec05c18..46c1707 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -250,11 +250,50 @@ class Column(object): raise TypeError("Column is not iterable") # string methods +_rlike_doc = """ +Return a Boolean :class:`Column` based 
on a regex match. + +:param other: an extended regex expression + +>>> df.filter(df.name.rlike('ice$')).collect() +[Row(age=2, name=u'Alice')] +""" +_like_doc = """ +Return a Boolean :class:`Column` based on a SQL LIKE match. + +:param other: a SQL LIKE pattern + +See :func:`rlike` for a regex version + +>>> df.filter(df.name.like('Al%')).collect() +[Row(age=2, name=u'Alice')] +""" +_startswith_doc = """ +Return a Boolean :class:`Column` based on a string match. + +:param other: string at end of line (do not use a regex `^`) + +>>> df.filter(df.name.startswith('Al')).collect() +[Row(age=2, name=u'Alice')] +>>> df.filter(df.name.startswith('^Al')).collect() +[] +""" +_endswith_doc = """ +Return a Boolean :class:`Column` based on matching end of string. + +:param other: string at end of line (do not use a regex `$`) + +>>> df.filter(df.name.endswith('ice')).collect() +[Row(age=2, name=u'Alice')] +>>> df.filter(df.name.endswith('ice$')).collect() +[] +""" + contains = _bin_op("contains") -rlike = _bin_op("rlike") -like = _bin_op("like") -startswith = _bin_op("startsWith") -endswith = _bin_op("endsWith") +rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) +like = ignore_unicode_prefix(_bin_op("like", _like_doc)) +startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc)) +endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc)) @ignore_unicode_prefix @since(1.3) @@ -303,8 +342,27 @@ class Column(object): desc = _unary_op("desc", "Returns a sort expression based on the" " descending order of the given column name.") -isNull = _unary_op("isNull", "True if the current expression is null.") -isNotNull = _unary_op("isNotNull", "True if the current expression is not null.") +_isNull_doc = """ +True if the current expression is null. Often combined with +:func:`DataFrame.filter` to select rows with null values. 
+ +>>> from pyspark.sql import Row +>>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF() +>>> df2.filter(df2.height.isNull()).collect() +[Row(height=None, na
spark git commit: [SPARK-19716][SQL] support by-name resolution for struct type elements in array
Repository: spark Updated Branches: refs/heads/master 402bf2a50 -> 295747e59 [SPARK-19716][SQL] support by-name resolution for struct type elements in array ## What changes were proposed in this pull request? Previously when we construct deserializer expression for array type, we will first cast the corresponding field to expected array type and then apply `MapObjects`. However, by doing that, we lose the opportunity to do by-name resolution for struct type inside array type. In this PR, I introduce a `UnresolvedMapObjects` to hold the lambda function and the input array expression. Then during analysis, after the input array expression is resolved, we get the actual array element type and apply by-name resolution. Then we don't need to add `Cast` for array type when constructing the deserializer expression, as the element type is determined later at analyzer. ## How was this patch tested? new regression test Author: Wenchen Fan <wenc...@databricks.com> Closes #17398 from cloud-fan/dataset. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/295747e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/295747e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/295747e5 Branch: refs/heads/master Commit: 295747e59739ee8a697ac3eba485d3439e4a04c3 Parents: 402bf2a Author: Wenchen Fan <wenc...@databricks.com> Authored: Tue Apr 4 16:38:32 2017 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Apr 4 16:38:32 2017 -0700 -- .../spark/sql/catalyst/ScalaReflection.scala| 66 +++- .../spark/sql/catalyst/analysis/Analyzer.scala | 19 +- .../expressions/complexTypeExtractors.scala | 2 +- .../catalyst/expressions/objects/objects.scala | 32 +++--- .../encoders/EncoderResolutionSuite.scala | 52 +++ .../sql/expressions/ReduceAggregator.scala | 2 +- .../org/apache/spark/sql/DatasetSuite.scala | 9 +++ 7 files changed, 141 insertions(+), 41 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/295747e5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index da37eb0..206ae2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -92,7 +92,7 @@ object ScalaReflection extends ScalaReflection { * Array[T]. Special handling is performed for primitive types to map them back to their raw * JVM form instead of the Scala Array that handles auto boxing. 
*/ - private def arrayClassFor(tpe: `Type`): DataType = ScalaReflectionLock.synchronized { + private def arrayClassFor(tpe: `Type`): ObjectType = ScalaReflectionLock.synchronized { val cls = tpe match { case t if t <:< definitions.IntTpe => classOf[Array[Int]] case t if t <:< definitions.LongTpe => classOf[Array[Long]] @@ -178,15 +178,17 @@ object ScalaReflection extends ScalaReflection { * is [a: int, b: long], then we will hit runtime error and say that we can't construct class * `Data` with int and long, because we lost the information that `b` should be a string. * - * This method help us "remember" the required data type by adding a `UpCast`. Note that we - * don't need to cast struct type because there must be `UnresolvedExtractValue` or - * `GetStructField` wrapping it, thus we only need to handle leaf type. + * This method help us "remember" the required data type by adding a `UpCast`. Note that we + * only need to do this for leaf nodes. */ def upCastToExpectedType( expr: Expression, expected: DataType, walkedTypePath: Seq[String]): Expression = expected match { case _: StructType => expr + case _: ArrayType => expr + // TODO: ideally we should also skip MapType, but nested StructType inside MapType is rare and + // it's not trivial to support by-name resolution for StructType inside MapType. case _ => UpCast(expr, expected, walkedTypePath) } @@ -265,42 +267,48 @@ object ScalaReflection extends ScalaReflection { case t if t <:< localTypeOf[Array[_]] => val TypeRef(_, _, Seq(elementType)) = t +val Schema(_, elementNullable) = schemaFor(elementType) +val className = getClassNameFromType(elementType) +val newTypePath = s"""- array element class: "$className&q
spark git commit: [SPARK-20125][SQL] Dataset of type option of map does not work
Repository: spark Updated Branches: refs/heads/branch-2.1 4bcb7d676 -> fd2e40614 [SPARK-20125][SQL] Dataset of type option of map does not work When we build the deserializer expression for map type, we will use `StaticInvoke` to call `ArrayBasedMapData.toScalaMap`, and declare the return type as `scala.collection.immutable.Map`. If the map is inside an Option, we will wrap this `StaticInvoke` with `WrapOption`, which requires the input to be `scala.collect.Map`. Ideally this should be fine, as `scala.collection.immutable.Map` extends `scala.collect.Map`, but our `ObjectType` is too strict about this, this PR fixes it. new regression test Author: Wenchen Fan <wenc...@databricks.com> Closes #17454 from cloud-fan/map. (cherry picked from commit d4fac410e0554b7ccd44be44b7ce2fe07ed7f206) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd2e4061 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd2e4061 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd2e4061 Branch: refs/heads/branch-2.1 Commit: fd2e40614b511fb9ef3e52cc1351659fdbfd612a Parents: 4bcb7d6 Author: Wenchen Fan <wenc...@databricks.com> Authored: Tue Mar 28 11:47:43 2017 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Mar 28 12:36:27 2017 -0700 -- .../src/main/scala/org/apache/spark/sql/types/ObjectType.scala | 5 + .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fd2e4061/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala index b18fba2..2d49fe0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -44,4 +44,9 @@ case class ObjectType(cls: Class[_]) extends DataType { def asNullable: DataType = this override def simpleString: String = cls.getName + + override def acceptsType(other: DataType): Boolean = other match { +case ObjectType(otherCls) => cls.isAssignableFrom(otherCls) +case _ => false + } } http://git-wip-us.apache.org/repos/asf/spark/blob/fd2e4061/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 381652d..9cc49b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1072,10 +1072,16 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val ds2 = Seq(WithMap("hi", Map(42L -> "foo"))).toDS checkDataset(ds2.map(t => t), WithMap("hi", Map(42L -> "foo"))) } + + test("SPARK-20125: option of map") { +val ds = Seq(WithMapInOption(Some(Map(1 -> 1.toDS() +checkDataset(ds, WithMapInOption(Some(Map(1 -> 1 + } } case class WithImmutableMap(id: String, map_test: scala.collection.immutable.Map[Long, String]) case class WithMap(id: String, map_test: scala.collection.Map[Long, String]) +case class WithMapInOption(m: Option[scala.collection.Map[Int, Int]]) case class Generic[T](id: T, value: Double) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20125][SQL] Dataset of type option of map does not work
Repository: spark Updated Branches: refs/heads/master 17eddb35a -> d4fac410e [SPARK-20125][SQL] Dataset of type option of map does not work ## What changes were proposed in this pull request? When we build the deserializer expression for map type, we will use `StaticInvoke` to call `ArrayBasedMapData.toScalaMap`, and declare the return type as `scala.collection.immutable.Map`. If the map is inside an Option, we will wrap this `StaticInvoke` with `WrapOption`, which requires the input to be `scala.collect.Map`. Ideally this should be fine, as `scala.collection.immutable.Map` extends `scala.collect.Map`, but our `ObjectType` is too strict about this, this PR fixes it. ## How was this patch tested? new regression test Author: Wenchen Fan <wenc...@databricks.com> Closes #17454 from cloud-fan/map. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4fac410 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4fac410 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4fac410 Branch: refs/heads/master Commit: d4fac410e0554b7ccd44be44b7ce2fe07ed7f206 Parents: 17eddb3 Author: Wenchen Fan <wenc...@databricks.com> Authored: Tue Mar 28 11:47:43 2017 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Mar 28 11:47:43 2017 -0700 -- .../src/main/scala/org/apache/spark/sql/types/ObjectType.scala | 5 + .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4fac410/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala index b18fba2..2d49fe0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -44,4 +44,9 @@ 
case class ObjectType(cls: Class[_]) extends DataType { def asNullable: DataType = this override def simpleString: String = cls.getName + + override def acceptsType(other: DataType): Boolean = other match { +case ObjectType(otherCls) => cls.isAssignableFrom(otherCls) +case _ => false + } } http://git-wip-us.apache.org/repos/asf/spark/blob/d4fac410/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 6417e7a..68e071a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1154,10 +1154,16 @@ class DatasetSuite extends QueryTest with SharedSQLContext { assert(errMsg3.getMessage.startsWith("cannot have circular references in class, but got the " + "circular reference of class")) } + + test("SPARK-20125: option of map") { +val ds = Seq(WithMapInOption(Some(Map(1 -> 1.toDS() +checkDataset(ds, WithMapInOption(Some(Map(1 -> 1 + } } case class WithImmutableMap(id: String, map_test: scala.collection.immutable.Map[Long, String]) case class WithMap(id: String, map_test: scala.collection.Map[Long, String]) +case class WithMapInOption(m: Option[scala.collection.Map[Int, Int]]) case class Generic[T](id: T, value: Double) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19737][SQL] New analysis rule for reporting unregistered functions without relying on relation resolution
Repository: spark Updated Branches: refs/heads/master 2a0bc867a -> 339b53a13 [SPARK-19737][SQL] New analysis rule for reporting unregistered functions without relying on relation resolution ## What changes were proposed in this pull request? This PR adds a new `Once` analysis rule batch consists of a single analysis rule `LookupFunctions` that performs simple existence check over `UnresolvedFunctions` without actually resolving them. The benefit of this rule is that it doesn't require function arguments to be resolved first and therefore doesn't rely on relation resolution, which may incur potentially expensive partition/schema discovery cost. Please refer to [SPARK-19737][1] for more details about the motivation. ## How was this patch tested? New test case added in `AnalysisErrorSuite`. [1]: https://issues.apache.org/jira/browse/SPARK-19737 Author: Cheng Lian <l...@databricks.com> Closes #17168 from liancheng/spark-19737-lookup-functions. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/339b53a1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/339b53a1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/339b53a1 Branch: refs/heads/master Commit: 339b53a1311e08521d84a83c94201fcf3c766fb2 Parents: 2a0bc86 Author: Cheng Lian <l...@databricks.com> Authored: Mon Mar 6 10:36:50 2017 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Mar 6 10:36:50 2017 -0800 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 21 ++ .../catalyst/catalog/SessionCatalogSuite.scala | 23 +++- .../spark/sql/hive/HiveSessionCatalog.scala | 5 + 3 files changed, 48 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/339b53a1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6d569b6..2f8489d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -117,6 +117,8 @@ class Analyzer( Batch("Hints", fixedPoint, new ResolveHints.ResolveBroadcastHints(conf), ResolveHints.RemoveAllHints), +Batch("Simple Sanity Check", Once, + LookupFunctions), Batch("Substitution", fixedPoint, CTESubstitution, WindowsSubstitution, @@ -1039,6 +1041,25 @@ class Analyzer( } /** + * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the + * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]. It + * only performs simple existence check according to the function identifier to quickly identify + * undefined functions without triggering relation resolution, which may incur potentially + * expensive partition/schema discovery process in some cases. + * + * @see [[ResolveFunctions]] + * @see https://issues.apache.org/jira/browse/SPARK-19737 + */ + object LookupFunctions extends Rule[LogicalPlan] { +override def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions { + case f: UnresolvedFunction if !catalog.functionExists(f.name) => +withPosition(f) { + throw new NoSuchFunctionException(f.name.database.getOrElse("default"), f.name.funcName) +} +} + } + + /** * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s. 
*/ object ResolveFunctions extends Rule[LogicalPlan] { http://git-wip-us.apache.org/repos/asf/spark/blob/339b53a1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index a755231..ffc272c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, SimpleCatalystConf, TableIdentifier} import org.apache.spark.sql.catalyst.analys
spark git commit: [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly()
Repository: spark Updated Branches: refs/heads/branch-2.0 23050c8a1 -> f50c4372c [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly() This patch replaces a single `awaitUninterruptibly()` call with a plain `await()` call in Spark's `network-common` library in order to fix a bug which may cause tasks to be uncancellable. In Spark's Netty RPC layer, `TransportClientFactory.createClient()` calls `awaitUninterruptibly()` on a Netty future while waiting for a connection to be established. This creates problem when a Spark task is interrupted while blocking in this call (which can happen in the event of a slow connection which will eventually time out). This has bad impacts on task cancellation when `interruptOnCancel = true`. As an example of the impact of this problem, I experienced significant numbers of uncancellable "zombie tasks" on a production cluster where several tasks were blocked trying to connect to a dead shuffle server and then continued running as zombies after I cancelled the associated Spark stage. 
The zombie tasks ran for several minutes with the following stack: ``` java.lang.Object.wait(Native Method) java.lang.Object.wait(Object.java:460) io.netty.util.concurrent.DefaultPromise.await0(DefaultPromise.java:607) io.netty.util.concurrent.DefaultPromise.awaitUninterruptibly(DefaultPromise.java:301) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:224) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179) => holding Monitor(java.lang.Object1849476028}) org.apache.spark.network.shuffle.ExternalShuffleClient$1.createAndStart(ExternalShuffleClient.java:105) org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) org.apache.spark.network.shuffle.RetryingBlockFetcher.start(RetryingBlockFetcher.java:120) org.apache.spark.network.shuffle.ExternalShuffleClient.fetchBlocks(ExternalShuffleClient.java:114) org.apache.spark.storage.ShuffleBlockFetcherIterator.sendRequest(ShuffleBlockFetcherIterator.scala:169) org.apache.spark.storage.ShuffleBlockFetcherIterator.fetchUpToMaxBytes(ShuffleBlockFetcherIterator.scala: 350) org.apache.spark.storage.ShuffleBlockFetcherIterator.initialize(ShuffleBlockFetcherIterator.scala:286) org.apache.spark.storage.ShuffleBlockFetcherIterator.(ShuffleBlockFetcherIterator.scala:120) org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:45) org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:169) org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) org.apache.spark.rdd.RDD.iterator(RDD.scala:287) [...] ``` As far as I can tell, `awaitUninterruptibly()` might have been used in order to avoid having to declare that methods throw `InterruptedException` (this code is written in Java, hence the need to use checked exceptions). This patch simply replaces this with a regular, interruptible `await()` call,. 
This required several interface changes to declare a new checked exception (these are internal interfaces, though, and this change doesn't significantly impact binary compatibility). An alternative approach would be to wrap `InterruptedException` into `IOException` in order to avoid having to change interfaces. The problem with this approach is that the `network-shuffle` project's `RetryingBlockFetcher` code treats `IOExceptions` as transitive failures when deciding whether to retry fetches, so throwing a wrapped `IOException` might cause an interrupted shuffle fetch to be retried, further prolonging the lifetime of a cancelled zombie task. Note that there are three other `awaitUninterruptibly()` in the codebase, but those calls have a hard 10 second timeout and are waiting on a `close()` operation which is expected to complete near instantaneously, so the impact of uninterruptibility there is much smaller. Manually. Author: Josh Rosen <joshro...@databricks.com> Closes #16866 from JoshRosen/SPARK-19529. (cherry picked from commit 1c4d10b10c78d138b55e381ec6828e04fef70d6f) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f50c4372 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f50c4372 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f50c4372 Branch: refs/heads/branch-2.0 Commit: f50c4372c3ebd91c0f6c094a7c4d1dd08f3cdb30 Parents: 23050c8 Author: Josh Rosen <joshro...@databricks.com> Authored: Mon Feb 13 11:04:27 2017 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Feb 13 12:57:29 2017 -0800 -- .../network/client/TransportClientFactory.java | 10 ++ .../spark/network/TransportClientFactorySuite.java | 6 -- .../network/shuf
spark git commit: [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly()
Repository: spark Updated Branches: refs/heads/branch-2.1 2968d8c06 -> 5db234730 [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly() This patch replaces a single `awaitUninterruptibly()` call with a plain `await()` call in Spark's `network-common` library in order to fix a bug which may cause tasks to be uncancellable. In Spark's Netty RPC layer, `TransportClientFactory.createClient()` calls `awaitUninterruptibly()` on a Netty future while waiting for a connection to be established. This creates problem when a Spark task is interrupted while blocking in this call (which can happen in the event of a slow connection which will eventually time out). This has bad impacts on task cancellation when `interruptOnCancel = true`. As an example of the impact of this problem, I experienced significant numbers of uncancellable "zombie tasks" on a production cluster where several tasks were blocked trying to connect to a dead shuffle server and then continued running as zombies after I cancelled the associated Spark stage. 
The zombie tasks ran for several minutes with the following stack: ``` java.lang.Object.wait(Native Method) java.lang.Object.wait(Object.java:460) io.netty.util.concurrent.DefaultPromise.await0(DefaultPromise.java:607) io.netty.util.concurrent.DefaultPromise.awaitUninterruptibly(DefaultPromise.java:301) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:224) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179) => holding Monitor(java.lang.Object1849476028}) org.apache.spark.network.shuffle.ExternalShuffleClient$1.createAndStart(ExternalShuffleClient.java:105) org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) org.apache.spark.network.shuffle.RetryingBlockFetcher.start(RetryingBlockFetcher.java:120) org.apache.spark.network.shuffle.ExternalShuffleClient.fetchBlocks(ExternalShuffleClient.java:114) org.apache.spark.storage.ShuffleBlockFetcherIterator.sendRequest(ShuffleBlockFetcherIterator.scala:169) org.apache.spark.storage.ShuffleBlockFetcherIterator.fetchUpToMaxBytes(ShuffleBlockFetcherIterator.scala: 350) org.apache.spark.storage.ShuffleBlockFetcherIterator.initialize(ShuffleBlockFetcherIterator.scala:286) org.apache.spark.storage.ShuffleBlockFetcherIterator.(ShuffleBlockFetcherIterator.scala:120) org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:45) org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:169) org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) org.apache.spark.rdd.RDD.iterator(RDD.scala:287) [...] ``` As far as I can tell, `awaitUninterruptibly()` might have been used in order to avoid having to declare that methods throw `InterruptedException` (this code is written in Java, hence the need to use checked exceptions). This patch simply replaces this with a regular, interruptible `await()` call,. 
This required several interface changes to declare a new checked exception (these are internal interfaces, though, and this change doesn't significantly impact binary compatibility). An alternative approach would be to wrap `InterruptedException` into `IOException` in order to avoid having to change interfaces. The problem with this approach is that the `network-shuffle` project's `RetryingBlockFetcher` code treats `IOExceptions` as transitive failures when deciding whether to retry fetches, so throwing a wrapped `IOException` might cause an interrupted shuffle fetch to be retried, further prolonging the lifetime of a cancelled zombie task. Note that there are three other `awaitUninterruptibly()` in the codebase, but those calls have a hard 10 second timeout and are waiting on a `close()` operation which is expected to complete near instantaneously, so the impact of uninterruptibility there is much smaller. Manually. Author: Josh Rosen <joshro...@databricks.com> Closes #16866 from JoshRosen/SPARK-19529. (cherry picked from commit 1c4d10b10c78d138b55e381ec6828e04fef70d6f) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5db23473 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5db23473 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5db23473 Branch: refs/heads/branch-2.1 Commit: 5db23473008a58fb9a7f77ad8b01bcdc2c5f2d9c Parents: 2968d8c Author: Josh Rosen <joshro...@databricks.com> Authored: Mon Feb 13 11:04:27 2017 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Feb 13 12:49:37 2017 -0800 -- .../network/client/TransportClientFactory.java | 10 ++ .../spark/network/TransportClientFactorySuite.java | 6 -- .../network/shuf
spark git commit: [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly()
Repository: spark Updated Branches: refs/heads/master ab88b2410 -> 1c4d10b10 [SPARK-19529] TransportClientFactory.createClient() shouldn't call awaitUninterruptibly() ## What changes were proposed in this pull request? This patch replaces a single `awaitUninterruptibly()` call with a plain `await()` call in Spark's `network-common` library in order to fix a bug which may cause tasks to be uncancellable. In Spark's Netty RPC layer, `TransportClientFactory.createClient()` calls `awaitUninterruptibly()` on a Netty future while waiting for a connection to be established. This creates problem when a Spark task is interrupted while blocking in this call (which can happen in the event of a slow connection which will eventually time out). This has bad impacts on task cancellation when `interruptOnCancel = true`. As an example of the impact of this problem, I experienced significant numbers of uncancellable "zombie tasks" on a production cluster where several tasks were blocked trying to connect to a dead shuffle server and then continued running as zombies after I cancelled the associated Spark stage. 
The zombie tasks ran for several minutes with the following stack: ``` java.lang.Object.wait(Native Method) java.lang.Object.wait(Object.java:460) io.netty.util.concurrent.DefaultPromise.await0(DefaultPromise.java:607) io.netty.util.concurrent.DefaultPromise.awaitUninterruptibly(DefaultPromise.java:301) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:224) org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179) => holding Monitor(java.lang.Object1849476028}) org.apache.spark.network.shuffle.ExternalShuffleClient$1.createAndStart(ExternalShuffleClient.java:105) org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) org.apache.spark.network.shuffle.RetryingBlockFetcher.start(RetryingBlockFetcher.java:120) org.apache.spark.network.shuffle.ExternalShuffleClient.fetchBlocks(ExternalShuffleClient.java:114) org.apache.spark.storage.ShuffleBlockFetcherIterator.sendRequest(ShuffleBlockFetcherIterator.scala:169) org.apache.spark.storage.ShuffleBlockFetcherIterator.fetchUpToMaxBytes(ShuffleBlockFetcherIterator.scala: 350) org.apache.spark.storage.ShuffleBlockFetcherIterator.initialize(ShuffleBlockFetcherIterator.scala:286) org.apache.spark.storage.ShuffleBlockFetcherIterator.(ShuffleBlockFetcherIterator.scala:120) org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:45) org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:169) org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) org.apache.spark.rdd.RDD.iterator(RDD.scala:287) [...] ``` As far as I can tell, `awaitUninterruptibly()` might have been used in order to avoid having to declare that methods throw `InterruptedException` (this code is written in Java, hence the need to use checked exceptions). This patch simply replaces this with a regular, interruptible `await()` call,. 
This required several interface changes to declare a new checked exception (these are internal interfaces, though, and this change doesn't significantly impact binary compatibility). An alternative approach would be to wrap `InterruptedException` into `IOException` in order to avoid having to change interfaces. The problem with this approach is that the `network-shuffle` project's `RetryingBlockFetcher` code treats `IOExceptions` as transitive failures when deciding whether to retry fetches, so throwing a wrapped `IOException` might cause an interrupted shuffle fetch to be retried, further prolonging the lifetime of a cancelled zombie task. Note that there are three other `awaitUninterruptibly()` in the codebase, but those calls have a hard 10 second timeout and are waiting on a `close()` operation which is expected to complete near instantaneously, so the impact of uninterruptibility there is much smaller. ## How was this patch tested? Manually. Author: Josh Rosen <joshro...@databricks.com> Closes #16866 from JoshRosen/SPARK-19529. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1c4d10b1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1c4d10b1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1c4d10b1 Branch: refs/heads/master Commit: 1c4d10b10c78d138b55e381ec6828e04fef70d6f Parents: ab88b24 Author: Josh Rosen <joshro...@databricks.com> Authored: Mon Feb 13 11:04:27 2017 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Feb 13 11:04:27 2017 -0800 -- .../network/client/TransportClientFactory.java | 10 ++ .../spark/network/TransportClientFactorySuite.java | 6 -- .../network/shuffle/ExternalShuffleClient.java |
spark git commit: [SPARK-18717][SQL] Make code generation for Scala Map work with immutable.Map also
Repository: spark Updated Branches: refs/heads/branch-2.1 7b5ea000e -> e580bb035 [SPARK-18717][SQL] Make code generation for Scala Map work with immutable.Map also ## What changes were proposed in this pull request? Fixes compile errors in generated code when user has case class with a `scala.collections.immutable.Map` instead of a `scala.collections.Map`. Since ArrayBasedMapData.toScalaMap returns the immutable version we can make it work with both. ## How was this patch tested? Additional unit tests. Author: Andrew Ray <ray.and...@gmail.com> Closes #16161 from aray/fix-map-codegen. (cherry picked from commit 46d30ac4846b3ec94426cc482c42cff72ebd6d92) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e580bb03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e580bb03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e580bb03 Branch: refs/heads/branch-2.1 Commit: e580bb035236dd92ade126af6bb98288d88179c4 Parents: 7b5ea00 Author: Andrew Ray <ray.and...@gmail.com> Authored: Tue Dec 13 15:49:22 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Feb 10 16:10:16 2017 -0800 -- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 2 +- .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 12 2 files changed, 13 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e580bb03/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 0aa21b9..fa1b900 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -342,7 +342,7 @@ object ScalaReflection extends ScalaReflection { 
StaticInvoke( ArrayBasedMapData.getClass, - ObjectType(classOf[Map[_, _]]), + ObjectType(classOf[scala.collection.immutable.Map[_, _]]), "toScalaMap", keyData :: valueData :: Nil) http://git-wip-us.apache.org/repos/asf/spark/blob/e580bb03/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index bdf6264..381652d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1063,8 +1063,20 @@ class DatasetSuite extends QueryTest with SharedSQLContext { // sizeInBytes is 2404280404, before the fix, it overflows to a negative number assert(sizeInBytes > 0) } + + test("SPARK-18717: code generation works for both scala.collection.Map" + +" and scala.collection.imutable.Map") { +val ds = Seq(WithImmutableMap("hi", Map(42L -> "foo"))).toDS +checkDataset(ds.map(t => t), WithImmutableMap("hi", Map(42L -> "foo"))) + +val ds2 = Seq(WithMap("hi", Map(42L -> "foo"))).toDS +checkDataset(ds2.map(t => t), WithMap("hi", Map(42L -> "foo"))) + } } +case class WithImmutableMap(id: String, map_test: scala.collection.immutable.Map[Long, String]) +case class WithMap(id: String, map_test: scala.collection.Map[Long, String]) + case class Generic[T](id: T, value: Double) case class OtherTuple(_1: String, _2: Int) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19016][SQL][DOC] Document scalable partition handling
Repository: spark Updated Branches: refs/heads/branch-2.1 47ab4afed -> 20ae11722 [SPARK-19016][SQL][DOC] Document scalable partition handling This PR documents the scalable partition handling feature in the body of the programming guide. Before this PR, we only mention it in the migration guide. It's not super clear that external datasource tables require an extra `MSCK REPAIR TABLE` command is to have per-partition information persisted since 2.1. N/A. Author: Cheng Lian <l...@databricks.com> Closes #16424 from liancheng/scalable-partition-handling-doc. (cherry picked from commit 871f6114ac0075a1b45eda8701113fa20d647de9) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20ae1172 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20ae1172 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20ae1172 Branch: refs/heads/branch-2.1 Commit: 20ae11722d82cf3cdaa8c4023e37c1416664917d Parents: 47ab4af Author: Cheng Lian <l...@databricks.com> Authored: Fri Dec 30 14:46:30 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Dec 30 14:50:56 2016 -0800 -- docs/sql-programming-guide.md | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20ae1172/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d57f22e..58de0e1 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -515,7 +515,7 @@ new data. ### Saving to Persistent Tables `DataFrames` can also be saved as persistent tables into Hive metastore using the `saveAsTable` -command. Notice existing Hive deployment is not necessary to use this feature. Spark will create a +command. Notice that an existing Hive deployment is not necessary to use this feature. Spark will create a default local Hive metastore (using Derby) for you. 
Unlike the `createOrReplaceTempView` command, `saveAsTable` will materialize the contents of the DataFrame and create a pointer to the data in the Hive metastore. Persistent tables will still exist even after your Spark program has restarted, as @@ -526,6 +526,18 @@ By default `saveAsTable` will create a "managed table", meaning that the locatio be controlled by the metastore. Managed tables will also have their data deleted automatically when a table is dropped. +Currently, `saveAsTable` does not expose an API supporting the creation of an "external table" from a `DataFrame`. +However, this functionality can be achieved by providing a `path` option to the `DataFrameWriter` with `path` as the key +and location of the external table as its value (a string) when saving the table with `saveAsTable`. When an External table +is dropped only its metadata is removed. + +Starting from Spark 2.1, persistent datasource tables have per-partition metadata stored in the Hive metastore. This brings several benefits: + +- Since the metastore can return only necessary partitions for a query, discovering all the partitions on the first query to the table is no longer needed. +- Hive DDLs such as `ALTER TABLE PARTITION ... SET LOCATION` are now available for tables created with the Datasource API. + +Note that partition information is not gathered by default when creating external datasource tables (those with a `path` option). To sync the partition information in the metastore, you can invoke `MSCK REPAIR TABLE`. + ## Parquet Files [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19016][SQL][DOC] Document scalable partition handling
Repository: spark Updated Branches: refs/heads/master b85e29437 -> 871f6114a [SPARK-19016][SQL][DOC] Document scalable partition handling ## What changes were proposed in this pull request? This PR documents the scalable partition handling feature in the body of the programming guide. Before this PR, we only mention it in the migration guide. It's not super clear that external datasource tables require an extra `MSCK REPAIR TABLE` command is to have per-partition information persisted since 2.1. ## How was this patch tested? N/A. Author: Cheng Lian <l...@databricks.com> Closes #16424 from liancheng/scalable-partition-handling-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/871f6114 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/871f6114 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/871f6114 Branch: refs/heads/master Commit: 871f6114ac0075a1b45eda8701113fa20d647de9 Parents: b85e294 Author: Cheng Lian <l...@databricks.com> Authored: Fri Dec 30 14:46:30 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Dec 30 14:46:30 2016 -0800 -- docs/sql-programming-guide.md | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/871f6114/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 6287e2b..4cd21ae 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -515,7 +515,7 @@ new data. ### Saving to Persistent Tables `DataFrames` can also be saved as persistent tables into Hive metastore using the `saveAsTable` -command. Notice existing Hive deployment is not necessary to use this feature. Spark will create a +command. Notice that an existing Hive deployment is not necessary to use this feature. Spark will create a default local Hive metastore (using Derby) for you. 
Unlike the `createOrReplaceTempView` command, `saveAsTable` will materialize the contents of the DataFrame and create a pointer to the data in the Hive metastore. Persistent tables will still exist even after your Spark program has restarted, as @@ -526,11 +526,18 @@ By default `saveAsTable` will create a "managed table", meaning that the locatio be controlled by the metastore. Managed tables will also have their data deleted automatically when a table is dropped. -Currently, `saveAsTable` does not expose an API supporting the creation of an "External table" from a `DataFrame`, -however, this functionality can be achieved by providing a `path` option to the `DataFrameWriter` with `path` as the key -and location of the external table as its value (String) when saving the table with `saveAsTable`. When an External table +Currently, `saveAsTable` does not expose an API supporting the creation of an "external table" from a `DataFrame`. +However, this functionality can be achieved by providing a `path` option to the `DataFrameWriter` with `path` as the key +and location of the external table as its value (a string) when saving the table with `saveAsTable`. When an External table is dropped only its metadata is removed. +Starting from Spark 2.1, persistent datasource tables have per-partition metadata stored in the Hive metastore. This brings several benefits: + +- Since the metastore can return only necessary partitions for a query, discovering all the partitions on the first query to the table is no longer needed. +- Hive DDLs such as `ALTER TABLE PARTITION ... SET LOCATION` are now available for tables created with the Datasource API. + +Note that partition information is not gathered by default when creating external datasource tables (those with a `path` option). To sync the partition information in the metastore, you can invoke `MSCK REPAIR TABLE`. 
+ ## Parquet Files [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18753][SQL] Keep pushed-down null literal as a filter in Spark-side post-filter for FileFormat datasources
Repository: spark Updated Branches: refs/heads/branch-2.1 16d4bd4a2 -> af12a21ca [SPARK-18753][SQL] Keep pushed-down null literal as a filter in Spark-side post-filter for FileFormat datasources ## What changes were proposed in this pull request? Currently, `FileSourceStrategy` does not handle the case when the pushed-down filter is `Literal(null)` and removes it at the post-filter in Spark-side. For example, the codes below: ```scala val df = Seq(Tuple1(Some(true)), Tuple1(None), Tuple1(Some(false))).toDF() df.filter($"_1" === "true").explain(true) ``` shows it keeps `null` properly. ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- LocalRelation [_1#17] == Analyzed Logical Plan == _1: boolean Filter (cast(_1#17 as double) = cast(true as double)) +- LocalRelation [_1#17] == Optimized Logical Plan == Filter (isnotnull(_1#17) && null) +- LocalRelation [_1#17] == Physical Plan == *Filter (isnotnull(_1#17) && null) << Here `null` is there +- LocalTableScan [_1#17] ``` However, when we read it back from Parquet, ```scala val path = "/tmp/testfile" df.write.parquet(path) spark.read.parquet(path).filter($"_1" === "true").explain(true) ``` `null` is removed at the post-filter. ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- Relation[_1#11] parquet == Analyzed Logical Plan == _1: boolean Filter (cast(_1#11 as double) = cast(true as double)) +- Relation[_1#11] parquet == Optimized Logical Plan == Filter (isnotnull(_1#11) && null) +- Relation[_1#11] parquet == Physical Plan == *Project [_1#11] +- *Filter isnotnull(_1#11) << Here `null` is missing +- *FileScan parquet [_1#11] Batched: true, Format: ParquetFormat, Location: InMemoryFileIndex[file:/tmp/testfile], PartitionFilters: [null], PushedFilters: [IsNotNull(_1)], ReadSchema: struct<_1:boolean> ``` This PR fixes it to keep it properly. 
In more details, ```scala val partitionKeyFilters = ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) ``` This keeps this `null` in `partitionKeyFilters` as `Literal` always don't have `children` and `references` is being empty which is always the subset of `partitionSet`. And then in ```scala val afterScanFilters = filterSet -- partitionKeyFilters ``` `null` is always removed from the post filter. So, if the referenced fields are empty, it should be applied into data columns too. After this PR, it becomes as below: ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- Relation[_1#276] parquet == Analyzed Logical Plan == _1: boolean Filter (cast(_1#276 as double) = cast(true as double)) +- Relation[_1#276] parquet == Optimized Logical Plan == Filter (isnotnull(_1#276) && null) +- Relation[_1#276] parquet == Physical Plan == *Project [_1#276] +- *Filter (isnotnull(_1#276) && null) +- *FileScan parquet [_1#276] Batched: true, Format: ParquetFormat, Location: InMemoryFileIndex[file:/private/var/folders/9j/gf_c342d7d150mwrxvkqnc18gn/T/spark-a5d59bdb-5b..., PartitionFilters: [null], PushedFilters: [IsNotNull(_1)], ReadSchema: struct<_1:boolean> ``` ## How was this patch tested? Unit test in `FileSourceStrategySuite` Author: hyukjinkwon <gurwls...@gmail.com> Closes #16184 from HyukjinKwon/SPARK-18753. 
(cherry picked from commit 89ae26dcdb73266fbc3a8b6da9f5dff30dc4ec95) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af12a21c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af12a21c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af12a21c Branch: refs/heads/branch-2.1 Commit: af12a21ca7145751acdec400134b1bd5c8168f74 Parents: 16d4bd4 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Wed Dec 14 11:29:11 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Dec 14 11:29:23 2016 -0800 -- .../sql/execution/datasources/FileSourceStrategy.scala | 2 +- .../execution/datasources/FileSourceStrategySuite.scala | 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af12a21c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 55ca4f1..ead3233 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSource
spark git commit: [SPARK-18753][SQL] Keep pushed-down null literal as a filter in Spark-side post-filter for FileFormat datasources
Repository: spark Updated Branches: refs/heads/master 169b9d73e -> 89ae26dcd [SPARK-18753][SQL] Keep pushed-down null literal as a filter in Spark-side post-filter for FileFormat datasources ## What changes were proposed in this pull request? Currently, `FileSourceStrategy` does not handle the case when the pushed-down filter is `Literal(null)` and removes it at the post-filter in Spark-side. For example, the codes below: ```scala val df = Seq(Tuple1(Some(true)), Tuple1(None), Tuple1(Some(false))).toDF() df.filter($"_1" === "true").explain(true) ``` shows it keeps `null` properly. ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- LocalRelation [_1#17] == Analyzed Logical Plan == _1: boolean Filter (cast(_1#17 as double) = cast(true as double)) +- LocalRelation [_1#17] == Optimized Logical Plan == Filter (isnotnull(_1#17) && null) +- LocalRelation [_1#17] == Physical Plan == *Filter (isnotnull(_1#17) && null) << Here `null` is there +- LocalTableScan [_1#17] ``` However, when we read it back from Parquet, ```scala val path = "/tmp/testfile" df.write.parquet(path) spark.read.parquet(path).filter($"_1" === "true").explain(true) ``` `null` is removed at the post-filter. ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- Relation[_1#11] parquet == Analyzed Logical Plan == _1: boolean Filter (cast(_1#11 as double) = cast(true as double)) +- Relation[_1#11] parquet == Optimized Logical Plan == Filter (isnotnull(_1#11) && null) +- Relation[_1#11] parquet == Physical Plan == *Project [_1#11] +- *Filter isnotnull(_1#11) << Here `null` is missing +- *FileScan parquet [_1#11] Batched: true, Format: ParquetFormat, Location: InMemoryFileIndex[file:/tmp/testfile], PartitionFilters: [null], PushedFilters: [IsNotNull(_1)], ReadSchema: struct<_1:boolean> ``` This PR fixes it to keep it properly. 
In more details, ```scala val partitionKeyFilters = ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) ``` This keeps this `null` in `partitionKeyFilters` as `Literal` always don't have `children` and `references` is being empty which is always the subset of `partitionSet`. And then in ```scala val afterScanFilters = filterSet -- partitionKeyFilters ``` `null` is always removed from the post filter. So, if the referenced fields are empty, it should be applied into data columns too. After this PR, it becomes as below: ``` == Parsed Logical Plan == 'Filter ('_1 = true) +- Relation[_1#276] parquet == Analyzed Logical Plan == _1: boolean Filter (cast(_1#276 as double) = cast(true as double)) +- Relation[_1#276] parquet == Optimized Logical Plan == Filter (isnotnull(_1#276) && null) +- Relation[_1#276] parquet == Physical Plan == *Project [_1#276] +- *Filter (isnotnull(_1#276) && null) +- *FileScan parquet [_1#276] Batched: true, Format: ParquetFormat, Location: InMemoryFileIndex[file:/private/var/folders/9j/gf_c342d7d150mwrxvkqnc18gn/T/spark-a5d59bdb-5b..., PartitionFilters: [null], PushedFilters: [IsNotNull(_1)], ReadSchema: struct<_1:boolean> ``` ## How was this patch tested? Unit test in `FileSourceStrategySuite` Author: hyukjinkwon <gurwls...@gmail.com> Closes #16184 from HyukjinKwon/SPARK-18753. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89ae26dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89ae26dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89ae26dc Branch: refs/heads/master Commit: 89ae26dcdb73266fbc3a8b6da9f5dff30dc4ec95 Parents: 169b9d7 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Wed Dec 14 11:29:11 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Dec 14 11:29:11 2016 -0800 -- .../sql/execution/datasources/FileSourceStrategy.scala | 2 +- .../execution/datasources/FileSourceStrategySuite.scala | 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/89ae26dc/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 55ca4f1..ead3233 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -86,7 +86,7 @@ object FileSourceStrategy extends Strategy with Logging { val dataFilters = normalizedFilters.fi
spark git commit: [SPARK-18251][SQL] the type of Dataset can't be Option of non-flat type
Repository: spark Updated Branches: refs/heads/branch-2.1 f542df310 -> 9e96ac5a9 [SPARK-18251][SQL] the type of Dataset can't be Option of non-flat type ## What changes were proposed in this pull request? For input object of non-flat type, we can't encode it to row if it's null, as Spark SQL doesn't allow the entire row to be null, only its columns can be null. That's the reason we forbid users to use top level null objects in https://github.com/apache/spark/pull/13469 However, if users wrap non-flat type with `Option`, then we may still encoder top level null object to row, which is not allowed. This PR fixes this case, and suggests users to wrap their type with `Tuple1` if they do wanna top level null objects. ## How was this patch tested? new test Author: Wenchen Fan <wenc...@databricks.com> Closes #15979 from cloud-fan/option. (cherry picked from commit f135b70fd590438bebb2a54012a6f73074219758) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e96ac5a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e96ac5a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e96ac5a Branch: refs/heads/branch-2.1 Commit: 9e96ac5a986c53ca1689e3d1f1365cc5107b5d88 Parents: f542df3 Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Nov 30 13:36:17 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Nov 30 13:54:37 2016 -0800 -- .../apache/spark/sql/catalyst/ScalaReflection.scala | 13 + .../sql/catalyst/encoders/ExpressionEncoder.scala | 14 -- .../scala/org/apache/spark/sql/DatasetSuite.scala | 13 +++-- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 2 +- 4 files changed, 37 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e96ac5a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7..0aa21b9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -606,6 +606,19 @@ object ScalaReflection extends ScalaReflection { } /** + * Returns true if the given type is option of product type, e.g. `Option[Tuple2]`. Note that, + * we also treat [[DefinedByConstructorParams]] as product type. + */ + def optionOfProductType(tpe: `Type`): Boolean = ScalaReflectionLock.synchronized { +tpe match { + case t if t <:< localTypeOf[Option[_]] => +val TypeRef(_, _, Seq(optType)) = t +definedByConstructorParams(optType) + case _ => false +} + } + + /** * Returns the parameter names and types for the primary constructor of this class. * * Note that it only works for scala classes with primary constructor, and currently doesn't http://git-wip-us.apache.org/repos/asf/spark/blob/9e96ac5a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 82e1a8a..9c4818d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -47,6 +47,16 @@ object ExpressionEncoder { // We convert the not-serializable TypeTag into StructType and ClassTag. 
val mirror = typeTag[T].mirror val tpe = typeTag[T].tpe + +if (ScalaReflection.optionOfProductType(tpe)) { + throw new UnsupportedOperationException( +"Cannot create encoder for Option of Product type, because Product type is represented " + + "as a row, and the entire row can not be null in Spark SQL like normal databases. " + + "You can wrap your type with Tuple1 if you do want top level null Product objects, " + + "e.g. instead of creating `Dataset[Option[MyClass]]`, you can do something like " + + "`val ds: Dataset[Tuple1[MyClass]] = Seq(Tuple1(MyClass(...)), Tuple1(null)).toDS`") +} + val cls = mirror.runtimeClass(tpe) val flat = !ScalaReflection.definedByConstructorParams(tpe) @@ -54,9 +64,9 @@ objec
spark git commit: [SPARK-18251][SQL] the type of Dataset can't be Option of non-flat type
Repository: spark Updated Branches: refs/heads/master 60022bfd6 -> f135b70fd [SPARK-18251][SQL] the type of Dataset can't be Option of non-flat type ## What changes were proposed in this pull request? For input object of non-flat type, we can't encode it to row if it's null, as Spark SQL doesn't allow the entire row to be null, only its columns can be null. That's the reason we forbid users to use top level null objects in https://github.com/apache/spark/pull/13469 However, if users wrap non-flat type with `Option`, then we may still encoder top level null object to row, which is not allowed. This PR fixes this case, and suggests users to wrap their type with `Tuple1` if they do wanna top level null objects. ## How was this patch tested? new test Author: Wenchen Fan <wenc...@databricks.com> Closes #15979 from cloud-fan/option. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f135b70f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f135b70f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f135b70f Branch: refs/heads/master Commit: f135b70fd590438bebb2a54012a6f73074219758 Parents: 60022bf Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Nov 30 13:36:17 2016 -0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Nov 30 13:36:17 2016 -0800 -- .../apache/spark/sql/catalyst/ScalaReflection.scala | 13 + .../sql/catalyst/encoders/ExpressionEncoder.scala | 14 -- .../scala/org/apache/spark/sql/DatasetSuite.scala | 13 +++-- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 2 +- 4 files changed, 37 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f135b70f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7..0aa21b9 100644 
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -606,6 +606,19 @@ object ScalaReflection extends ScalaReflection { } /** + * Returns true if the given type is option of product type, e.g. `Option[Tuple2]`. Note that, + * we also treat [[DefinedByConstructorParams]] as product type. + */ + def optionOfProductType(tpe: `Type`): Boolean = ScalaReflectionLock.synchronized { +tpe match { + case t if t <:< localTypeOf[Option[_]] => +val TypeRef(_, _, Seq(optType)) = t +definedByConstructorParams(optType) + case _ => false +} + } + + /** * Returns the parameter names and types for the primary constructor of this class. * * Note that it only works for scala classes with primary constructor, and currently doesn't http://git-wip-us.apache.org/repos/asf/spark/blob/f135b70f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 82e1a8a..9c4818d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -47,6 +47,16 @@ object ExpressionEncoder { // We convert the not-serializable TypeTag into StructType and ClassTag. val mirror = typeTag[T].mirror val tpe = typeTag[T].tpe + +if (ScalaReflection.optionOfProductType(tpe)) { + throw new UnsupportedOperationException( +"Cannot create encoder for Option of Product type, because Product type is represented " + + "as a row, and the entire row can not be null in Spark SQL like normal databases. " + + "You can wrap your type with Tuple1 if you do want top level null Product objects, " + + "e.g. 
instead of creating `Dataset[Option[MyClass]]`, you can do something like " + + "`val ds: Dataset[Tuple1[MyClass]] = Seq(Tuple1(MyClass(...)), Tuple1(null)).toDS`") +} + val cls = mirror.runtimeClass(tpe) val flat = !ScalaReflection.definedByConstructorParams(tpe) @@ -54,9 +64,9 @@ object ExpressionEncoder { val nullSafeInput = if (flat) { inputObject } else { - // For input object of non-flat type, we can't enco
spark git commit: [SPARK-18012][SQL] Simplify WriterContainer
Repository: spark Updated Branches: refs/heads/master 4b2011ec9 -> f313117bc [SPARK-18012][SQL] Simplify WriterContainer ## What changes were proposed in this pull request? This patch refactors WriterContainer to simplify the logic and make control flow more obvious. The previous code setup made it pretty difficult to track the actual dependencies on variables and setups because the driver side and the executor side were using the same set of variables. ## How was this patch tested? N/A - this should be covered by existing tests. Author: Reynold Xin <r...@databricks.com> Closes #15551 from rxin/writercontainer-refactor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f313117b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f313117b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f313117b Branch: refs/heads/master Commit: f313117bc93b0bf560528b316d3e6947caa96296 Parents: 4b2011e Author: Reynold Xin <r...@databricks.com> Authored: Wed Oct 19 22:22:35 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Oct 19 22:22:35 2016 -0700 -- .../InsertIntoHadoopFsRelationCommand.scala | 79 +-- .../sql/execution/datasources/WriteOutput.scala | 480 +++ .../execution/datasources/WriterContainer.scala | 445 - .../org/apache/spark/sql/internal/SQLConf.scala | 9 - 4 files changed, 492 insertions(+), 521 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f313117b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 99ca3df..22dbe71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -20,18 +20,12 @@ package org.apache.spark.sql.execution.datasources import java.io.IOException import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat -import org.apache.spark._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.RunnableCommand -import org.apache.spark.sql.internal.SQLConf /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. @@ -40,20 +34,6 @@ import org.apache.spark.sql.internal.SQLConf * implementation of [[HadoopFsRelation]] should use this UUID together with task id to generate * unique file path for each task output file. This UUID is passed to executor side via a * property named `spark.sql.sources.writeJobUUID`. - * - * Different writer containers, [[DefaultWriterContainer]] and [[DynamicPartitionWriterContainer]] - * are used to write to normal tables and tables with dynamic partitions. - * - * Basic work flow of this command is: - * - * 1. Driver side setup, including output committer initialization and data source specific - * preparation work for the write job to be issued. - * 2. Issues a write job consists of one or more executor side tasks, each of which writes all - * rows within an RDD partition. - * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any - * exception is thrown during task commitment, also aborts that task. - * 4. 
If all tasks are committed, commit the job, otherwise aborts the job; If any exception is - * thrown during job commitment, also aborts the job. */ case class InsertIntoHadoopFsRelationCommand( outputPath: Path, @@ -103,52 +83,17 @@ case class InsertIntoHadoopFsRelationCommand( val isAppend = pathExists && (mode == SaveMode.Append) if (doInsertion) { - val job = Job.getInstance(hadoopConf) - job.setOutputKeyClass(classOf[Void]) - job.setOutputValueClass(classOf[InternalRow]) - FileOutputFormat.setOutputPath(job, qualifiedOutputPath) - - val partitionSet = AttributeSet(partitionColumns) - val dataColumns = query.output.filterNot(partitionSet.contains
spark git commit: [SPARK-16516][SQL] Support for pushing down filters for decimal and timestamp types in ORC
Repository: spark Updated Branches: refs/heads/master 5de1737b0 -> 2cac3b2d4 [SPARK-16516][SQL] Support for pushing down filters for decimal and timestamp types in ORC ## What changes were proposed in this pull request? It seems ORC supports all the types in ([`PredicateLeaf.Type`](https://github.com/apache/hive/blob/e085b7e9bd059d91aaf013df0db4d71dca90ec6f/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java#L50-L56)) which includes timestamp type and decimal type. In more details, the types listed in [`SearchArgumentImpl.boxLiteral()`](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java#L1068-L1093) can be used as a filter value. FYI, the initial `case` clause for supported types was introduced in https://github.com/apache/spark/commit/65d71bd9fbfe6fe1b741c80fed72d6ae3d22b028 and this was not changed over time. At that time, the Hive version was 0.13, which supports only some types for filter-push down (See [SearchArgumentImpl.java#L945-L965](https://github.com/apache/hive/blob/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java#L945-L965) at 0.13). However, the version was upgraded into 1.2.x and now it supports more types (See [SearchArgumentImpl.java#L1068-L1093](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java#L1068-L1093) at 1.2.0) ## How was this patch tested? Unit tests in `OrcFilterSuite` and `OrcQuerySuite` Author: hyukjinkwon <gurwls...@gmail.com> Closes #14172 from HyukjinKwon/SPARK-16516. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cac3b2d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cac3b2d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cac3b2d Branch: refs/heads/master Commit: 2cac3b2d4a4a4f3d0d45af4defc23bb0ba53484b Parents: 5de1737 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Wed Sep 28 00:50:12 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Sep 28 00:50:12 2016 +0800 -- .../apache/spark/sql/hive/orc/OrcFilters.scala | 1 + .../spark/sql/hive/orc/OrcFilterSuite.scala | 62 +--- .../spark/sql/hive/orc/OrcQuerySuite.scala | 35 +++ 3 files changed, 89 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2cac3b2d/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index 6ab8244..d9efd0c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -84,6 +84,7 @@ private[orc] object OrcFilters extends Logging { // the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method. 
case ByteType | ShortType | FloatType | DoubleType => true case IntegerType | LongType | StringType | BooleanType => true + case TimestampType | _: DecimalType => true case _ => false } http://git-wip-us.apache.org/repos/asf/spark/blob/2cac3b2d/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala index 471192a..222c249 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala @@ -229,6 +229,59 @@ class OrcFilterSuite extends QueryTest with OrcTest { } } + test("filter pushdown - decimal") { +withOrcDataFrame((1 to 4).map(i => Tuple1.apply(BigDecimal.valueOf(i { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate( +Li
spark git commit: [SPARK-16777][SQL] Do not use deprecated listType API in ParquetSchemaConverter
Repository: spark Updated Branches: refs/heads/master 6a68c5d7b -> 5de1737b0 [SPARK-16777][SQL] Do not use deprecated listType API in ParquetSchemaConverter ## What changes were proposed in this pull request? This PR removes build warnings as below. ```scala [WARNING] .../spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala:448: method listType in object ConversionPatterns is deprecated: see corresponding Javadoc for more information. [WARNING] ConversionPatterns.listType( [WARNING]^ [WARNING] .../spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala:464: method listType in object ConversionPatterns is deprecated: see corresponding Javadoc for more information. [WARNING] ConversionPatterns.listType( [WARNING]^ ``` This should not use `listOfElements` (recommended to be replaced from `listType`) instead because the new method checks if the name of elements in Parquet's `LIST` is `element` in Parquet schema and throws an exception if not. However, it seems Spark prior to 1.4.x writes `ArrayType` with Parquet's `LIST` but with `array` as its element name. Therefore, this PR avoids using both `listOfElements` and `listType` but just uses the existing schema builder to construct the same `GroupType`. ## How was this patch tested? Existing tests should cover this. Author: hyukjinkwon <gurwls...@gmail.com> Closes #14399 from HyukjinKwon/SPARK-16777. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5de1737b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5de1737b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5de1737b Branch: refs/heads/master Commit: 5de1737b02710e36f6804d2ae243d1aeb30a0b32 Parents: 6a68c5d Author: hyukjinkwon <gurwls...@gmail.com> Authored: Wed Sep 28 00:39:47 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Sep 28 00:39:47 2016 +0800 -- .../parquet/ParquetSchemaConverter.scala| 26 +--- 1 file changed, 17 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5de1737b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala index c81a65f..b4f36ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -445,14 +445,20 @@ private[parquet] class ParquetSchemaConverter( // repeated array; // } // } -ConversionPatterns.listType( - repetition, - field.name, - Types + +// This should not use `listOfElements` here because this new method checks if the +// element name is `element` in the `GroupType` and throws an exception if not. +// As mentioned above, Spark prior to 1.4.x writes `ArrayType` as `LIST` but with +// `array` as its element name as below. Therefore, we build manually +// the correct group type here via the builder. 
(See SPARK-16777) +Types + .buildGroup(repetition).as(LIST) + .addField(Types .buildGroup(REPEATED) -// "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) +// "array" is the name chosen by parquet-hive (1.7.0 and prior version) .addField(convertField(StructField("array", elementType, nullable))) .named("bag")) + .named(field.name) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is @@ -461,11 +467,13 @@ private[parquet] class ParquetSchemaConverter( // group (LIST) { // repeated element; // } -ConversionPatterns.listType( - repetition, - field.name, + +// Here too, we should not use `listOfElements`. (See SPARK-16777) +Types + .buildGroup(repetition).as(LIST) // "array" is the name chosen by parquet-avro (1.7.0 and prior version) - convertField(StructField("array", elementType, nullable), REPEAT
spark git commit: [SQL][MINOR] correct the comment of SortBasedAggregationIterator.safeProj
Repository: spark Updated Branches: refs/heads/master 72d9fba26 -> 8a02410a9 [SQL][MINOR] correct the comment of SortBasedAggregationIterator.safeProj ## What changes were proposed in this pull request? This comment went stale a long time ago; this PR fixes it according to my understanding. ## How was this patch tested? N/A Author: Wenchen Fan <wenc...@databricks.com> Closes #15095 from cloud-fan/update-comment. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a02410a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a02410a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a02410a Branch: refs/heads/master Commit: 8a02410a92429bff50d6ce082f873cea9e9fa91e Parents: 72d9fba Author: Wenchen Fan <wenc...@databricks.com> Authored: Thu Sep 22 23:25:32 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Sep 22 23:25:32 2016 +0800 -- .../aggregate/SortBasedAggregationIterator.scala | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a02410a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala index 3f7f849..c2b1ef0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala @@ -86,8 +86,15 @@ class SortBasedAggregationIterator( // The aggregation buffer used by the sort-based aggregation. 
private[this] val sortBasedAggregationBuffer: MutableRow = newBuffer - // A SafeProjection to turn UnsafeRow into GenericInternalRow, because UnsafeRow can't be - // compared to MutableRow (aggregation buffer) directly. + // This safe projection is used to turn the input row into safe row. This is necessary + // because the input row may be produced by unsafe projection in child operator and all the + // produced rows share one byte array. However, when we update the aggregate buffer according to + // the input row, we may cache some values from input row, e.g. `Max` will keep the max value from + // input row via MutableProjection, `CollectList` will keep all values in an array via + // ImperativeAggregate framework. These values may get changed unexpectedly if the underlying + // unsafe projection update the shared byte array. By applying a safe projection to the input row, + // we can cut down the connection from input row to the shared byte array, and thus it's safe to + // cache values from input row while updating the aggregation buffer. private[this] val safeProj: Projection = FromUnsafeProjection(valueAttributes.map(_.dataType)) protected def initialize(): Unit = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17289][SQL] Fix a bug to satisfy sort requirements in partial aggregations
Repository: spark Updated Branches: refs/heads/master 8fb445d9b -> 94922d79e [SPARK-17289][SQL] Fix a bug to satisfy sort requirements in partial aggregations ## What changes were proposed in this pull request? Partial aggregations are generated in `EnsureRequirements`, but the planner fails to check if partial aggregation satisfies sort requirements. For the following query: ``` val df2 = (0 to 1000).map(x => (x % 2, x.toString)).toDF("a", "b").createOrReplaceTempView("t2") spark.sql("select max(b) from t2 group by a").explain(true) ``` Now, the SortAggregator won't insert Sort operator before partial aggregation, this will break sort-based partial aggregation. ``` == Physical Plan == SortAggregate(key=[a#5], functions=[max(b#6)], output=[max(b)#17]) +- *Sort [a#5 ASC], false, 0 +- Exchange hashpartitioning(a#5, 200) +- SortAggregate(key=[a#5], functions=[partial_max(b#6)], output=[a#5, max#19]) +- LocalTableScan [a#5, b#6] ``` Actually, a correct plan is: ``` == Physical Plan == SortAggregate(key=[a#5], functions=[max(b#6)], output=[max(b)#17]) +- *Sort [a#5 ASC], false, 0 +- Exchange hashpartitioning(a#5, 200) +- SortAggregate(key=[a#5], functions=[partial_max(b#6)], output=[a#5, max#19]) +- *Sort [a#5 ASC], false, 0 +- LocalTableScan [a#5, b#6] ``` ## How was this patch tested? Added tests in `PlannerSuite`. Author: Takeshi YAMAMURO <linguin@gmail.com> Closes #14865 from maropu/SPARK-17289. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/94922d79 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/94922d79 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/94922d79 Branch: refs/heads/master Commit: 94922d79e9f90fac3777db0974ccf7566b8ac3b3 Parents: 8fb445d Author: Takeshi YAMAMURO <linguin@gmail.com> Authored: Tue Aug 30 16:43:47 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Aug 30 16:43:47 2016 +0800 -- .../execution/exchange/EnsureRequirements.scala | 3 ++- .../spark/sql/execution/PlannerSuite.scala | 22 +++- 2 files changed, 23 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/94922d79/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index fee7010..66e99de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -164,7 +164,8 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // If an aggregation needs a shuffle and support partial aggregations, a map-side partial // aggregation and a shuffle are added as children. 
val (mergeAgg, mapSideAgg) = AggUtils.createMapMergeAggregatePair(operator) -(mergeAgg, createShuffleExchange(requiredChildDistributions.head, mapSideAgg) :: Nil) +(mergeAgg, createShuffleExchange( + requiredChildDistributions.head, ensureDistributionAndOrdering(mapSideAgg)) :: Nil) case _ => // Ensure that the operator's children satisfy their output distribution requirements: val childrenWithDist = operator.children.zip(requiredChildDistributions) http://git-wip-us.apache.org/repos/asf/spark/blob/94922d79/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 07efc72..b0aa337 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -18,12 +18,13 @@ package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{execution, DataFrame, Row} +import org.apache.spark.sql.{execution, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition} import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.aggregate.SortAggregateExec import org.apache.spark.sql.execution.columnar.InMemoryRelation
spark git commit: [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 0fb01496c -> b4047fc21 [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly Currently, Spark ignores path names starting with underscore `_` and `.`. This causes read-failures for the column-partitioned file data sources whose partition column names starts from '_', e.g. `_col`. **Before** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") org.apache.spark.sql.AnalysisException: Unable to infer schema for ParquetFormat at /tmp/parquet20. It must be specified manually; ``` **After** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") res2: org.apache.spark.sql.DataFrame = [id: bigint, _locality_code: int] ``` Pass the Jenkins with a new test case. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14585 from dongjoon-hyun/SPARK-16975-PARQUET. 
(cherry picked from commit abff92bfdc7d4c9d2308794f0350561fe0ceb4dd) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4047fc2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4047fc2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4047fc2 Branch: refs/heads/branch-2.0 Commit: b4047fc21cefcf6a43c1ee88af330a042f02bebc Parents: 0fb0149 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Aug 12 14:40:12 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Aug 12 14:52:50 2016 +0800 -- .../datasources/PartitioningAwareFileCatalog.scala | 2 +- .../sql/execution/datasources/fileSourceInterfaces.scala| 2 +- .../sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../execution/datasources/parquet/ParquetFileFormat.scala | 3 ++- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 + 5 files changed, 14 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 811e96c..cef9d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog( private def isDataPath(path: Path): Boolean = { val name = path.getName -!(name.startsWith("_") || name.startsWith(".")) +!((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 0b5a19f..438fccb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -363,7 +363,7 @@ private[sql] object HadoopFsRelation extends Logging { // We filter everything that starts with _ and ., except _common_metadata and _metadata // because Parquet needs to find those metadata files from leaf files returned by this method. // We should refactor this logic to not mix metadata files with data files. -(pathName.startsWith("_") || pathName.startsWith(".")) && +((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") } http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala
spark git commit: [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly
Repository: spark Updated Branches: refs/heads/master ccc6dc0f4 -> abff92bfd [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly ## What changes were proposed in this pull request? Currently, Spark ignores path names starting with underscore `_` and `.`. This causes read-failures for the column-partitioned file data sources whose partition column names starts from '_', e.g. `_col`. **Before** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") org.apache.spark.sql.AnalysisException: Unable to infer schema for ParquetFormat at /tmp/parquet20. It must be specified manually; ``` **After** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") res2: org.apache.spark.sql.DataFrame = [id: bigint, _locality_code: int] ``` ## How was this patch tested? Pass the Jenkins with a new test case. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14585 from dongjoon-hyun/SPARK-16975-PARQUET. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/abff92bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/abff92bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/abff92bf Branch: refs/heads/master Commit: abff92bfdc7d4c9d2308794f0350561fe0ceb4dd Parents: ccc6dc0 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Aug 12 14:40:12 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Aug 12 14:40:12 2016 +0800 -- .../datasources/PartitioningAwareFileCatalog.scala | 2 +- .../sql/execution/datasources/fileSourceInterfaces.scala| 2 +- .../sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../execution/datasources/parquet/ParquetFileFormat.scala | 3 ++- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 + 5 files changed, 14 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 811e96c..cef9d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog( private def isDataPath(path: Path): Boolean = { val name = path.getName -!(name.startsWith("_") || name.startsWith(".")) +!((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index f068779..e03a232 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -364,7 +364,7 @@ object HadoopFsRelation extends Logging { // We filter everything that starts with _ and ., except _common_metadata and _metadata // because Parquet needs to find those metadata files from leaf files returned by this method. // We should refactor this logic to not mix metadata files with data files. -(pathName.startsWith("_") || pathName.startsWith(".")) && +((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") } http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala -- di
spark git commit: [SPARK-16867][SQL] createTable and alterTable in ExternalCatalog should not take db
Repository: spark Updated Branches: refs/heads/master 27e815c31 -> 43f4fd6f9 [SPARK-16867][SQL] createTable and alterTable in ExternalCatalog should not take db ## What changes were proposed in this pull request? These 2 methods take `CatalogTable` as parameter, which already have the database information. ## How was this patch tested? existing test Author: Wenchen Fan <wenc...@databricks.com> Closes #14476 from cloud-fan/minor5. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43f4fd6f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43f4fd6f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43f4fd6f Branch: refs/heads/master Commit: 43f4fd6f9bfff749af17e3c65b53a33f5ecb0922 Parents: 27e815c Author: Wenchen Fan <wenc...@databricks.com> Authored: Thu Aug 4 16:48:30 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Aug 4 16:48:30 2016 +0800 -- .../sql/catalyst/catalog/ExternalCatalog.scala | 9 + .../sql/catalyst/catalog/InMemoryCatalog.scala | 7 +-- .../sql/catalyst/catalog/SessionCatalog.scala | 4 ++-- .../catalyst/catalog/ExternalCatalogSuite.scala | 20 ++-- .../spark/sql/hive/HiveExternalCatalog.scala| 17 + .../sql/hive/MetastoreDataSourcesSuite.scala| 2 +- 6 files changed, 28 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/43f4fd6f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 35fc6dd..27e1810 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -69,20 +69,21 @@ abstract class ExternalCatalog { // Tables // -- - def createTable(db: String, 
tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit + def createTable(tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit def dropTable(db: String, table: String, ignoreIfNotExists: Boolean, purge: Boolean): Unit def renameTable(db: String, oldName: String, newName: String): Unit /** - * Alter a table whose name that matches the one specified in `tableDefinition`, - * assuming the table exists. + * Alter a table whose database and name match the ones specified in `tableDefinition`, assuming + * the table exists. Note that, even though we can specify database in `tableDefinition`, it's + * used to identify the table, not to alter the table's database, which is not allowed. * * Note: If the underlying implementation does not support altering a certain field, * this becomes a no-op. */ - def alterTable(db: String, tableDefinition: CatalogTable): Unit + def alterTable(tableDefinition: CatalogTable): Unit def getTable(db: String, table: String): CatalogTable http://git-wip-us.apache.org/repos/asf/spark/blob/43f4fd6f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 67a90c8..9ebf7de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -192,9 +192,10 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E // -- override def createTable( - db: String, tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = synchronized { +assert(tableDefinition.identifier.database.isDefined) +val db = tableDefinition.identifier.database.get requireDbExists(db) val table = tableDefinition.identifier.table if (tableExists(db, table)) { @@ -266,7 +267,9 @@ class 
InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E catalog(db).tables.remove(oldName) } - override def alterTable(db: String, tableDefinition: CatalogTable): Unit = synchronized { + override def alterTable(tableDefinition: CatalogTable): Unit = synchronized { +assert(tableDefinition.identifier.database.i
spark git commit: [MINOR][SQL] Fix minor formatting issue of SortAggregateExec.toString
Repository: spark Updated Branches: refs/heads/master 583d91a19 -> 780c7224a [MINOR][SQL] Fix minor formatting issue of SortAggregateExec.toString ## What changes were proposed in this pull request? This PR fixes a minor formatting issue (missing space after comma) of `SortAggregateExec.toString`. Before: ``` SortAggregate(key=[a#76,b#77], functions=[max(c#78),min(c#78)], output=[a#76,b#77,max(c)#89,min(c)#90]) +- *Sort [a#76 ASC, b#77 ASC], false, 0 +- Exchange hashpartitioning(a#76, b#77, 200) +- SortAggregate(key=[a#76,b#77], functions=[partial_max(c#78),partial_min(c#78)], output=[a#76,b#77,max#99,min#100]) +- *Sort [a#76 ASC, b#77 ASC], false, 0 +- LocalTableScan , [a#76, b#77, c#78] ``` After: ``` SortAggregate(key=[a#76, b#77], functions=[max(c#78), min(c#78)], output=[a#76, b#77, max(c)#89, min(c)#90]) +- *Sort [a#76 ASC, b#77 ASC], false, 0 +- Exchange hashpartitioning(a#76, b#77, 200) +- SortAggregate(key=[a#76, b#77], functions=[partial_max(c#78), partial_min(c#78)], output=[a#76, b#77, max#99, min#100]) +- *Sort [a#76 ASC, b#77 ASC], false, 0 +- LocalTableScan , [a#76, b#77, c#78] ``` ## How was this patch tested? Manually tested. Author: Cheng Lian <l...@databricks.com> Closes #14480 from liancheng/fix-sort-based-agg-string-format. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/780c7224 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/780c7224 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/780c7224 Branch: refs/heads/master Commit: 780c7224a5b8dd3bf7838c6f280c61daeef1dcbc Parents: 583d91a Author: Cheng Lian <l...@databricks.com> Authored: Thu Aug 4 13:32:43 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Aug 4 13:32:43 2016 +0800 -- .../spark/sql/execution/aggregate/SortAggregateExec.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/780c7224/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala index 05dbacf..00e4525 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala @@ -111,9 +111,9 @@ case class SortAggregateExec( private def toString(verbose: Boolean): String = { val allAggregateExpressions = aggregateExpressions -val keyString = Utils.truncatedString(groupingExpressions, "[", ",", "]") -val functionString = Utils.truncatedString(allAggregateExpressions, "[", ",", "]") -val outputString = Utils.truncatedString(output, "[", ",", "]") +val keyString = Utils.truncatedString(groupingExpressions, "[", ", ", "]") +val functionString = Utils.truncatedString(allAggregateExpressions, "[", ", ", "]") +val outputString = Utils.truncatedString(output, "[", ", ", "]") if (verbose) { s"SortAggregate(key=$keyString, functions=$functionString, output=$outputString)" } else { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SQL][MINOR] use stricter type parameter to make it clear that parquet reader returns UnsafeRow
Repository: spark Updated Branches: refs/heads/master 386127377 -> ae226283e [SQL][MINOR] use stricter type parameter to make it clear that parquet reader returns UnsafeRow ## What changes were proposed in this pull request? a small code style change, it's better to make the type parameter more accurate. ## How was this patch tested? N/A Author: Wenchen Fan <wenc...@databricks.com> Closes #14458 from cloud-fan/parquet. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ae226283 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ae226283 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ae226283 Branch: refs/heads/master Commit: ae226283e19ce396216c73b0ae2470efa122b65b Parents: 3861273 Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Aug 3 08:23:26 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Aug 3 08:23:26 2016 +0800 -- .../execution/datasources/parquet/ParquetFileFormat.scala | 4 ++-- .../datasources/parquet/ParquetReadSupport.scala | 10 +- .../datasources/parquet/ParquetRecordMaterializer.scala | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ae226283/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 772e031..c3e75f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -370,11 +370,11 @@ private[sql] class ParquetFileFormat logDebug(s"Falling back to parquet-mr") val reader = pushed match { case Some(filter) => -new ParquetRecordReader[InternalRow]( +new 
ParquetRecordReader[UnsafeRow]( new ParquetReadSupport, FilterCompat.get(filter, null)) case _ => -new ParquetRecordReader[InternalRow](new ParquetReadSupport) +new ParquetRecordReader[UnsafeRow](new ParquetReadSupport) } reader.initialize(split, hadoopAttemptContext) reader http://git-wip-us.apache.org/repos/asf/spark/blob/ae226283/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 8a2e0d7..f1a35dd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -29,12 +29,12 @@ import org.apache.parquet.schema._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.types._ /** * A Parquet [[ReadSupport]] implementation for reading Parquet records as Catalyst - * [[InternalRow]]s. + * [[UnsafeRow]]s. * * The API interface of [[ReadSupport]] is a little bit over complicated because of historical * reasons. In older versions of parquet-mr (say 1.6.0rc3 and prior), [[ReadSupport]] need to be @@ -48,7 +48,7 @@ import org.apache.spark.sql.types._ * Due to this reason, we no longer rely on [[ReadContext]] to pass requested schema from [[init()]] * to [[prepareForRead()]], but use a private `var` for simplicity. 
*/ -private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with Logging { +private[parquet] class ParquetReadSupport extends ReadSupport[UnsafeRow] with Logging { private var catalystRequestedSchema: StructType = _ /** @@ -72,13 +72,13 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with /** * Called on executor side after [[init()]], before instantiating actual Parquet record readers. * Responsible for instantiating [[RecordMaterializer]], which is used for converting Parquet - * records to Catalyst [[InternalRow]]s. + * records to Catalyst [[UnsafeRow]]s. */ override def prepareForRead( conf: Configuration, keyValueMetaData: JM
[2/2] spark git commit: [SPARK-16621][SQL] Generate stable SQLs in SQLBuilder
[SPARK-16621][SQL] Generate stable SQLs in SQLBuilder ## What changes were proposed in this pull request? Currently, the generated SQLs have not-stable IDs for generated attributes. The stable generated SQL will give more benefit for understanding or testing the queries. This PR provides stable SQL generation by the followings. - Provide unique ids for generated subqueries, `gen_subquery_xxx`. - Provide unique and stable ids for generated attributes, `gen_attr_xxx`. **Before** ```scala scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res0: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res1: String = SELECT `gen_attr_4` AS `1` FROM (SELECT 1 AS `gen_attr_4`) AS gen_subquery_0 ``` **After** ```scala scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res1: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res2: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 ``` ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14257 from dongjoon-hyun/SPARK-16621. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b8e848b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b8e848b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b8e848b Branch: refs/heads/master Commit: 5b8e848bbfbc0c99a5faf758e40b188b0bbebb7b Parents: 738b4cc Author: Dongjoon Hyun <dongj...@apache.org> Authored: Wed Jul 27 13:23:59 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 27 13:23:59 2016 +0800 -- .../apache/spark/sql/catalyst/SQLBuilder.scala | 23 +++- sql/hive/src/test/resources/sqlgen/agg1.sql | 2 +- sql/hive/src/test/resources/sqlgen/agg2.sql | 2 +- sql/hive/src/test/resources/sqlgen/agg3.sql | 2 +- .../sqlgen/aggregate_functions_and_window.sql | 2 +- sql/hive/src/test/resources/sqlgen/case.sql | 2 +- .../test/resources/sqlgen/case_with_else.sql| 2 +- .../src/test/resources/sqlgen/case_with_key.sql | 2 +- .../resources/sqlgen/case_with_key_and_else.sql | 2 +- .../src/test/resources/sqlgen/cluster_by.sql| 2 +- .../sqlgen/data_source_json_parquet_t0.sql | 2 +- .../sqlgen/data_source_orc_parquet_t0.sql | 2 +- .../sqlgen/data_source_parquet_parquet_t0.sql | 2 +- .../resources/sqlgen/distinct_aggregation.sql | 2 +- .../src/test/resources/sqlgen/distribute_by.sql | 2 +- .../sqlgen/distribute_by_with_sort_by.sql | 2 +- sql/hive/src/test/resources/sqlgen/except.sql | 2 +- .../resources/sqlgen/filter_after_subquery.sql | 2 +- .../resources/sqlgen/generate_with_other_1.sql | 2 +- .../resources/sqlgen/generate_with_other_2.sql | 2 +- .../sqlgen/generator_in_lateral_view_1.sql | 2 +- .../sqlgen/generator_in_lateral_view_2.sql | 2 +- .../sqlgen/generator_non_referenced_table_1.sql | 2 +- .../sqlgen/generator_non_referenced_table_2.sql | 2 +- .../resources/sqlgen/generator_non_udtf_1.sql | 2 +- .../resources/sqlgen/generator_non_udtf_2.sql | 2 +- .../sqlgen/generator_referenced_table_1.sql | 2 +- .../sqlgen/generator_referenced_table_2.sql | 2 +- 
.../sqlgen/generator_with_ambiguous_names_1.sql | 2 +- .../sqlgen/generator_with_ambiguous_names_2.sql | 2 +- .../sqlgen/generator_without_from_1.sql | 2 +- .../sqlgen/generator_without_from_2.sql | 2 +- .../test/resources/sqlgen/grouping_sets_1.sql | 2 +- .../test/resources/sqlgen/grouping_sets_2_1.sql | 2 +- .../test/resources/sqlgen/grouping_sets_2_2.sql | 2 +- .../test/resources/sqlgen/grouping_sets_2_3.sql | 2 +- .../test/resources/sqlgen/grouping_sets_2_4.sql | 2 +- .../test/resources/sqlgen/grouping_sets_2_5.sql | 2 +- sql/hive/src/test/resources/sqlgen/in.sql | 2 +- .../src/test/resources/sqlgen/intersect.sql | 2 +- .../src/test/resources/sqlgen/join_2_tables.sql | 2 +- .../resources/sqlgen/json_tuple_generator_1.sql | 2 +- .../resources/sqlgen/json_tuple_generator_2.sql | 2 +- .../test/resources/sqlgen/multi_distinct.sql| 2 +- .../nested_generator_in_lateral_view_1.sql | 2 +- .../nested_generator_in_lateral_view_2.sql | 2 +- sql/hive/src/test/resources/sqlgen/not_in.sql | 2 +- sql/hive/src/test/resources/sqlgen/not_like.sql | 2 +- .../resources/sqlgen/predicate_subquery.sql | 2 +- .../sqlgen/regular_expressions_and_window.sql | 2 +- .../test/resources/sqlgen/rollup_cube_1_1.sql | 2 +- .../test/resources/sqlgen/rollup_cube_1
[1/2] spark git commit: [SPARK-16621][SQL] Generate stable SQLs in SQLBuilder
Repository: spark Updated Branches: refs/heads/master 738b4cc54 -> 5b8e848bb http://git-wip-us.apache.org/repos/asf/spark/blob/5b8e848b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql -- diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql index eebef6a..8bf1645 100644 --- a/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql @@ -2,4 +2,4 @@ SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 GROUP BY key % 5, key - 5 WITH CUBE -SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ((`gen_attr` - CAST(5 AS BIGINT))), ())) AS gen_subquery_1 +SELECT `gen_attr_3` AS `cnt`, `gen_attr_4` AS `k1`, `gen_attr_5` AS `k2`, `gen_attr_6` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr_3`, (`gen_attr_7` % CAST(5 AS BIGINT)) AS `gen_attr_4`, (`gen_attr_7` - CAST(5 AS BIGINT)) AS `gen_attr_5`, grouping_id() AS `gen_attr_6` FROM (SELECT `key` AS `gen_attr_7`, `value` AS `gen_attr_8` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr_7` % CAST(5 AS BIGINT)), (`gen_attr_7` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr_7` % CAST(5 AS BIGINT)), (`gen_attr_7` - CAST(5 AS BIGINT))), ((`gen_attr_7` % CAST(5 AS BIGINT))), ((`gen_attr_7` - CAST(5 AS BIGINT))), ())) AS gen_subquery_1 http://git-wip-us.apache.org/repos/asf/spark/blob/5b8e848b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql -- 
diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql index 9474233..17e78a0 100644 --- a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql @@ -3,4 +3,4 @@ SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - FROM (SELECT key, key%2, key - 5 FROM parquet_t1) t GROUP BY key%5, key-5 WITH ROLLUP -SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `k3` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(2 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 +SELECT `gen_attr_3` AS `cnt`, `gen_attr_4` AS `k1`, `gen_attr_5` AS `k2`, `gen_attr_6` AS `k3` FROM (SELECT count(1) AS `gen_attr_3`, (`gen_attr_7` % CAST(5 AS BIGINT)) AS `gen_attr_4`, (`gen_attr_7` - CAST(5 AS BIGINT)) AS `gen_attr_5`, grouping_id() AS `gen_attr_6` FROM (SELECT `gen_attr_7`, (`gen_attr_7` % CAST(2 AS BIGINT)) AS `gen_attr_8`, (`gen_attr_7` - CAST(5 AS BIGINT)) AS `gen_attr_9` FROM (SELECT `key` AS `gen_attr_7`, `value` AS `gen_attr_12` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY (`gen_attr_7` % CAST(5 AS BIGINT)), (`gen_attr_7` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr_7` % CAST(5 AS BIGINT)), (`gen_attr_7` - CAST(5 AS BIGINT))), ((`gen_attr_7` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 
http://git-wip-us.apache.org/repos/asf/spark/blob/5b8e848b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql -- diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql index d36f43d..72506ef 100644 --- a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql @@ -3,4 +3,4 @@ SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 WITH CUBE -SELECT
spark git commit: [SPARK-16663][SQL] desc table should be consistent between data source and hive serde tables
Repository: spark Updated Branches: refs/heads/master 4c9695598 -> a2abb583c [SPARK-16663][SQL] desc table should be consistent between data source and hive serde tables ## What changes were proposed in this pull request? Currently there are 2 inconsistence: 1. for data source table, we only print partition names, for hive table, we also print partition schema. After this PR, we will always print schema 2. if column doesn't have comment, data source table will print empty string, hive table will print null. After this PR, we will always print null ## How was this patch tested? new test in `HiveDDLSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #14302 from cloud-fan/minor3. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2abb583 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2abb583 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2abb583 Branch: refs/heads/master Commit: a2abb583caaec9a2cecd5d65b05d172fc096c125 Parents: 4c96955 Author: Wenchen Fan <wenc...@databricks.com> Authored: Tue Jul 26 18:46:12 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jul 26 18:46:12 2016 +0800 -- .../spark/sql/execution/command/tables.scala| 12 .../apache/spark/sql/sources/DDLTestSuite.scala | 30 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala| 2 +- .../spark/sql/hive/execution/HiveDDLSuite.scala | 30 +++- .../sql/hive/execution/HiveQuerySuite.scala | 4 +-- 5 files changed, 47 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2abb583/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index c6daa95..8263380 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -439,11 +439,12 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF private def describePartitionInfo(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { if (DDLUtils.isDatasourceTable(table)) { - val partCols = DDLUtils.getPartitionColumnsFromTableProperties(table) - if (partCols.nonEmpty) { + val userSpecifiedSchema = DDLUtils.getSchemaFromTableProperties(table) + val partColNames = DDLUtils.getPartitionColumnsFromTableProperties(table) + for (schema <- userSpecifiedSchema if partColNames.nonEmpty) { append(buffer, "# Partition Information", "", "") -append(buffer, s"# ${output.head.name}", "", "") -partCols.foreach(col => append(buffer, col, "", "")) +append(buffer, s"# ${output.head.name}", output(1).name, output(2).name) +describeSchema(StructType(partColNames.map(schema(_))), buffer) } } else { if (table.partitionColumns.nonEmpty) { @@ -525,8 +526,7 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF private def describeSchema(schema: StructType, buffer: ArrayBuffer[Row]): Unit = { schema.foreach { column => - val comment = column.getComment().getOrElse("") - append(buffer, column.name, column.dataType.simpleString, comment) + append(buffer, column.name, column.dataType.simpleString, column.getComment().orNull) } } http://git-wip-us.apache.org/repos/asf/spark/blob/a2abb583/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala index d0ad319..e535d4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala @@ -97,21 +97,21 @@ class DDLTestSuite extends DataSourceTest with SharedSQLContext { "describe ddlPeople", Seq( Row("intType", 
"int", "test comment test1"), -Row("stringType", "string", ""), -Row("dateType", "date", ""), -Row("timestampType", "timestamp", ""), -Row("doubleType", "double", ""), -Row("bigin
spark git commit: [SPARK-16706][SQL] support java map in encoder
Repository: spark Updated Branches: refs/heads/master 7b06a8948 -> 6959061f0 [SPARK-16706][SQL] support java map in encoder ## What changes were proposed in this pull request? finish the TODO, create a new expression `ExternalMapToCatalyst` to iterate the map directly. ## How was this patch tested? new test in `JavaDatasetSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #14344 from cloud-fan/java-map. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6959061f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6959061f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6959061f Branch: refs/heads/master Commit: 6959061f02b02afd4cef683b5eea0b7097eedee7 Parents: 7b06a89 Author: Wenchen Fan <wenc...@databricks.com> Authored: Tue Jul 26 15:33:05 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jul 26 15:33:05 2016 +0800 -- .../spark/sql/catalyst/JavaTypeInference.scala | 12 +- .../spark/sql/catalyst/ScalaReflection.scala| 34 ++-- .../catalyst/expressions/objects/objects.scala | 158 ++- .../encoders/ExpressionEncoderSuite.scala | 6 + .../org/apache/spark/sql/JavaDatasetSuite.java | 58 ++- 5 files changed, 236 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6959061f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index b3a233a..e6f61b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -395,10 +395,14 @@ object JavaTypeInference { toCatalystArray(inputObject, elementType(typeToken)) case _ if mapType.isAssignableFrom(typeToken) => - // TODO: for java map, if we get the keys and values 
by `keySet` and `values`, we can - // not guarantee they have same iteration order(which is different from scala map). - // A possible solution is creating a new `MapObjects` that can iterate a map directly. - throw new UnsupportedOperationException("map type is not supported currently") + val (keyType, valueType) = mapKeyValueType(typeToken) + ExternalMapToCatalyst( +inputObject, +ObjectType(keyType.getRawType), +serializerFor(_, keyType), +ObjectType(valueType.getRawType), +serializerFor(_, valueType) + ) case other => val properties = getJavaBeanProperties(other) http://git-wip-us.apache.org/repos/asf/spark/blob/6959061f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 8affb03..76f87f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -472,29 +472,17 @@ object ScalaReflection extends ScalaReflection { case t if t <:< localTypeOf[Map[_, _]] => val TypeRef(_, _, Seq(keyType, valueType)) = t - -val keys = - Invoke( -Invoke(inputObject, "keysIterator", - ObjectType(classOf[scala.collection.Iterator[_]])), -"toSeq", -ObjectType(classOf[scala.collection.Seq[_]])) -val convertedKeys = toCatalystArray(keys, keyType) - -val values = - Invoke( -Invoke(inputObject, "valuesIterator", - ObjectType(classOf[scala.collection.Iterator[_]])), -"toSeq", -ObjectType(classOf[scala.collection.Seq[_]])) -val convertedValues = toCatalystArray(values, valueType) - -val Schema(keyDataType, _) = schemaFor(keyType) -val Schema(valueDataType, valueNullable) = schemaFor(valueType) -NewInstance( - classOf[ArrayBasedMapData], - convertedKeys :: convertedValues :: Nil, - dataType = MapType(keyDataType, valueDataType, valueNullable)) +val keyClsName = 
getClassNameFromType(keyType) +val valueClsName = getClassNameFromType(valueType) +val keyPath = s"""- map key class: "$keyClsName&q
spark git commit: [SPARK-16698][SQL] Field names having dots should be allowed for datasources based on FileFormat
Repository: spark Updated Branches: refs/heads/branch-2.0 fcbb7f653 -> b52e639a8 [SPARK-16698][SQL] Field names having dots should be allowed for datasources based on FileFormat ## What changes were proposed in this pull request? It seems this is a regression assuming from https://issues.apache.org/jira/browse/SPARK-16698. Field name having dots throws an exception. For example the codes below: ```scala val path = "/tmp/path" val json =""" {"a.b":"data"}""" spark.sparkContext .parallelize(json :: Nil) .saveAsTextFile(path) spark.read.json(path).collect() ``` throws an exception as below: ``` Unable to resolve a.b given [a.b]; org.apache.spark.sql.AnalysisException: Unable to resolve a.b given [a.b]; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) at scala.Option.getOrElse(Option.scala:121) ``` This problem was introduced in https://github.com/apache/spark/commit/17eec0a71ba8713c559d641e3f43a1be726b037c#diff-27c76f96a7b2733ecfd6f46a1716e153R121 When extracting the data columns, it does not count that it can contains dots in field names. Actually, it seems the fields name are not expected as quoted when defining schema. So, It not have to consider whether this is wrapped with quotes because the actual schema (inferred or user-given schema) would not have the quotes for fields. For example, this throws an exception. 
(**Loading JSON from RDD is fine**) ```scala val json =""" {"a.b":"data"}""" val rdd = spark.sparkContext.parallelize(json :: Nil) spark.read.schema(StructType(Seq(StructField("`a.b`", StringType, true)))) .json(rdd).select("`a.b`").printSchema() ``` as below: ``` cannot resolve '```a.b```' given input columns: [`a.b`]; org.apache.spark.sql.AnalysisException: cannot resolve '```a.b```' given input columns: [`a.b`]; at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) ``` ## How was this patch tested? Unit tests in `FileSourceStrategySuite`. Author: hyukjinkwon <gurwls...@gmail.com> Closes #14339 from HyukjinKwon/SPARK-16698-regression. (cherry picked from commit 79826f3c7936ee27457d030c7115d5cac69befd7) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b52e639a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b52e639a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b52e639a Branch: refs/heads/branch-2.0 Commit: b52e639a84a851e0b9159a0f6dae92664425042e Parents: fcbb7f6 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Mon Jul 25 22:51:30 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 25 22:51:56 2016 +0800 -- .../sql/catalyst/plans/logical/LogicalPlan.scala | 2 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 15 +++ 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b52e639a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index d0b2b5d..6d77991 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -127,7 +127,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { */ def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = { schema.map { field => - resolveQuoted(field.name, resolver).map { + resolve(field.name :: Nil, resolver).map { case a: AttributeReference => a case other => sys.error(s"can not handle nested schema yet... plan $this") }.getOrElse { http://git-wip-us.apache.org/repos/asf/spark/blob/b52e639a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index f1a2410..be84dff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/c
spark git commit: [SPARK-16698][SQL] Field names having dots should be allowed for datasources based on FileFormat
Repository: spark Updated Branches: refs/heads/master d6a52176a -> 79826f3c7 [SPARK-16698][SQL] Field names having dots should be allowed for datasources based on FileFormat ## What changes were proposed in this pull request? It seems this is a regression assuming from https://issues.apache.org/jira/browse/SPARK-16698. Field name having dots throws an exception. For example the codes below: ```scala val path = "/tmp/path" val json =""" {"a.b":"data"}""" spark.sparkContext .parallelize(json :: Nil) .saveAsTextFile(path) spark.read.json(path).collect() ``` throws an exception as below: ``` Unable to resolve a.b given [a.b]; org.apache.spark.sql.AnalysisException: Unable to resolve a.b given [a.b]; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) at scala.Option.getOrElse(Option.scala:121) ``` This problem was introduced in https://github.com/apache/spark/commit/17eec0a71ba8713c559d641e3f43a1be726b037c#diff-27c76f96a7b2733ecfd6f46a1716e153R121 When extracting the data columns, it does not count that it can contains dots in field names. Actually, it seems the fields name are not expected as quoted when defining schema. So, It not have to consider whether this is wrapped with quotes because the actual schema (inferred or user-given schema) would not have the quotes for fields. For example, this throws an exception. 
(**Loading JSON from RDD is fine**) ```scala val json =""" {"a.b":"data"}""" val rdd = spark.sparkContext.parallelize(json :: Nil) spark.read.schema(StructType(Seq(StructField("`a.b`", StringType, true)))) .json(rdd).select("`a.b`").printSchema() ``` as below: ``` cannot resolve '```a.b```' given input columns: [`a.b`]; org.apache.spark.sql.AnalysisException: cannot resolve '```a.b```' given input columns: [`a.b`]; at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) ``` ## How was this patch tested? Unit tests in `FileSourceStrategySuite`. Author: hyukjinkwon <gurwls...@gmail.com> Closes #14339 from HyukjinKwon/SPARK-16698-regression. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79826f3c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79826f3c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79826f3c Branch: refs/heads/master Commit: 79826f3c7936ee27457d030c7115d5cac69befd7 Parents: d6a5217 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Mon Jul 25 22:51:30 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 25 22:51:30 2016 +0800 -- .../sql/catalyst/plans/logical/LogicalPlan.scala | 2 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 15 +++ 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79826f3c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index d0b2b5d..6d77991 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -127,7 +127,7 @@ abstract class LogicalPlan extends
QueryPlan[LogicalPlan] with Logging { */ def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = { schema.map { field => - resolveQuoted(field.name, resolver).map { + resolve(field.name :: Nil, resolver).map { case a: AttributeReference => a case other => sys.error(s"can not handle nested schema yet... plan $this") }.getOrElse { http://git-wip-us.apache.org/repos/asf/spark/blob/79826f3c/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index aa80d61..06cc2a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2982,4 +2982,19 @@ class SQLQuerySuite extends QueryTest with SharedSQLCont
spark git commit: [SPARK-16668][TEST] Test parquet reader for row groups containing both dictionary and plain encoded pages
Repository: spark Updated Branches: refs/heads/master 64529b186 -> d6a52176a [SPARK-16668][TEST] Test parquet reader for row groups containing both dictionary and plain encoded pages ## What changes were proposed in this pull request? This patch adds an explicit test for [SPARK-14217] by setting the parquet dictionary and page size the generated parquet file spans across 3 pages (within a single row group) where the first page is dictionary encoded and the remaining two are plain encoded. ## How was this patch tested? 1. ParquetEncodingSuite 2. Also manually tested that this test fails without https://github.com/apache/spark/pull/12279 Author: Sameer Agarwal <samee...@cs.berkeley.edu> Closes #14304 from sameeragarwal/hybrid-encoding-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d6a52176 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d6a52176 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d6a52176 Branch: refs/heads/master Commit: d6a52176ade92853f37167ad27631977dc79bc76 Parents: 64529b1 Author: Sameer Agarwal <samee...@cs.berkeley.edu> Authored: Mon Jul 25 22:31:01 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 25 22:31:01 2016 +0800 -- .../parquet/ParquetEncodingSuite.scala | 29 1 file changed, 29 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d6a52176/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 88fcfce..c754188 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -16,6 
+16,10 @@ */ package org.apache.spark.sql.execution.datasources.parquet +import scala.collection.JavaConverters._ + +import org.apache.parquet.hadoop.ParquetOutputFormat + import org.apache.spark.sql.test.SharedSQLContext // TODO: this needs a lot more testing but it's currently not easy to test with the parquet @@ -78,4 +82,29 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSQLContex }} } } + + test("Read row group containing both dictionary and plain encoded pages") { +withSQLConf(ParquetOutputFormat.DICTIONARY_PAGE_SIZE -> "2048", + ParquetOutputFormat.PAGE_SIZE -> "4096") { + withTempPath { dir => +// In order to explicitly test for SPARK-14217, we set the parquet dictionary and page size +// such that the following data spans across 3 pages (within a single row group) where the +// first page is dictionary encoded and the remaining two are plain encoded. +val data = (0 until 512).flatMap(i => Seq.fill(3)(i.toString)) +data.toDF("f").coalesce(1).write.parquet(dir.getCanonicalPath) +val file = SpecificParquetRecordReaderBase.listDirectory(dir).asScala.head + +val reader = new VectorizedParquetRecordReader +reader.initialize(file, null /* set columns to null to project all columns */) +val column = reader.resultBatch().column(0) +assert(reader.nextBatch()) + +(0 until 512).foreach { i => + assert(column.getUTF8String(3 * i).toString == i.toString) + assert(column.getUTF8String(3 * i + 1).toString == i.toString) + assert(column.getUTF8String(3 * i + 2).toString == i.toString) +} + } +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16691][SQL] move BucketSpec to catalyst module and use it in CatalogTable
Repository: spark Updated Branches: refs/heads/master d27d362eb -> 64529b186 [SPARK-16691][SQL] move BucketSpec to catalyst module and use it in CatalogTable ## What changes were proposed in this pull request? It's weird that we have `BucketSpec` to abstract bucket info, but don't use it in `CatalogTable`. This PR moves `BucketSpec` into catalyst module. ## How was this patch tested? existing tests. Author: Wenchen Fan <wenc...@databricks.com> Closes #14331 from cloud-fan/check. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/64529b18 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/64529b18 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/64529b18 Branch: refs/heads/master Commit: 64529b186a1c33740067cc7639d630bc5b9ae6e8 Parents: d27d362 Author: Wenchen Fan <wenc...@databricks.com> Authored: Mon Jul 25 22:05:48 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 25 22:05:48 2016 +0800 -- .../spark/sql/catalyst/catalog/interface.scala | 49 .../catalyst/catalog/ExternalCatalogSuite.scala | 2 +- .../org/apache/spark/sql/DataFrameWriter.scala | 5 +- .../spark/sql/execution/command/ddl.scala | 3 +- .../spark/sql/execution/command/tables.scala| 30 +- .../execution/datasources/BucketingUtils.scala | 39 + .../sql/execution/datasources/DataSource.scala | 1 + .../datasources/FileSourceStrategy.scala| 1 + .../InsertIntoHadoopFsRelationCommand.scala | 2 +- .../execution/datasources/WriterContainer.scala | 1 + .../sql/execution/datasources/bucket.scala | 59 .../spark/sql/execution/datasources/ddl.scala | 1 + .../datasources/fileSourceInterfaces.scala | 2 +- .../apache/spark/sql/internal/CatalogImpl.scala | 2 +- .../sql/execution/command/DDLCommandSuite.scala | 6 +- .../spark/sql/execution/command/DDLSuite.scala | 3 +- .../datasources/FileSourceStrategySuite.scala | 1 + .../spark/sql/internal/CatalogSuite.scala | 5 +- .../sql/sources/CreateTableAsSelectSuite.scala | 2 +- 
.../spark/sql/hive/client/HiveClientImpl.scala | 9 +-- .../spark/sql/hive/HiveDDLCommandSuite.scala| 8 +-- .../spark/sql/sources/BucketedReadSuite.scala | 3 +- 22 files changed, 117 insertions(+), 117 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/64529b18/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 2a20651..710bce5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} +import org.apache.spark.sql.catalyst.util.quoteIdentifier /** @@ -110,6 +111,24 @@ case class CatalogTablePartition( /** + * A container for bucketing information. + * Bucketing is a technology for decomposing data sets into more manageable parts, and the number + * of buckets is fixed so it does not fluctuate with data. + * + * @param numBuckets number of buckets. + * @param bucketColumnNames the names of the columns that used to generate the bucket id. + * @param sortColumnNames the names of the columns that used to sort data in each bucket. + */ +case class BucketSpec( +numBuckets: Int, +bucketColumnNames: Seq[String], +sortColumnNames: Seq[String]) { + if (numBuckets <= 0) { +throw new AnalysisException(s"Expected positive number of buckets, but got `$numBuckets`.") + } +} + +/** * A table defined in the catalog. * * Note that Hive's metastore also tracks skewed columns. 
We should consider adding that in the @@ -124,9 +143,7 @@ case class CatalogTable( storage: CatalogStorageFormat, schema: Seq[CatalogColumn], partitionColumnNames: Seq[String] = Seq.empty, -sortColumnNames: Seq[String] = Seq.empty, -bucketColumnNames: Seq[String] = Seq.empty, -numBuckets: Int = -1, +bucketSpec: Option[BucketSpec] = None, owner: String = "", createTime: Long = System.currentTimeMillis, lastAccessTime: Long = -1, @@ -143,8 +160,8 @@ case class CatalogTable( s"must be a subset of schema ($
spark git commit: [SPARK-16660][SQL] CreateViewCommand should not take CatalogTable
Repository: spark Updated Branches: refs/heads/master 7ffd99ec5 -> d27d362eb [SPARK-16660][SQL] CreateViewCommand should not take CatalogTable ## What changes were proposed in this pull request? `CreateViewCommand` only needs some information of a `CatalogTable`, but not all of them. We have some tricks(e.g. we need to check the table type is `VIEW`, we need to make `CatalogColumn.dataType` nullable) to allow it to take a `CatalogTable`. This PR cleans it up and only pass in necessary information to `CreateViewCommand`. ## How was this patch tested? existing tests. Author: Wenchen Fan <wenc...@databricks.com> Closes #14297 from cloud-fan/minor2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d27d362e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d27d362e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d27d362e Branch: refs/heads/master Commit: d27d362ebae0c4a5cc6c99f13ef20049214dd4f9 Parents: 7ffd99e Author: Wenchen Fan <wenc...@databricks.com> Authored: Mon Jul 25 22:02:00 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 25 22:02:00 2016 +0800 -- .../spark/sql/catalyst/catalog/interface.scala | 6 +- .../scala/org/apache/spark/sql/Dataset.scala| 27 ++--- .../spark/sql/execution/SparkSqlParser.scala| 51 - .../spark/sql/execution/command/views.scala | 111 ++- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 - .../spark/sql/hive/HiveDDLCommandSuite.scala| 46 +++- 6 files changed, 116 insertions(+), 127 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d27d362e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index b7f35b3..2a20651 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -81,9 +81,9 @@ object CatalogStorageFormat { */ case class CatalogColumn( name: String, -// This may be null when used to create views. TODO: make this type-safe; this is left -// as a string due to issues in converting Hive varchars to and from SparkSQL strings. -@Nullable dataType: String, +// TODO: make this type-safe; this is left as a string due to issues in converting Hive +// varchars to and from SparkSQL strings. +dataType: String, nullable: Boolean = true, comment: Option[String] = None) { http://git-wip-us.apache.org/repos/asf/spark/blob/d27d362e/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index b28ecb7..8b6443c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2421,13 +2421,7 @@ class Dataset[T] private[sql]( */ @throws[AnalysisException] def createTempView(viewName: String): Unit = withPlan { -val tableDesc = CatalogTable( - identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), - tableType = CatalogTableType.VIEW, - schema = Seq.empty[CatalogColumn], - storage = CatalogStorageFormat.empty) -CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = false, - isTemporary = true) +createViewCommand(viewName, replace = false) } /** @@ -2438,12 +2432,19 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def createOrReplaceTempView(viewName: String): Unit = withPlan { -val tableDesc = CatalogTable( - identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), - tableType = CatalogTableType.VIEW, - schema = Seq.empty[CatalogColumn], - storage = CatalogStorageFormat.empty) -CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = true, 
+createViewCommand(viewName, replace = true) + } + + private def createViewCommand(viewName: String, replace: Boolean): CreateViewCommand = { +CreateViewCommand( + name = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), + userSpecifiedColumns = Nil, + comment = None, + properties = Map.empty, + originalText = None, + child = logicalPlan, + allowExisting = false, + replace = replace, isTemporary = true)
spark git commit: [SPARK-16632][SQL] Revert PR #14272: Respect Hive schema when merging parquet schema
Repository: spark Updated Branches: refs/heads/branch-2.0 f9367d6a0 -> 933d76a22 [SPARK-16632][SQL] Revert PR #14272: Respect Hive schema when merging parquet schema ## What changes were proposed in this pull request? PR #14278 is a more general and simpler fix for SPARK-16632 than PR #14272. After merging #14278, we no longer need changes made in #14272. So here I revert them. This PR targets both master and branch-2.0. ## How was this patch tested? Existing tests. Author: Cheng Lian <l...@databricks.com> Closes #14300 from liancheng/revert-pr-14272. (cherry picked from commit 69626adddc0441a4834b70a32e2d95b11d69a219) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/933d76a2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/933d76a2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/933d76a2 Branch: refs/heads/branch-2.0 Commit: 933d76a2265176e0efee2d2b03ea53b235f2e175 Parents: f9367d6 Author: Cheng Lian <l...@databricks.com> Authored: Thu Jul 21 22:08:34 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 21 22:08:46 2016 +0800 -- .../parquet/ParquetReadSupport.scala| 18 - .../parquet/ParquetSchemaSuite.scala| 39 2 files changed, 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/933d76a2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 1628e4c..12f4974 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -26,8 +26,6 @@ import 
org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ -import org.apache.parquet.schema.OriginalType._ -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging @@ -118,12 +116,6 @@ private[parquet] object ParquetReadSupport { } private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { -val primName = if (parquetType.isPrimitive()) { - parquetType.asPrimitiveType().getPrimitiveTypeName() -} else { - null -} - catalystType match { case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => // Only clips array types with nested type as element type. @@ -138,16 +130,6 @@ private[parquet] object ParquetReadSupport { case t: StructType => clipParquetGroup(parquetType.asGroupType(), t) - case _: ByteType if primName == INT32 => -// SPARK-16632: Handle case where Hive stores bytes in a int32 field without specifying -// the original type. -Types.primitive(INT32, parquetType.getRepetition()).as(INT_8).named(parquetType.getName()) - - case _: ShortType if primName == INT32 => -// SPARK-16632: Handle case where Hive stores shorts in a int32 field without specifying -// the original type. -Types.primitive(INT32, parquetType.getRepetition()).as(INT_16).named(parquetType.getName()) - case _ => // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able // to be mapped to desired user-space types. So UDTs shouldn't participate schema merging. 
http://git-wip-us.apache.org/repos/asf/spark/blob/933d76a2/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 215c138..51bb236 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -1573,43 +1573,4 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin) - - testSchemaClipping( -"int
spark git commit: [SPARK-16632][SQL] Revert PR #14272: Respect Hive schema when merging parquet schema
Repository: spark Updated Branches: refs/heads/master 6203668d5 -> 69626addd [SPARK-16632][SQL] Revert PR #14272: Respect Hive schema when merging parquet schema ## What changes were proposed in this pull request? PR #14278 is a more general and simpler fix for SPARK-16632 than PR #14272. After merging #14278, we no longer need changes made in #14272. So here I revert them. This PR targets both master and branch-2.0. ## How was this patch tested? Existing tests. Author: Cheng Lian <l...@databricks.com> Closes #14300 from liancheng/revert-pr-14272. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69626add Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69626add Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69626add Branch: refs/heads/master Commit: 69626adddc0441a4834b70a32e2d95b11d69a219 Parents: 6203668 Author: Cheng Lian <l...@databricks.com> Authored: Thu Jul 21 22:08:34 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 21 22:08:34 2016 +0800 -- .../parquet/ParquetReadSupport.scala| 18 - .../parquet/ParquetSchemaSuite.scala| 39 2 files changed, 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69626add/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 0bee874..8a2e0d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -26,8 +26,6 @@ import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import 
org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ -import org.apache.parquet.schema.OriginalType._ -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging @@ -123,12 +121,6 @@ private[parquet] object ParquetReadSupport { } private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { -val primName = if (parquetType.isPrimitive()) { - parquetType.asPrimitiveType().getPrimitiveTypeName() -} else { - null -} - catalystType match { case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => // Only clips array types with nested type as element type. @@ -143,16 +135,6 @@ private[parquet] object ParquetReadSupport { case t: StructType => clipParquetGroup(parquetType.asGroupType(), t) - case _: ByteType if primName == INT32 => -// SPARK-16632: Handle case where Hive stores bytes in a int32 field without specifying -// the original type. -Types.primitive(INT32, parquetType.getRepetition()).as(INT_8).named(parquetType.getName()) - - case _: ShortType if primName == INT32 => -// SPARK-16632: Handle case where Hive stores shorts in a int32 field without specifying -// the original type. -Types.primitive(INT32, parquetType.getRepetition()).as(INT_16).named(parquetType.getName()) - case _ => // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able // to be mapped to desired user-space types. So UDTs shouldn't participate schema merging. 
http://git-wip-us.apache.org/repos/asf/spark/blob/69626add/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 31ebec0..8a980a7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -1581,43 +1581,4 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin) - - testSchemaClipping( -"int32 parquet field with byte schema field", - -parquetSchema = - """message root { -| opt
spark git commit: [SPARK-16632][SQL] Use Spark requested schema to guide vectorized Parquet reader initialization
Repository: spark Updated Branches: refs/heads/branch-2.0 c2b4228d7 -> f9367d6a0 [SPARK-16632][SQL] Use Spark requested schema to guide vectorized Parquet reader initialization In `SpecificParquetRecordReaderBase`, which is used by the vectorized Parquet reader, we convert the Parquet requested schema into a Spark schema to guide column reader initialization. However, the Parquet requested schema is tailored from the schema of the physical file being scanned, and may have inaccurate type information due to bugs of other systems (e.g. HIVE-14294). On the other hand, we already set the real Spark requested schema into Hadoop configuration in [`ParquetFileFormat`][1]. This PR simply reads out this schema to replace the converted one. New test case added in `ParquetQuerySuite`. [1]: https://github.com/apache/spark/blob/v2.0.0-rc5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala#L292-L294 Author: Cheng Lian <l...@databricks.com> Closes #14278 from liancheng/spark-16632-simpler-fix. 
(cherry picked from commit 8674054d3402b400a4766fe1c9214001cebf2106) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9367d6a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9367d6a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9367d6a Branch: refs/heads/branch-2.0 Commit: f9367d6a045ca171f86845b92c0def1d212a4fcc Parents: c2b4228 Author: Cheng Lian <l...@databricks.com> Authored: Thu Jul 21 17:15:07 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 21 17:18:53 2016 +0800 -- .../SpecificParquetRecordReaderBase.java| 5 +++- .../datasources/parquet/ParquetQuerySuite.scala | 24 2 files changed, 28 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f9367d6a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 1a25679..0d624d1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.util.ConfigurationUtil; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; /** * Base class for custom RecordReaders for Parquet that directly materialize to `T`. 
@@ -136,7 +137,9 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReaderhttp://git-wip-us.apache.org/repos/asf/spark/blob/f9367d6a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 83d1001..3201f8e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -668,6 +668,30 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-16632: read Parquet int32 as ByteType and ShortType") { +withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + withTempPath { dir => +val path = dir.getCanonicalPath + +// When being written to Parquet, `TINYINT` and `SMALLINT` should be converted into +// `int32 (INT_8)` and `int32 (INT_16)` respectively. However, Hive doesn't add the `INT_8` +// and `INT_16` annotation properly (HIVE-14294). Thus, when reading files written by Hive +// using Spark with the vectorized Parquet reader enabled, we may hit error due to type +// mismatch. +// +// Here we are simulating Hive's behavior by writing a single `INT` field and then read it +// back as `TINYINT` and `SMALLINT` in Spark to verify this issue. +Seq(1).toDF("f").write.parquet(path) + +val withByteF
spark git commit: [SPARK-16632][SQL] Use Spark requested schema to guide vectorized Parquet reader initialization
Repository: spark Updated Branches: refs/heads/master 864b764ea -> 8674054d3 [SPARK-16632][SQL] Use Spark requested schema to guide vectorized Parquet reader initialization ## What changes were proposed in this pull request? In `SpecificParquetRecordReaderBase`, which is used by the vectorized Parquet reader, we convert the Parquet requested schema into a Spark schema to guide column reader initialization. However, the Parquet requested schema is tailored from the schema of the physical file being scanned, and may have inaccurate type information due to bugs of other systems (e.g. HIVE-14294). On the other hand, we already set the real Spark requested schema into Hadoop configuration in [`ParquetFileFormat`][1]. This PR simply reads out this schema to replace the converted one. ## How was this patch tested? New test case added in `ParquetQuerySuite`. [1]: https://github.com/apache/spark/blob/v2.0.0-rc5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala#L292-L294 Author: Cheng Lian <l...@databricks.com> Closes #14278 from liancheng/spark-16632-simpler-fix. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8674054d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8674054d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8674054d Branch: refs/heads/master Commit: 8674054d3402b400a4766fe1c9214001cebf2106 Parents: 864b764 Author: Cheng Lian <l...@databricks.com> Authored: Thu Jul 21 17:15:07 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 21 17:15:07 2016 +0800 -- .../SpecificParquetRecordReaderBase.java| 5 +++- .../datasources/parquet/ParquetQuerySuite.scala | 24 2 files changed, 28 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8674054d/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index d823275..04752ec 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.util.ConfigurationUtil; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; /** * Base class for custom RecordReaders for Parquet that directly materialize to `T`. 
@@ -136,7 +137,9 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReaderhttp://git-wip-us.apache.org/repos/asf/spark/blob/8674054d/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 02b9445..7e83bcb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -680,6 +680,30 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext ) } } + + test("SPARK-16632: read Parquet int32 as ByteType and ShortType") { +withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + withTempPath { dir => +val path = dir.getCanonicalPath + +// When being written to Parquet, `TINYINT` and `SMALLINT` should be converted into +// `int32 (INT_8)` and `int32 (INT_16)` respectively. However, Hive doesn't add the `INT_8` +// and `INT_16` annotation properly (HIVE-14294). Thus, when reading files written by Hive +// using Spark with the vectorized Parquet reader enabled, we may hit error due to type +// mismatch. +// +// Here we are simulating Hive's behavior by writing a single `INT` field and then read it +// back as `TINYINT` and `SMALLINT` in Spark to verify this issue. +Seq(1).toDF("f").write.parquet(path) + +val withByteField = new StructType().add("f", Byte
spark git commit: [SPARK-16632][SQL] Respect Hive schema when merging parquet schema.
Repository: spark Updated Branches: refs/heads/branch-2.0 6f209c8fa -> c2b5b3ca5 [SPARK-16632][SQL] Respect Hive schema when merging parquet schema. When Hive (or at least certain versions of Hive) creates parquet files containing tinyint or smallint columns, it stores them as int32, but doesn't annotate the parquet field as containing the corresponding int8 / int16 data. When Spark reads those files using the vectorized reader, it follows the parquet schema for these fields, but when actually reading the data it tries to use the type fetched from the metastore, and then fails because data has been loaded into the wrong fields in OnHeapColumnVector. So instead of blindly trusting the parquet schema, check whether the Catalyst-provided schema disagrees with it, and adjust the types so that the necessary metadata is present when loading the data into the ColumnVector instance. Tested with unit tests and with tests that create byte / short columns in Hive and try to read them from Spark. Author: Marcelo Vanzin <van...@cloudera.com> Closes #14272 from vanzin/SPARK-16632. 
(cherry picked from commit 75146be6ba5e9f559f5f15430310bb476ee0812c) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2b5b3ca Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2b5b3ca Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2b5b3ca Branch: refs/heads/branch-2.0 Commit: c2b5b3ca538aaaef946653e60bd68e38c58dc41f Parents: 6f209c8 Author: Marcelo Vanzin <van...@cloudera.com> Authored: Wed Jul 20 13:00:22 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 20 13:49:45 2016 +0800 -- .../parquet/ParquetReadSupport.scala| 18 + .../parquet/ParquetSchemaSuite.scala| 39 2 files changed, 57 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2b5b3ca/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 12f4974..1628e4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -26,6 +26,8 @@ import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ +import org.apache.parquet.schema.OriginalType._ +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging @@ -116,6 +118,12 @@ private[parquet] object ParquetReadSupport { } private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { +val primName = if 
(parquetType.isPrimitive()) { + parquetType.asPrimitiveType().getPrimitiveTypeName() +} else { + null +} + catalystType match { case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => // Only clips array types with nested type as element type. @@ -130,6 +138,16 @@ private[parquet] object ParquetReadSupport { case t: StructType => clipParquetGroup(parquetType.asGroupType(), t) + case _: ByteType if primName == INT32 => +// SPARK-16632: Handle case where Hive stores bytes in a int32 field without specifying +// the original type. +Types.primitive(INT32, parquetType.getRepetition()).as(INT_8).named(parquetType.getName()) + + case _: ShortType if primName == INT32 => +// SPARK-16632: Handle case where Hive stores shorts in a int32 field without specifying +// the original type. +Types.primitive(INT32, parquetType.getRepetition()).as(INT_16).named(parquetType.getName()) + case _ => // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able // to be mapped to desired user-space types. So UDTs shouldn't participate schema merging. http://git-wip-us.apache.org/repos/asf/spark/blob/c2b5b3ca/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sq
spark git commit: [SPARK-16632][SQL] Respect Hive schema when merging parquet schema.
Repository: spark Updated Branches: refs/heads/master fc2326362 -> 75146be6b [SPARK-16632][SQL] Respect Hive schema when merging parquet schema. When Hive (or at least certain versions of Hive) creates parquet files containing tinyint or smallint columns, it stores them as int32, but doesn't annotate the parquet field as containing the corresponding int8 / int16 data. When Spark reads those files using the vectorized reader, it follows the parquet schema for these fields, but when actually reading the data it tries to use the type fetched from the metastore, and then fails because data has been loaded into the wrong fields in OnHeapColumnVector. So instead of blindly trusting the parquet schema, check whether the Catalyst-provided schema disagrees with it, and adjust the types so that the necessary metadata is present when loading the data into the ColumnVector instance. Tested with unit tests and with tests that create byte / short columns in Hive and try to read them from Spark. Author: Marcelo Vanzin <van...@cloudera.com> Closes #14272 from vanzin/SPARK-16632. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/75146be6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/75146be6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/75146be6 Branch: refs/heads/master Commit: 75146be6ba5e9f559f5f15430310bb476ee0812c Parents: fc23263 Author: Marcelo Vanzin <van...@cloudera.com> Authored: Wed Jul 20 13:00:22 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 20 13:00:22 2016 +0800 -- .../parquet/ParquetReadSupport.scala| 18 + .../parquet/ParquetSchemaSuite.scala| 39 2 files changed, 57 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/75146be6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index e6ef634..46d786d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -26,6 +26,8 @@ import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ +import org.apache.parquet.schema.OriginalType._ +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging @@ -120,6 +122,12 @@ private[parquet] object ParquetReadSupport { } private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { +val primName = if (parquetType.isPrimitive()) { + parquetType.asPrimitiveType().getPrimitiveTypeName() +} else { + null +} + catalystType match { 
case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => // Only clips array types with nested type as element type. @@ -134,6 +142,16 @@ private[parquet] object ParquetReadSupport { case t: StructType => clipParquetGroup(parquetType.asGroupType(), t) + case _: ByteType if primName == INT32 => +// SPARK-16632: Handle case where Hive stores bytes in a int32 field without specifying +// the original type. +Types.primitive(INT32, parquetType.getRepetition()).as(INT_8).named(parquetType.getName()) + + case _: ShortType if primName == INT32 => +// SPARK-16632: Handle case where Hive stores shorts in a int32 field without specifying +// the original type. +Types.primitive(INT32, parquetType.getRepetition()).as(INT_16).named(parquetType.getName()) + case _ => // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able // to be mapped to desired user-space types. So UDTs shouldn't participate schema merging. http://git-wip-us.apache.org/repos/asf/spark/blob/75146be6/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 8a980a7..31ebec0 10064
spark git commit: [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping
Repository: spark Updated Branches: refs/heads/branch-2.0 0a651aa26 -> 741801921 [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping ## What changes were proposed in this pull request? `SQLTestUtils.withTempDatabase` is a frequently used test harness to setup a temporary table and clean up finally. This issue improves like the following for usability. ```scala -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} ``` In case of forgetting to reset the databaes, `withTempDatabase` will not raise Exception. ## How was this patch tested? This improves test harness. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14184 from dongjoon-hyun/SPARK-16529. (cherry picked from commit c576f9fb90853cce2e8e5dcc32a536a0f49cbbd8) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74180192 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74180192 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74180192 Branch: refs/heads/branch-2.0 Commit: 7418019218e5a2bd4ae948bb1984816f161925cf Parents: 0a651aa Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Jul 15 00:51:11 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jul 15 00:51:56 2016 +0800 -- .../test/scala/org/apache/spark/sql/test/SQLTestUtils.scala | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74180192/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 853dd0f..26bd3fb 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -29,6 +29,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ @@ -196,7 +197,12 @@ private[sql] trait SQLTestUtils fail("Failed to create temporary database", cause) } -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping
Repository: spark Updated Branches: refs/heads/master 12005c88f -> c576f9fb9 [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping ## What changes were proposed in this pull request? `SQLTestUtils.withTempDatabase` is a frequently used test harness to setup a temporary table and clean up finally. This issue improves like the following for usability. ```scala -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} ``` In case of forgetting to reset the databaes, `withTempDatabase` will not raise Exception. ## How was this patch tested? This improves test harness. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14184 from dongjoon-hyun/SPARK-16529. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c576f9fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c576f9fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c576f9fb Branch: refs/heads/master Commit: c576f9fb90853cce2e8e5dcc32a536a0f49cbbd8 Parents: 12005c8 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Jul 15 00:51:11 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jul 15 00:51:11 2016 +0800 -- .../test/scala/org/apache/spark/sql/test/SQLTestUtils.scala | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c576f9fb/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 853dd0f..26bd3fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -29,6 +29,7 @@ 
import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ @@ -196,7 +197,12 @@ private[sql] trait SQLTestUtils fail("Failed to create temporary database", cause) } -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata
Repository: spark Updated Branches: refs/heads/master 39c836e97 -> db7317ac3 [SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata ## What changes were proposed in this pull request? `Alias` with metadata is not a no-op and we should not strip it in `RemoveAliasOnlyProject` rule. This PR also did some improvement for this rule: 1. extend the semantic of `alias-only`. Now we allow the project list to be partially aliased. 2. add unit test for this rule. ## How was this patch tested? new `RemoveAliasOnlyProjectSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #14106 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db7317ac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db7317ac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db7317ac Branch: refs/heads/master Commit: db7317ac3c2fd2a11088d10060f168178dc99664 Parents: 39c836e Author: Wenchen Fan <wenc...@databricks.com> Authored: Thu Jul 14 15:48:22 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 14 15:48:22 2016 +0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 49 - .../optimizer/RemoveAliasOnlyProjectSuite.scala | 77 2 files changed, 108 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db7317ac/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 08fb019..c8e9d8e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -165,36 +165,49 @@ object PushProjectThroughSample extends Rule[LogicalPlan] { * but can also benefit other operators. 
*/ object RemoveAliasOnlyProject extends Rule[LogicalPlan] { - // Check if projectList in the Project node has the same attribute names and ordering - // as its child node. + /** + * Returns true if the project list is semantically same as child output, after strip alias on + * attribute. + */ private def isAliasOnly( projectList: Seq[NamedExpression], childOutput: Seq[Attribute]): Boolean = { -if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != childOutput.length) { +if (projectList.length != childOutput.length) { false } else { - projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case (a, o) => -a.child match { - case attr: Attribute if a.name == attr.name && attr.semanticEquals(o) => true - case _ => false -} + stripAliasOnAttribute(projectList).zip(childOutput).forall { +case (a: Attribute, o) if a semanticEquals o => true +case _ => false } } } + private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = { +projectList.map { + // Alias with metadata can not be stripped, or the metadata will be lost. + // If the alias name is different from attribute name, we can't strip it either, or we may + // accidentally change the output schema name of the root plan. 
+ case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name => +attr + case other => other +} + } + def apply(plan: LogicalPlan): LogicalPlan = { -val aliasOnlyProject = plan.find { - case Project(pList, child) if isAliasOnly(pList, child.output) => true - case _ => false +val aliasOnlyProject = plan.collectFirst { + case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p } -aliasOnlyProject.map { case p: Project => - val aliases = p.projectList.map(_.asInstanceOf[Alias]) - val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child))) - plan.transformAllExpressions { -case a: Attribute if attrMap.contains(a) => attrMap(a) - }.transform { -case op: Project if op.eq(p) => op.child +aliasOnlyProject.map { case proj => + val attributesToReplace = proj.output.zip(proj.child.output).filterNot { +case (a1, a2) => a1 semanticEquals a2 + } + val attrMap = AttributeMap(attributesToReplace) + plan transform { +case plan: Project if plan eq proj => plan.child +case plan => plan transformExpressions { + case a: Attribute if attrMap.contains(a) => attrMap(a) +}
spark git commit: [SPARK-16343][SQL] Improve the PushDownPredicate rule to pushdown predicates correctly in non-deterministic condition.
Repository: spark Updated Branches: refs/heads/master ea06e4ef3 -> f376c3726 [SPARK-16343][SQL] Improve the PushDownPredicate rule to pushdown predicates correctly in non-deterministic condition. ## What changes were proposed in this pull request? Currently our Optimizer may reorder the predicates to run them more efficient, but in non-deterministic condition, change the order between deterministic parts and non-deterministic parts may change the number of input rows. For example: ```SELECT a FROM t WHERE rand() < 0.1 AND a = 1``` And ```SELECT a FROM t WHERE a = 1 AND rand() < 0.1``` may call rand() for different times and therefore the output rows differ. This PR improved this condition by checking whether the predicate is placed before any non-deterministic predicates. ## How was this patch tested? Expanded related testcases in FilterPushdownSuite. Author: 蒋星博 (Jiang Xingbo) <jiangxin...@meituan.com> Closes #14012 from jiangxb1987/ppd. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f376c372 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f376c372 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f376c372 Branch: refs/heads/master Commit: f376c37268848dbb4b2fb57677e22ef2bf207b49 Parents: ea06e4e Author: 蒋星博 (Jiang Xingbo) <jiangxin...@meituan.com> Authored: Thu Jul 14 00:21:27 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jul 14 00:21:27 2016 +0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 44 +--- .../optimizer/FilterPushdownSuite.scala | 8 ++-- 2 files changed, 33 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f376c372/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 368e9a5..08fb019 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1128,19 +1128,23 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) // Push [[Filter]] operators through [[Window]] operators. Parts of the predicate that can be -// pushed beneath must satisfy the following two conditions: +// pushed beneath must satisfy the following conditions: // 1. All the expressions are part of window partitioning key. The expressions can be compound. -// 2. Deterministic +// 2. Deterministic. +// 3. Placed before any non-deterministic predicates. case filter @ Filter(condition, w: Window) if w.partitionSpec.forall(_.isInstanceOf[AttributeReference]) => val partitionAttrs = AttributeSet(w.partitionSpec.flatMap(_.references)) - val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => -cond.references.subsetOf(partitionAttrs) && cond.deterministic && - // This is for ensuring all the partitioning expressions have been converted to alias - // in Analyzer. Thus, we do not need to check if the expressions in conditions are - // the same as the expressions used in partitioning columns. 
- partitionAttrs.forall(_.isInstanceOf[Attribute]) + + val (candidates, containingNonDeterministic) = +splitConjunctivePredicates(condition).span(_.deterministic) + + val (pushDown, rest) = candidates.partition { cond => +cond.references.subsetOf(partitionAttrs) } + + val stayUp = rest ++ containingNonDeterministic + if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) val newWindow = w.copy(child = Filter(pushDownPredicate, w.child)) @@ -1159,11 +1163,16 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // For each filter, expand the alias and check if the filter can be evaluated using // attributes produced by the aggregate operator's child operator. - val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => + val (candidates, containingNonDeterministic) = +splitConjunctivePredicates(condition).span(_.deterministic) + + val (pushDown, rest) = candidates.partition { cond => val replaced = replaceAlias(cond, aliasMap) -replaced.references.subsetOf(aggregate.child.outputSet) && replaced.deterministic +replaced.reference
spark git commit: [SPARK-16303][DOCS][EXAMPLES] Updated SQL programming guide and examples
Repository: spark Updated Branches: refs/heads/branch-2.0 41df62c59 -> 5173f847c [SPARK-16303][DOCS][EXAMPLES] Updated SQL programming guide and examples - Hard-coded Spark SQL sample snippets were moved into source files under examples sub-project. - Removed the inconsistency between Scala and Java Spark SQL examples - Scala and Java Spark SQL examples were updated The work is still in progress. All involved examples were tested manually. An additional round of testing will be done after the code review. ![image](https://cloud.githubusercontent.com/assets/6235869/16710314/51851606-462a-11e6-9fbe-0818daef65e4.png) Author: aokolnychyi <okolnychyyan...@gmail.com> Closes #14119 from aokolnychyi/spark_16303. (cherry picked from commit 772c213ec702c80d0f25aa6f30b2dffebfbe2d0d) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5173f847 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5173f847 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5173f847 Branch: refs/heads/branch-2.0 Commit: 5173f847c55a7b810d1c494c8b23c740ba110c39 Parents: 41df62c Author: aokolnychyi <okolnychyyan...@gmail.com> Authored: Wed Jul 13 16:12:05 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 13 16:12:51 2016 +0800 -- docs/sql-programming-guide.md | 572 ++- .../apache/spark/examples/sql/JavaSparkSQL.java | 186 -- .../spark/examples/sql/JavaSparkSqlExample.java | 336 +++ .../examples/sql/JavaSqlDataSourceExample.java | 217 +++ .../examples/sql/hive/JavaSparkHiveExample.java | 131 + .../spark/examples/sql/SparkSqlExample.scala| 254 .../examples/sql/SqlDataSourceExample.scala | 148 + .../spark/examples/sql/hive/HiveFromSpark.scala | 83 --- .../examples/sql/hive/SparkHiveExample.scala| 107 9 files changed, 1228 insertions(+), 806 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5173f847/docs/sql-programming-guide.md -- diff --git 
a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 448251c..f5d1fee 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -65,14 +65,14 @@ Throughout this document, we will often refer to Scala/Java Datasets of `Row`s a The entry point into all functionality in Spark is the [`SparkSession`](api/scala/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session scala/org/apache/spark/examples/sql/RDDRelation.scala %} +{% include_example init_session scala/org/apache/spark/examples/sql/SparkSqlExample.scala %} The entry point into all functionality in Spark is the [`SparkSession`](api/java/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSQL.java %} +{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSqlExample.java %} @@ -105,14 +105,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight scala %} -val spark: SparkSession // An existing SparkSession. -val df = spark.read.json("examples/src/main/resources/people.json") - -// Displays the content of the DataFrame to stdout -df.show() -{% endhighlight %} - +{% include_example create_df scala/org/apache/spark/examples/sql/SparkSqlExample.scala %} @@ -121,14 +114,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight java %} -SparkSession spark = ...; // An existing SparkSession. 
-Dataset df = spark.read().json("examples/src/main/resources/people.json"); - -// Displays the content of the DataFrame to stdout -df.show(); -{% endhighlight %} - +{% include_example create_df java/org/apache/spark/examples/sql/JavaSparkSqlExample.java %} @@ -169,110 +155,20 @@ Here we include some basic examples of structured data processing using Datasets -{% highlight scala %} -val spark: SparkSession // An existing SparkSession - -// Create the DataFrame -val df = spark.read.json("examples/src/main/resources/people.json") - -// Show the content of the DataFrame -df.show() -// age name -// null Michael -// 30 Andy -// 19 Justin - -// Print the schema in a tree format -df.printSchema() -// root -// |-- age: long (nullable = true) -// |-- name: str
spark git commit: [SPARK-16303][DOCS][EXAMPLES] Updated SQL programming guide and examples
Repository: spark Updated Branches: refs/heads/master 1c58fa905 -> 772c213ec [SPARK-16303][DOCS][EXAMPLES] Updated SQL programming guide and examples - Hard-coded Spark SQL sample snippets were moved into source files under examples sub-project. - Removed the inconsistency between Scala and Java Spark SQL examples - Scala and Java Spark SQL examples were updated The work is still in progress. All involved examples were tested manually. An additional round of testing will be done after the code review. ![image](https://cloud.githubusercontent.com/assets/6235869/16710314/51851606-462a-11e6-9fbe-0818daef65e4.png) Author: aokolnychyi <okolnychyyan...@gmail.com> Closes #14119 from aokolnychyi/spark_16303. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/772c213e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/772c213e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/772c213e Branch: refs/heads/master Commit: 772c213ec702c80d0f25aa6f30b2dffebfbe2d0d Parents: 1c58fa9 Author: aokolnychyi <okolnychyyan...@gmail.com> Authored: Wed Jul 13 16:12:05 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 13 16:12:11 2016 +0800 -- docs/sql-programming-guide.md | 572 ++- .../apache/spark/examples/sql/JavaSparkSQL.java | 186 -- .../spark/examples/sql/JavaSparkSqlExample.java | 336 +++ .../examples/sql/JavaSqlDataSourceExample.java | 217 +++ .../examples/sql/hive/JavaSparkHiveExample.java | 131 + .../spark/examples/sql/SparkSqlExample.scala| 254 .../examples/sql/SqlDataSourceExample.scala | 148 + .../spark/examples/sql/hive/HiveFromSpark.scala | 83 --- .../examples/sql/hive/SparkHiveExample.scala| 107 9 files changed, 1228 insertions(+), 806 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/772c213e/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index e838a13..2076b29 100644 --- 
a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -65,14 +65,14 @@ Throughout this document, we will often refer to Scala/Java Datasets of `Row`s a The entry point into all functionality in Spark is the [`SparkSession`](api/scala/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session scala/org/apache/spark/examples/sql/RDDRelation.scala %} +{% include_example init_session scala/org/apache/spark/examples/sql/SparkSqlExample.scala %} The entry point into all functionality in Spark is the [`SparkSession`](api/java/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSQL.java %} +{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSqlExample.java %} @@ -105,14 +105,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight scala %} -val spark: SparkSession // An existing SparkSession. -val df = spark.read.json("examples/src/main/resources/people.json") - -// Displays the content of the DataFrame to stdout -df.show() -{% endhighlight %} - +{% include_example create_df scala/org/apache/spark/examples/sql/SparkSqlExample.scala %} @@ -121,14 +114,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight java %} -SparkSession spark = ...; // An existing SparkSession. 
-Dataset df = spark.read().json("examples/src/main/resources/people.json"); - -// Displays the content of the DataFrame to stdout -df.show(); -{% endhighlight %} - +{% include_example create_df java/org/apache/spark/examples/sql/JavaSparkSqlExample.java %} @@ -169,110 +155,20 @@ Here we include some basic examples of structured data processing using Datasets -{% highlight scala %} -val spark: SparkSession // An existing SparkSession - -// Create the DataFrame -val df = spark.read.json("examples/src/main/resources/people.json") - -// Show the content of the DataFrame -df.show() -// age name -// null Michael -// 30 Andy -// 19 Justin - -// Print the schema in a tree format -df.printSchema() -// root -// |-- age: long (nullable = true) -// |-- name: string (nullable = true) - -// Select only the "name" column -df.select("name").show() -// name -// Michael -// Andy -
spark git commit: [SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding
Repository: spark Updated Branches: refs/heads/branch-2.0 aa8cbcd19 -> 7e4ba66d9 [SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding https://issues.apache.org/jira/browse/SPARK-16381 ## What changes were proposed in this pull request? Update SQL examples and programming guide for R language binding. Here I just follow example https://github.com/apache/spark/compare/master...liancheng:example-snippet-extraction, created a separate R file to store all the example code. ## How was this patch tested? Manual test on my local machine. Screenshot as below: ![screen shot 2016-07-06 at 4 52 25 pm](https://cloud.githubusercontent.com/assets/3925641/16638180/13925a58-439a-11e6-8d57-8451a63dcae9.png) Author: Xin Ren <iamsh...@126.com> Closes #14082 from keypointt/SPARK-16381. (cherry picked from commit 9cb1eb7af779e74165552977002158a7dad9bb09) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e4ba66d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e4ba66d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e4ba66d Branch: refs/heads/branch-2.0 Commit: 7e4ba66d938a8bf312e991dfa034d420a0b7b360 Parents: aa8cbcd Author: Xin Ren <iamsh...@126.com> Authored: Mon Jul 11 20:05:28 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 11 20:05:41 2016 +0800 -- docs/sql-programming-guide.md | 155 ++ examples/src/main/r/RSparkSQLExample.R | 197 examples/src/main/r/dataframe.R| 2 +- examples/src/main/r/ml.R | 2 +- 4 files changed, 212 insertions(+), 144 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e4ba66d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 68419e1..448251c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -86,9 +86,7 @@ The entry point into all functionality in Spark is 
the [`SparkSession`](api/pyth The entry point into all functionality in Spark is the [`SparkSession`](api/R/sparkR.session.html) class. To initialize a basic `SparkSession`, just call `sparkR.session()`: -{% highlight r %} -sparkR.session() -{% endhighlight %} +{% include_example init_session r/RSparkSQLExample.R %} Note that when invoked for the first time, `sparkR.session()` initializes a global `SparkSession` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SparkSession` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SparkSession` instance around. @@ -155,12 +153,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight r %} -df <- read.json("examples/src/main/resources/people.json") - -# Displays the content of the DataFrame -showDF(df) -{% endhighlight %} +{% include_example create_DataFrames r/RSparkSQLExample.R %} @@ -343,50 +336,8 @@ In addition to simple column references and expressions, DataFrames also have a -{% highlight r %} -# Create the DataFrame -df <- read.json("examples/src/main/resources/people.json") - -# Show the content of the DataFrame -showDF(df) -## age name -## null Michael -## 30 Andy -## 19 Justin - -# Print the schema in a tree format -printSchema(df) -## root -## |-- age: long (nullable = true) -## |-- name: string (nullable = true) -# Select only the "name" column -showDF(select(df, "name")) -## name -## Michael -## Andy -## Justin - -# Select everybody, but increment the age by 1 -showDF(select(df, df$name, df$age + 1)) -## name(age + 1) -## Michael null -## Andy31 -## Justin 20 - -# Select people older than 21 -showDF(where(df, df$age > 21)) -## age name -## 30 Andy - -# Count people by age -showDF(count(groupBy(df, "age"))) -## age count -## 
null 1 -## 19 1 -## 30 1 - -{% endhighlight %} +{% include_example dataframe_operations r/RSparkSQLExample.R %} For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). @@ -429,12 +380,10 @@ df = spark.sql("SELECT * FROM table") The `sql` function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. -{% highlight r %} -df <- sql("SELECT * FROM table") -{% endhighlight
spark git commit: [SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding
Repository: spark Updated Branches: refs/heads/master e22627894 -> 9cb1eb7af [SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding https://issues.apache.org/jira/browse/SPARK-16381 ## What changes were proposed in this pull request? Update SQL examples and programming guide for R language binding. Here I just follow example https://github.com/apache/spark/compare/master...liancheng:example-snippet-extraction, created a separate R file to store all the example code. ## How was this patch tested? Manual test on my local machine. Screenshot as below: ![screen shot 2016-07-06 at 4 52 25 pm](https://cloud.githubusercontent.com/assets/3925641/16638180/13925a58-439a-11e6-8d57-8451a63dcae9.png) Author: Xin Ren <iamsh...@126.com> Closes #14082 from keypointt/SPARK-16381. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9cb1eb7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9cb1eb7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9cb1eb7a Branch: refs/heads/master Commit: 9cb1eb7af779e74165552977002158a7dad9bb09 Parents: e226278 Author: Xin Ren <iamsh...@126.com> Authored: Mon Jul 11 20:05:28 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jul 11 20:05:28 2016 +0800 -- docs/sql-programming-guide.md | 155 ++ examples/src/main/r/RSparkSQLExample.R | 197 examples/src/main/r/dataframe.R| 2 +- examples/src/main/r/ml.R | 2 +- 4 files changed, 212 insertions(+), 144 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9cb1eb7a/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 68419e1..448251c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -86,9 +86,7 @@ The entry point into all functionality in Spark is the [`SparkSession`](api/pyth The entry point into all functionality in Spark is the 
[`SparkSession`](api/R/sparkR.session.html) class. To initialize a basic `SparkSession`, just call `sparkR.session()`: -{% highlight r %} -sparkR.session() -{% endhighlight %} +{% include_example init_session r/RSparkSQLExample.R %} Note that when invoked for the first time, `sparkR.session()` initializes a global `SparkSession` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SparkSession` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SparkSession` instance around. @@ -155,12 +153,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight r %} -df <- read.json("examples/src/main/resources/people.json") - -# Displays the content of the DataFrame -showDF(df) -{% endhighlight %} +{% include_example create_DataFrames r/RSparkSQLExample.R %} @@ -343,50 +336,8 @@ In addition to simple column references and expressions, DataFrames also have a -{% highlight r %} -# Create the DataFrame -df <- read.json("examples/src/main/resources/people.json") - -# Show the content of the DataFrame -showDF(df) -## age name -## null Michael -## 30 Andy -## 19 Justin - -# Print the schema in a tree format -printSchema(df) -## root -## |-- age: long (nullable = true) -## |-- name: string (nullable = true) -# Select only the "name" column -showDF(select(df, "name")) -## name -## Michael -## Andy -## Justin - -# Select everybody, but increment the age by 1 -showDF(select(df, df$name, df$age + 1)) -## name(age + 1) -## Michael null -## Andy31 -## Justin 20 - -# Select people older than 21 -showDF(where(df, df$age > 21)) -## age name -## 30 Andy - -# Count people by age -showDF(count(groupBy(df, "age"))) -## age count -## null 1 -## 19 1 -## 30 1 - -{% endhighlight %} +{% include_example dataframe_operations 
r/RSparkSQLExample.R %} For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). @@ -429,12 +380,10 @@ df = spark.sql("SELECT * FROM table") The `sql` function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. -{% highlight r %} -df <- sql("SELECT * FROM table") -{% endhighlight %} - +{% include_example sql_query r/RSparkSQLExample.R %} + ## Creating Datasets @@ -888,10 +837,7 @@ df.select("name&qu
spark git commit: [SPARK-16388][SQL] Remove spark.sql.nativeView and spark.sql.nativeView.canonical config
Repository: spark Updated Branches: refs/heads/master 5497242c7 -> 7e28fabdf [SPARK-16388][SQL] Remove spark.sql.nativeView and spark.sql.nativeView.canonical config ## What changes were proposed in this pull request? These two configs should always be true after Spark 2.0. This patch removes them from the config list. Note that ideally this should've gone into branch-2.0, but due to the timing of the release we should only merge this in master for Spark 2.1. ## How was this patch tested? Updated test cases. Author: Reynold Xin <r...@databricks.com> Closes #14061 from rxin/SPARK-16388. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e28fabd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e28fabd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e28fabd Branch: refs/heads/master Commit: 7e28fabdff2da1cc374efbf43372d92ae0cd07aa Parents: 5497242 Author: Reynold Xin <r...@databricks.com> Authored: Wed Jul 6 17:40:55 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jul 6 17:40:55 2016 +0800 -- .../spark/sql/execution/command/views.scala | 39 +--- .../org/apache/spark/sql/internal/SQLConf.scala | 23 --- .../spark/sql/internal/SQLConfSuite.scala | 16 +- .../spark/sql/hive/execution/SQLViewSuite.scala | 206 --- 4 files changed, 106 insertions(+), 178 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e28fabd/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 088f684..007fa46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -149,37 +149,18 @@ case class CreateViewCommand( * SQL based on the analyzed plan, and also creates the proper 
schema for the view. */ private def prepareTable(sparkSession: SparkSession, analyzedPlan: LogicalPlan): CatalogTable = { -val viewSQL: String = - if (sparkSession.sessionState.conf.canonicalView) { -val logicalPlan = - if (tableDesc.schema.isEmpty) { -analyzedPlan - } else { -val projectList = analyzedPlan.output.zip(tableDesc.schema).map { - case (attr, col) => Alias(attr, col.name)() -} -sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed - } -new SQLBuilder(logicalPlan).toSQL - } else { -// When user specified column names for view, we should create a project to do the renaming. -// When no column name specified, we still need to create a project to declare the columns -// we need, to make us more robust to top level `*`s. -val viewOutput = { - val columnNames = analyzedPlan.output.map(f => quote(f.name)) - if (tableDesc.schema.isEmpty) { -columnNames.mkString(", ") - } else { -columnNames.zip(tableDesc.schema.map(f => quote(f.name))).map { - case (name, alias) => s"$name AS $alias" -}.mkString(", ") +val viewSQL: String = { + val logicalPlan = +if (tableDesc.schema.isEmpty) { + analyzedPlan +} else { + val projectList = analyzedPlan.output.zip(tableDesc.schema).map { +case (attr, col) => Alias(attr, col.name)() } + sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed } - -val viewText = tableDesc.viewText.get -val viewName = quote(tableDesc.identifier.table) -s"SELECT $viewOutput FROM ($viewText) $viewName" - } + new SQLBuilder(logicalPlan).toSQL +} // Validate the view SQL - make sure we can parse it and analyze it. // If we cannot analyze the generated query, there is probably a bug in SQL generation. 
http://git-wip-us.apache.org/repos/asf/spark/blob/7e28fabd/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1a9bb6a..5ab0c1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -258,25 +258,6 @@ object SQLConf { .booleanC
spark git commit: [SPARK-9876][SQL][FOLLOWUP] Enable string and binary tests for Parquet predicate pushdown and replace deprecated fromByteArray.
Repository: spark Updated Branches: refs/heads/master 7f7eb3934 -> 07d9c5327 [SPARK-9876][SQL][FOLLOWUP] Enable string and binary tests for Parquet predicate pushdown and replace deprecated fromByteArray. ## What changes were proposed in this pull request? It seems Parquet has been upgraded to 1.8.1 by https://github.com/apache/spark/pull/13280. So, this PR enables string and binary predicate push down which was disabled due to [SPARK-11153](https://issues.apache.org/jira/browse/SPARK-11153) and [PARQUET-251](https://issues.apache.org/jira/browse/PARQUET-251) and cleans up some comments unremoved (I think by mistake). This PR also replace the API, `fromByteArray()` deprecated in [PARQUET-251](https://issues.apache.org/jira/browse/PARQUET-251). ## How was this patch tested? Unit tests in `ParquetFilters` Author: hyukjinkwon <gurwls...@gmail.com> Closes #13389 from HyukjinKwon/parquet-1.8-followup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/07d9c532 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/07d9c532 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/07d9c532 Branch: refs/heads/master Commit: 07d9c5327f050f9da611d5239f61ed73b36ce4e6 Parents: 7f7eb39 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Tue Jul 5 16:59:40 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jul 5 16:59:40 2016 +0800 -- .../datasources/parquet/VectorizedPlainValuesReader.java | 2 +- .../datasources/parquet/CatalystWriteSupport.scala | 11 ++- .../execution/datasources/parquet/ParquetFilters.scala | 8 .../datasources/parquet/ParquetFilterSuite.scala | 6 ++ 4 files changed, 9 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/07d9c532/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java -- diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java index 9def455..98018b7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java @@ -170,7 +170,7 @@ public class VectorizedPlainValuesReader extends ValuesReader implements Vectori @Override public final Binary readBinary(int len) { -Binary result = Binary.fromByteArray(buffer, offset - Platform.BYTE_ARRAY_OFFSET, len); +Binary result = Binary.fromConstantByteArray(buffer, offset - Platform.BYTE_ARRAY_OFFSET, len); offset += len; return result; } http://git-wip-us.apache.org/repos/asf/spark/blob/07d9c532/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala index cf974af..00e1bca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala @@ -150,7 +150,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case StringType => (row: SpecializedGetters, ordinal: Int) => - recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) + recordConsumer.addBinary( +Binary.fromReusedByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => (row: SpecializedGetters, ordinal: Int) => { @@ -165,12 +166,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val 
(julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) - recordConsumer.addBinary(Binary.fromByteArray(timestampBuffer)) + recordConsumer.addBinary(Binary.fromReusedByteArray(timestampBuffer)) } case BinaryType => (row: SpecializedGetters, ordinal: Int) => - recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) + recordConsumer.addBinary(Binary.fr
spark git commit: [SPARK-16360][SQL] Speed up SQL query performance by removing redundant `executePlan` call
Repository: spark Updated Branches: refs/heads/master 7742d9f15 -> 7f7eb3934 [SPARK-16360][SQL] Speed up SQL query performance by removing redundant `executePlan` call ## What changes were proposed in this pull request? Currently, there are a few reports about Spark 2.0 query performance regression for large queries. This PR speeds up SQL query processing performance by removing redundant **consecutive `executePlan`** call in `Dataset.ofRows` function and `Dataset` instantiation. Specifically, this PR aims to reduce the overhead of SQL query execution plan generation, not real query execution. So, we can not see the result in the Spark Web UI. Please use the following query script. The result is **25.78 sec** -> **12.36 sec** as expected. **Sample Query** ```scala val n = 4000 val values = (1 to n).map(_.toString).mkString(", ") val columns = (1 to n).map("column" + _).mkString(", ") val query = s""" |SELECT $columns |FROM VALUES ($values) T($columns) |WHERE 1=2 AND 1 IN ($columns) |GROUP BY $columns |ORDER BY $columns |""".stripMargin def time[R](block: => R): R = { val t0 = System.nanoTime() val result = block println("Elapsed time: " + ((System.nanoTime - t0) / 1e9) + "s") result } ``` **Before** ```scala scala> time(sql(query)) Elapsed time: 30.138142577s // First query has a little overhead of initialization. res0: org.apache.spark.sql.DataFrame = [column1: int, column2: int ... 3998 more fields] scala> time(sql(query)) Elapsed time: 25.787751452s // Let's compare this one. res1: org.apache.spark.sql.DataFrame = [column1: int, column2: int ... 3998 more fields] ``` **After** ```scala scala> time(sql(query)) Elapsed time: 17.500279659s // First query has a little overhead of initialization. res0: org.apache.spark.sql.DataFrame = [column1: int, column2: int ... 3998 more fields] scala> time(sql(query)) Elapsed time: 12.364812255s // This shows the real difference. The speed up is about 2 times. 
res1: org.apache.spark.sql.DataFrame = [column1: int, column2: int ... 3998 more fields] ``` ## How was this patch tested? Manual by the above script. Author: Dongjoon Hyun <dongj...@apache.org> Closes #14044 from dongjoon-hyun/SPARK-16360. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f7eb393 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f7eb393 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f7eb393 Branch: refs/heads/master Commit: 7f7eb3934ea258f2b163a87da06766bf5c7d443d Parents: 7742d9f Author: Dongjoon Hyun <dongj...@apache.org> Authored: Tue Jul 5 16:19:22 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jul 5 16:19:22 2016 +0800 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f7eb393/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index e64669a..ededf7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -62,7 +62,7 @@ private[sql] object Dataset { def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { val qe = sparkSession.sessionState.executePlan(logicalPlan) qe.assertAnalyzed() -new Dataset[Row](sparkSession, logicalPlan, RowEncoder(qe.analyzed.schema)) +new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15198][SQL] Support for pushing down filters for boolean types in ORC data source
Repository: spark Updated Branches: refs/heads/master 8f6cf00c6 -> 7742d9f15 [SPARK-15198][SQL] Support for pushing down filters for boolean types in ORC data source ## What changes were proposed in this pull request? It seems ORC supports all the types in ([`PredicateLeaf.Type`](https://github.com/apache/hive/blob/e085b7e9bd059d91aaf013df0db4d71dca90ec6f/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java#L50-L56)) which includes boolean types. So, this was tested first. This PR adds the support for pushing filters down for `BooleanType` in ORC data source. This PR also removes `OrcTableScan` class and the companion object, which is not used anymore. ## How was this patch tested? Unittest in `OrcFilterSuite` and `OrcQuerySuite`. Author: hyukjinkwon <gurwls...@gmail.com> Closes #12972 from HyukjinKwon/SPARK-15198. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7742d9f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7742d9f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7742d9f1 Branch: refs/heads/master Commit: 7742d9f1584150befeb2f3d76cdbd4ea1f37c914 Parents: 8f6cf00 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Tue Jul 5 13:59:13 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jul 5 13:59:13 2016 +0800 -- .../spark/sql/hive/orc/OrcFileFormat.scala | 10 .../apache/spark/sql/hive/orc/OrcFilters.scala | 2 +- .../spark/sql/hive/orc/OrcFilterSuite.scala | 25 .../spark/sql/hive/orc/OrcQuerySuite.scala | 13 ++ 4 files changed, 39 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 5de3507..1d3c466 100644 --- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -111,7 +111,7 @@ private[sql] class OrcFileFormat if (sparkSession.sessionState.conf.orcFilterPushDown) { // Sets pushed predicates OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f => -hadoopConf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo) +hadoopConf.set(OrcRelation.SARG_PUSHDOWN, f.toKryo) hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true) } } @@ -258,15 +258,13 @@ private[orc] class OrcOutputWriter( } } -private[orc] object OrcTableScan { - // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is unfortunately not public. - private[orc] val SARG_PUSHDOWN = "sarg.pushdown" -} - private[orc] object OrcRelation extends HiveInspectors { // The references of Hive's classes will be minimized. val ORC_COMPRESSION = "orc.compress" + // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is unfortunately not public. + private[orc] val SARG_PUSHDOWN = "sarg.pushdown" + // The extensions for ORC compression codecs val extensionsForCompressionCodecNames = Map( "NONE" -> "", http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index c463bc8..6ab8244 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -83,7 +83,7 @@ private[orc] object OrcFilters extends Logging { // Only the values in the Spark types below can be recognized by // the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method. 
case ByteType | ShortType | FloatType | DoubleType => true - case IntegerType | LongType | StringType => true + case IntegerType | LongType | StringType | BooleanType => true case _ => false } http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala index 8c027f9..7a30e54 100644 -
spark git commit: [SPARK-16208][SQL] Add `PropagateEmptyRelation` optimizer
Repository: spark Updated Branches: refs/heads/master 0ad6ce7e5 -> c55397652 [SPARK-16208][SQL] Add `PropagateEmptyRelation` optimizer ## What changes were proposed in this pull request? This PR adds a new logical optimizer, `PropagateEmptyRelation`, to collapse a logical plans consisting of only empty LocalRelations. **Optimizer Targets** 1. Binary(or Higher)-node Logical Plans - Union with all empty children. - Join with one or two empty children (including Intersect/Except). 2. Unary-node Logical Plans - Project/Filter/Sample/Join/Limit/Repartition with all empty children. - Aggregate with all empty children and without AggregateFunction expressions, COUNT. - Generate with Explode because other UserDefinedGenerators like Hive UDTF returns results. **Sample Query** ```sql WITH t1 AS (SELECT a FROM VALUES 1 t(a)), t2 AS (SELECT b FROM VALUES 1 t(b) WHERE 1=2) SELECT a,b FROM t1, t2 WHERE a=b GROUP BY a,b HAVING a>1 ORDER BY a,b ``` **Before** ```scala scala> sql("with t1 as (select a from values 1 t(a)), t2 as (select b from values 1 t(b) where 1=2) select a,b from t1, t2 where a=b group by a,b having a>1 order by a,b").explain == Physical Plan == *Sort [a#0 ASC, b#1 ASC], true, 0 +- Exchange rangepartitioning(a#0 ASC, b#1 ASC, 200) +- *HashAggregate(keys=[a#0, b#1], functions=[]) +- Exchange hashpartitioning(a#0, b#1, 200) +- *HashAggregate(keys=[a#0, b#1], functions=[]) +- *BroadcastHashJoin [a#0], [b#1], Inner, BuildRight :- *Filter (isnotnull(a#0) && (a#0 > 1)) : +- LocalTableScan [a#0] +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint))) +- *Filter (isnotnull(b#1) && (b#1 > 1)) +- LocalTableScan , [b#1] ``` **After** ```scala scala> sql("with t1 as (select a from values 1 t(a)), t2 as (select b from values 1 t(b) where 1=2) select a,b from t1, t2 where a=b group by a,b having a>1 order by a,b").explain == Physical Plan == LocalTableScan , [a#0, b#1] ``` ## How was this patch tested? 
Pass the Jenkins tests (including a new testsuite). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13906 from dongjoon-hyun/SPARK-16208. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5539765 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5539765 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5539765 Branch: refs/heads/master Commit: c55397652ad1c6d047a8b8eb7fd92a8a1dc66306 Parents: 0ad6ce7 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Jul 1 22:13:56 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jul 1 22:13:56 2016 +0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 3 +- .../optimizer/PropagateEmptyRelation.scala | 78 + .../optimizer/PropagateEmptyRelationSuite.scala | 162 +++ 3 files changed, 242 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5539765/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 842d6bc..9ee1735 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -113,7 +113,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) Batch("Typed Filter Optimization", fixedPoint, CombineTypedFilters) :: Batch("LocalRelation", fixedPoint, - ConvertToLocalRelation) :: + ConvertToLocalRelation, + PropagateEmptyRelation) :: Batch("OptimizeCodegen", Once, OptimizeCodegen(conf)) :: Batch("RewriteSubquery", Once, http://git-wip-us.apache.org/repos/asf/spark/blob/c5539765/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala new file mode 100644 index 000..50076b1 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foun
spark git commit: [SPARK-15820][PYSPARK][SQL] Add Catalog.refreshTable into python API
Repository: spark Updated Branches: refs/heads/branch-2.0 94d61de9c -> 80a7bff89 [SPARK-15820][PYSPARK][SQL] Add Catalog.refreshTable into python API ## What changes were proposed in this pull request? Add Catalog.refreshTable API into python interface for Spark-SQL. ## How was this patch tested? Existing test. Author: WeichenXu <weichenxu...@outlook.com> Closes #13558 from WeichenXu123/update_python_sql_interface_refreshTable. (cherry picked from commit 5344bade8efb6f12aa43fbfbbbc2e3c0c7d16d98) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a7bff8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a7bff8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a7bff8 Branch: refs/heads/branch-2.0 Commit: 80a7bff897554ce77fe6bc91d62cff8857892322 Parents: 94d61de Author: WeichenXu <weichenxu...@outlook.com> Authored: Thu Jun 30 23:00:39 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jul 1 10:18:44 2016 +0800 -- python/pyspark/sql/catalog.py | 5 + .../src/main/scala/org/apache/spark/sql/catalog/Catalog.scala | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80a7bff8/python/pyspark/sql/catalog.py -- diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 3033f14..4af930a 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -232,6 +232,11 @@ class Catalog(object): """Removes all cached tables from the in-memory cache.""" self._jcatalog.clearCache() +@since(2.0) +def refreshTable(self, tableName): +"""Invalidate and refresh all the cached metadata of the given table.""" +self._jcatalog.refreshTable(tableName) + def _reset(self): """(Internal use only) Drop all existing databases (except "default"), tables, partitions and functions, and set the current database to "default". 
http://git-wip-us.apache.org/repos/asf/spark/blob/80a7bff8/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 083a63c..91ed9b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -214,7 +214,7 @@ abstract class Catalog { def clearCache(): Unit /** - * Invalidate and refresh all the cached the metadata of the given table. For performance reasons, + * Invalidate and refresh all the cached metadata of the given table. For performance reasons, * Spark SQL or the external data source library it uses might cache certain metadata about a * table, such as the location of blocks. When those change outside of Spark SQL, users should * call this function to invalidate the cache. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15820][PYSPARK][SQL] Add Catalog.refreshTable into python API
Repository: spark Updated Branches: refs/heads/master 5320adc86 -> 5344bade8 [SPARK-15820][PYSPARK][SQL] Add Catalog.refreshTable into python API ## What changes were proposed in this pull request? Add Catalog.refreshTable API into python interface for Spark-SQL. ## How was this patch tested? Existing test. Author: WeichenXu <weichenxu...@outlook.com> Closes #13558 from WeichenXu123/update_python_sql_interface_refreshTable. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5344bade Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5344bade Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5344bade Branch: refs/heads/master Commit: 5344bade8efb6f12aa43fbfbbbc2e3c0c7d16d98 Parents: 5320adc Author: WeichenXu <weichenxu...@outlook.com> Authored: Thu Jun 30 23:00:39 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 23:00:39 2016 +0800 -- python/pyspark/sql/catalog.py | 5 + .../src/main/scala/org/apache/spark/sql/catalog/Catalog.scala | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5344bade/python/pyspark/sql/catalog.py -- diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 3033f14..4af930a 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -232,6 +232,11 @@ class Catalog(object): """Removes all cached tables from the in-memory cache.""" self._jcatalog.clearCache() +@since(2.0) +def refreshTable(self, tableName): +"""Invalidate and refresh all the cached metadata of the given table.""" +self._jcatalog.refreshTable(tableName) + def _reset(self): """(Internal use only) Drop all existing databases (except "default"), tables, partitions and functions, and set the current database to "default". 
http://git-wip-us.apache.org/repos/asf/spark/blob/5344bade/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 083a63c..91ed9b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -214,7 +214,7 @@ abstract class Catalog { def clearCache(): Unit /** - * Invalidate and refresh all the cached the metadata of the given table. For performance reasons, + * Invalidate and refresh all the cached metadata of the given table. For performance reasons, * Spark SQL or the external data source library it uses might cache certain metadata about a * table, such as the location of blocks. When those change outside of Spark SQL, users should * call this function to invalidate the cache. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [BUILD] Fix version in poms related to kafka-0-10
Repository: spark Updated Branches: refs/heads/branch-2.0 56207fc3b -> 98056a1f8 [BUILD] Fix version in poms related to kafka-0-10 self explanatory Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #13994 from tdas/SPARK-12177-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/98056a1f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/98056a1f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/98056a1f Branch: refs/heads/branch-2.0 Commit: 98056a1f8683385599f194a4b963769e3342bff3 Parents: 56207fc Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Thu Jun 30 22:10:56 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 22:10:56 2016 +0800 -- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/98056a1f/external/kafka-0-10-assembly/pom.xml -- diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f2468d1..59f41f1 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/98056a1f/external/kafka-0-10/pom.xml -- diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 50395f6..2696561 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.1-SNAPSHOT ../../pom.xml - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12177][TEST] Removed test to avoid compilation issue in scala 2.10
Repository: spark Updated Branches: refs/heads/branch-2.0 1d274455c -> 6a4f4c1d7 [SPARK-12177][TEST] Removed test to avoid compilation issue in scala 2.10 ## What changes were proposed in this pull request? The commented lines failed scala 2.10 build. This is because of change in behavior of case classes between 2.10 and 2.11. In scala 2.10, if companion object of a case class has explicitly defined apply(), then the implicit apply method is not generated. In scala 2.11 it is generated. Hence, the lines compile fine in 2.11 but not in 2.10. This simply comments the tests to fix broken build. Correct solution is pending. Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #13992 from tdas/SPARK-12177. (cherry picked from commit de8ab313e1fe59f849a62e59349224581ff0b40a) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a4f4c1d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a4f4c1d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a4f4c1d Branch: refs/heads/branch-2.0 Commit: 6a4f4c1d751db9542ba49755e859b55b42be3236 Parents: 1d27445 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Thu Jun 30 18:06:04 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 18:06:20 2016 +0800 -- .../spark/streaming/kafka010/JavaConsumerStrategySuite.java | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a4f4c1d/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java -- diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java index aba45f5..8d7c05b 100644 --- 
a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java +++ b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java @@ -50,8 +50,8 @@ public class JavaConsumerStrategySuite implements Serializable { JavaConverters.mapAsScalaMapConverter(offsets).asScala(); // make sure constructors can be called from java -final ConsumerStrategy<String, String> sub0 = - Subscribe.<String, String>apply(topics, kafkaParams, offsets); +// final ConsumerStrategy<String, String> sub0 = // does not compile in Scala 2.10 +// Subscribe.<String, String>apply(topics, kafkaParams, offsets); final ConsumerStrategy<String, String> sub1 = Subscribe.<String, String>apply(sTopics, sKafkaParams, sOffsets); final ConsumerStrategy<String, String> sub2 = @@ -65,8 +65,8 @@ public class JavaConsumerStrategySuite implements Serializable { sub1.executorKafkaParams().get("bootstrap.servers"), sub3.executorKafkaParams().get("bootstrap.servers")); -final ConsumerStrategy<String, String> asn0 = - Assign.<String, String>apply(parts, kafkaParams, offsets); +// final ConsumerStrategy<String, String> asn0 = // does not compile in Scala 2.10 +// Assign.<String, String>apply(parts, kafkaParams, offsets); final ConsumerStrategy<String, String> asn1 = Assign.<String, String>apply(sParts, sKafkaParams, sOffsets); final ConsumerStrategy<String, String> asn2 = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12177][TEST] Removed test to avoid compilation issue in scala 2.10
Repository: spark Updated Branches: refs/heads/master b30a2dc7c -> de8ab313e [SPARK-12177][TEST] Removed test to avoid compilation issue in scala 2.10 ## What changes were proposed in this pull request? The commented lines failed scala 2.10 build. This is because of change in behavior of case classes between 2.10 and 2.11. In scala 2.10, if companion object of a case class has explicitly defined apply(), then the implicit apply method is not generated. In scala 2.11 it is generated. Hence, the lines compile fine in 2.11 but not in 2.10. This simply comments the tests to fix broken build. Correct solution is pending. Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #13992 from tdas/SPARK-12177. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de8ab313 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de8ab313 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de8ab313 Branch: refs/heads/master Commit: de8ab313e1fe59f849a62e59349224581ff0b40a Parents: b30a2dc Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Thu Jun 30 18:06:04 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 18:06:04 2016 +0800 -- .../spark/streaming/kafka010/JavaConsumerStrategySuite.java | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de8ab313/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java -- diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java index aba45f5..8d7c05b 100644 --- a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java +++ b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java @@ -50,8 +50,8 
@@ public class JavaConsumerStrategySuite implements Serializable { JavaConverters.mapAsScalaMapConverter(offsets).asScala(); // make sure constructors can be called from java -final ConsumerStrategy<String, String> sub0 = - Subscribe.<String, String>apply(topics, kafkaParams, offsets); +// final ConsumerStrategy<String, String> sub0 = // does not compile in Scala 2.10 +// Subscribe.<String, String>apply(topics, kafkaParams, offsets); final ConsumerStrategy<String, String> sub1 = Subscribe.<String, String>apply(sTopics, sKafkaParams, sOffsets); final ConsumerStrategy<String, String> sub2 = @@ -65,8 +65,8 @@ public class JavaConsumerStrategySuite implements Serializable { sub1.executorKafkaParams().get("bootstrap.servers"), sub3.executorKafkaParams().get("bootstrap.servers")); -final ConsumerStrategy<String, String> asn0 = - Assign.<String, String>apply(parts, kafkaParams, offsets); +// final ConsumerStrategy<String, String> asn0 = // does not compile in Scala 2.10 +// Assign.<String, String>apply(parts, kafkaParams, offsets); final ConsumerStrategy<String, String> asn1 = Assign.<String, String>apply(sParts, sKafkaParams, sOffsets); final ConsumerStrategy<String, String> asn2 = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-16134][SQL] optimizer rules for typed filter"
Repository: spark Updated Branches: refs/heads/branch-2.0 8da431473 -> e1bdf1e02 Revert "[SPARK-16134][SQL] optimizer rules for typed filter" This reverts commit 8da4314735ed55f259642e2977d8d7bf2212474f. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e1bdf1e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e1bdf1e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e1bdf1e0 Branch: refs/heads/branch-2.0 Commit: e1bdf1e02483bf513b6e012e8921d440a5efbc11 Parents: 8da4314 Author: Cheng Lian <l...@databricks.com> Authored: Thu Jun 30 08:17:43 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 08:17:43 2016 +0800 -- .../apache/spark/sql/catalyst/dsl/package.scala | 6 +- .../expressions/ReferenceToExpressions.scala| 1 - .../sql/catalyst/optimizer/Optimizer.scala | 98 +++- .../sql/catalyst/plans/logical/object.scala | 47 +- .../TypedFilterOptimizationSuite.scala | 86 - .../scala/org/apache/spark/sql/Dataset.scala| 12 ++- .../spark/sql/execution/SparkStrategies.scala | 2 - .../scala/org/apache/spark/sql/QueryTest.scala | 1 - 8 files changed, 91 insertions(+), 162 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e1bdf1e0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 84c9cc8..2ca990d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -293,7 +293,11 @@ package object dsl { def where(condition: Expression): LogicalPlan = Filter(condition, logicalPlan) - def filter[T : Encoder](func: T => Boolean): LogicalPlan = TypedFilter(func, logicalPlan) + def filter[T : Encoder](func: T => Boolean): LogicalPlan = { +val deserialized = 
logicalPlan.deserialize[T] +val condition = expressions.callFunction(func, BooleanType, deserialized.output.head) +Filter(condition, deserialized).serialize[T] + } def serialize[T : Encoder]: LogicalPlan = CatalystSerde.serialize[T](logicalPlan) http://git-wip-us.apache.org/repos/asf/spark/blob/e1bdf1e0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala index 127797c..502d791 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala @@ -45,7 +45,6 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal - case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + http://git-wip-us.apache.org/repos/asf/spark/blob/e1bdf1e0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index aa90735..f24f8b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -21,7 +21,6 @@ import scala.annotation.tailrec import scala.collection.immutable.HashSet import scala.collection.mutable.ArrayBuffer -import org.apache.spark.api.java.function.FilterFunction import org.apache.spark.sql.catalyst.{CatalystConf, 
SimpleCatalystConf} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} @@ -110,7 +109,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) Batch("Decimal Optimizations", fixedPoint, DecimalAggregates) :: Batch("Typed Filter Optimization",
spark git commit: [SPARK-16134][SQL] optimizer rules for typed filter
Repository: spark Updated Branches: refs/heads/branch-2.0 011befd20 -> 8da431473 [SPARK-16134][SQL] optimizer rules for typed filter ## What changes were proposed in this pull request? This PR adds 3 optimizer rules for typed filter: 1. push typed filter down through `SerializeFromObject` and eliminate the deserialization in filter condition. 2. pull typed filter up through `SerializeFromObject` and eliminate the deserialization in filter condition. 3. combine adjacent typed filters and share the deserialized object among all the condition expressions. This PR also adds `TypedFilter` logical plan, to separate it from normal filter, so that the concept is more clear and it's easier to write optimizer rules. ## How was this patch tested? `TypedFilterOptimizationSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #13846 from cloud-fan/filter. (cherry picked from commit d063898bebaaf4ec2aad24c3ac70aabdbf97a190) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8da43147 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8da43147 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8da43147 Branch: refs/heads/branch-2.0 Commit: 8da4314735ed55f259642e2977d8d7bf2212474f Parents: 011befd Author: Wenchen Fan <wenc...@databricks.com> Authored: Thu Jun 30 08:15:08 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 08:15:50 2016 +0800 -- .../apache/spark/sql/catalyst/dsl/package.scala | 6 +- .../expressions/ReferenceToExpressions.scala| 1 + .../sql/catalyst/optimizer/Optimizer.scala | 98 +--- .../sql/catalyst/plans/logical/object.scala | 47 +- .../TypedFilterOptimizationSuite.scala | 86 + .../scala/org/apache/spark/sql/Dataset.scala| 12 +-- .../spark/sql/execution/SparkStrategies.scala | 2 + .../scala/org/apache/spark/sql/QueryTest.scala | 1 + 8 files changed, 162 insertions(+), 91 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/8da43147/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 2ca990d..84c9cc8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -293,11 +293,7 @@ package object dsl { def where(condition: Expression): LogicalPlan = Filter(condition, logicalPlan) - def filter[T : Encoder](func: T => Boolean): LogicalPlan = { -val deserialized = logicalPlan.deserialize[T] -val condition = expressions.callFunction(func, BooleanType, deserialized.output.head) -Filter(condition, deserialized).serialize[T] - } + def filter[T : Encoder](func: T => Boolean): LogicalPlan = TypedFilter(func, logicalPlan) def serialize[T : Encoder]: LogicalPlan = CatalystSerde.serialize[T](logicalPlan) http://git-wip-us.apache.org/repos/asf/spark/blob/8da43147/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala index 502d791..127797c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala @@ -45,6 +45,7 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal + case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, 
but " + http://git-wip-us.apache.org/repos/asf/spark/blob/8da43147/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index f24f8b7..aa90735 100644 --- a/sql/catalyst/src/main/scala/
spark git commit: [SPARK-16134][SQL] optimizer rules for typed filter
Repository: spark Updated Branches: refs/heads/master 2eaabfa41 -> d063898be [SPARK-16134][SQL] optimizer rules for typed filter ## What changes were proposed in this pull request? This PR adds 3 optimizer rules for typed filter: 1. push typed filter down through `SerializeFromObject` and eliminate the deserialization in filter condition. 2. pull typed filter up through `SerializeFromObject` and eliminate the deserialization in filter condition. 3. combine adjacent typed filters and share the deserialized object among all the condition expressions. This PR also adds `TypedFilter` logical plan, to separate it from normal filter, so that the concept is more clear and it's easier to write optimizer rules. ## How was this patch tested? `TypedFilterOptimizationSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #13846 from cloud-fan/filter. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d063898b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d063898b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d063898b Branch: refs/heads/master Commit: d063898bebaaf4ec2aad24c3ac70aabdbf97a190 Parents: 2eaabfa Author: Wenchen Fan <wenc...@databricks.com> Authored: Thu Jun 30 08:15:08 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 30 08:15:08 2016 +0800 -- .../apache/spark/sql/catalyst/dsl/package.scala | 6 +- .../expressions/ReferenceToExpressions.scala| 1 + .../sql/catalyst/optimizer/Optimizer.scala | 98 +--- .../sql/catalyst/plans/logical/object.scala | 47 +- .../TypedFilterOptimizationSuite.scala | 86 + .../scala/org/apache/spark/sql/Dataset.scala| 12 +-- .../spark/sql/execution/SparkStrategies.scala | 2 + .../scala/org/apache/spark/sql/QueryTest.scala | 1 + 8 files changed, 162 insertions(+), 91 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d063898b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 2ca990d..84c9cc8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -293,11 +293,7 @@ package object dsl { def where(condition: Expression): LogicalPlan = Filter(condition, logicalPlan) - def filter[T : Encoder](func: T => Boolean): LogicalPlan = { -val deserialized = logicalPlan.deserialize[T] -val condition = expressions.callFunction(func, BooleanType, deserialized.output.head) -Filter(condition, deserialized).serialize[T] - } + def filter[T : Encoder](func: T => Boolean): LogicalPlan = TypedFilter(func, logicalPlan) def serialize[T : Encoder]: LogicalPlan = CatalystSerde.serialize[T](logicalPlan) http://git-wip-us.apache.org/repos/asf/spark/blob/d063898b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala index 502d791..127797c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala @@ -45,6 +45,7 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal + case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + 
http://git-wip-us.apache.org/repos/asf/spark/blob/d063898b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 9bc8cea..842d6bc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/o
spark git commit: [SPARK-16100][SQL] fix bug when use Map as the buffer type of Aggregator
Repository: spark Updated Branches: refs/heads/branch-2.0 5626a0af5 -> d73c38ed0 [SPARK-16100][SQL] fix bug when use Map as the buffer type of Aggregator ## What changes were proposed in this pull request? The root cause is in `MapObjects`. Its parameter `loopVar` is not declared as child, but sometimes can be same with `lambdaFunction`(e.g. the function that takes `loopVar` and produces `lambdaFunction` may be `identity`), which is a child. This brings trouble when call `withNewChildren`, it may mistakenly treat `loopVar` as a child and cause `IndexOutOfBoundsException: 0` later. This PR fixes this bug by simply pulling out the paremters from `LambdaVariable` and pass them to `MapObjects` directly. ## How was this patch tested? new test in `DatasetAggregatorSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #13835 from cloud-fan/map-objects. (cherry picked from commit 8a977b065418f07d2bf4fe1607a5534c32d04c47) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d73c38ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d73c38ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d73c38ed Branch: refs/heads/branch-2.0 Commit: d73c38ed0e129bdcb634000153516fca4b31b9d0 Parents: 5626a0a Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Jun 29 06:39:28 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 29 06:40:05 2016 +0800 -- .../catalyst/expressions/objects/objects.scala | 28 .../spark/sql/DatasetAggregatorSuite.scala | 15 +++ 2 files changed, 32 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d73c38ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index c597a2a..ea4dee1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -353,7 +353,7 @@ object MapObjects { val loopValue = "MapObjects_loopValue" + curId.getAndIncrement() val loopIsNull = "MapObjects_loopIsNull" + curId.getAndIncrement() val loopVar = LambdaVariable(loopValue, loopIsNull, elementType) -MapObjects(loopVar, function(loopVar), inputData) +MapObjects(loopValue, loopIsNull, elementType, function(loopVar), inputData) } } @@ -365,14 +365,20 @@ object MapObjects { * The following collection ObjectTypes are currently supported: * Seq, Array, ArrayData, java.util.List * - * @param loopVar A place holder that used as the loop variable when iterate the collection, and - *used as input for the `lambdaFunction`. It also carries the element type info. + * @param loopValue the name of the loop variable that used when iterate the collection, and used + * as input for the `lambdaFunction` + * @param loopIsNull the nullity of the loop variable that used when iterate the collection, and + * used as input for the `lambdaFunction` + * @param loopVarDataType the data type of the loop variable that used when iterate the collection, + *and used as input for the `lambdaFunction` * @param lambdaFunction A function that take the `loopVar` as input, and used as lambda function * to handle collection elements. * @param inputData An expression that when evaluated returns a collection object. 
*/ case class MapObjects private( -loopVar: LambdaVariable, +loopValue: String, +loopIsNull: String, +loopVarDataType: DataType, lambdaFunction: Expression, inputData: Expression) extends Expression with NonSQLExpression { @@ -386,9 +392,9 @@ case class MapObjects private( override def dataType: DataType = ArrayType(lambdaFunction.dataType) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val elementJavaType = ctx.javaType(loopVar.dataType) -ctx.addMutableState("boolean", loopVar.isNull, "") -ctx.addMutableState(elementJavaType, loopVar.value, "") +val elementJavaType = ctx.javaType(loopVarDataType) +ctx.addMutableState("boolean", loopIsNull, "") +ctx.addMutableState(elementJavaType, loopValue, "") val genInputData = inputData.genCode(ctx) val genFunction = lambdaFunction.genCode
spark git commit: [SPARK-16100][SQL] fix bug when use Map as the buffer type of Aggregator
Repository: spark Updated Branches: refs/heads/master 25520e976 -> 8a977b065 [SPARK-16100][SQL] fix bug when use Map as the buffer type of Aggregator ## What changes were proposed in this pull request? The root cause is in `MapObjects`. Its parameter `loopVar` is not declared as child, but sometimes can be same with `lambdaFunction`(e.g. the function that takes `loopVar` and produces `lambdaFunction` may be `identity`), which is a child. This brings trouble when call `withNewChildren`, it may mistakenly treat `loopVar` as a child and cause `IndexOutOfBoundsException: 0` later. This PR fixes this bug by simply pulling out the paremters from `LambdaVariable` and pass them to `MapObjects` directly. ## How was this patch tested? new test in `DatasetAggregatorSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #13835 from cloud-fan/map-objects. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a977b06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a977b06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a977b06 Branch: refs/heads/master Commit: 8a977b065418f07d2bf4fe1607a5534c32d04c47 Parents: 25520e9 Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Jun 29 06:39:28 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 29 06:39:28 2016 +0800 -- .../catalyst/expressions/objects/objects.scala | 28 .../spark/sql/DatasetAggregatorSuite.scala | 15 +++ 2 files changed, 32 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a977b06/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index c597a2a..ea4dee1 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -353,7 +353,7 @@ object MapObjects { val loopValue = "MapObjects_loopValue" + curId.getAndIncrement() val loopIsNull = "MapObjects_loopIsNull" + curId.getAndIncrement() val loopVar = LambdaVariable(loopValue, loopIsNull, elementType) -MapObjects(loopVar, function(loopVar), inputData) +MapObjects(loopValue, loopIsNull, elementType, function(loopVar), inputData) } } @@ -365,14 +365,20 @@ object MapObjects { * The following collection ObjectTypes are currently supported: * Seq, Array, ArrayData, java.util.List * - * @param loopVar A place holder that used as the loop variable when iterate the collection, and - *used as input for the `lambdaFunction`. It also carries the element type info. + * @param loopValue the name of the loop variable that used when iterate the collection, and used + * as input for the `lambdaFunction` + * @param loopIsNull the nullity of the loop variable that used when iterate the collection, and + * used as input for the `lambdaFunction` + * @param loopVarDataType the data type of the loop variable that used when iterate the collection, + *and used as input for the `lambdaFunction` * @param lambdaFunction A function that take the `loopVar` as input, and used as lambda function * to handle collection elements. * @param inputData An expression that when evaluated returns a collection object. 
*/ case class MapObjects private( -loopVar: LambdaVariable, +loopValue: String, +loopIsNull: String, +loopVarDataType: DataType, lambdaFunction: Expression, inputData: Expression) extends Expression with NonSQLExpression { @@ -386,9 +392,9 @@ case class MapObjects private( override def dataType: DataType = ArrayType(lambdaFunction.dataType) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val elementJavaType = ctx.javaType(loopVar.dataType) -ctx.addMutableState("boolean", loopVar.isNull, "") -ctx.addMutableState(elementJavaType, loopVar.value, "") +val elementJavaType = ctx.javaType(loopVarDataType) +ctx.addMutableState("boolean", loopIsNull, "") +ctx.addMutableState(elementJavaType, loopValue, "") val genInputData = inputData.genCode(ctx) val genFunction = lambdaFunction.genCode(ctx) val dataLength = ctx.freshName("dataLength") @@ -443,11 +449,11 @@ case class MapObjects priva
spark git commit: [SPARK-16221][SQL] Redirect Parquet JUL logger via SLF4J for WRITE operations
Repository: spark Updated Branches: refs/heads/master 50fdd866b -> a0da854fb [SPARK-16221][SQL] Redirect Parquet JUL logger via SLF4J for WRITE operations ## What changes were proposed in this pull request? [SPARK-8118](https://github.com/apache/spark/pull/8196) implements redirecting Parquet JUL logger via SLF4J, but it is currently applied only when READ operations occurs. If users use only WRITE operations, there occurs many Parquet logs. This PR makes the redirection work on WRITE operations, too. **Before** ```scala scala> spark.range(10).write.format("parquet").mode("overwrite").save("/tmp/p") SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". SLF4J: Defaulting to no-operation (NOP) logger implementation SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. Jun 26, 2016 9:04:38 PM INFO: org.apache.parquet.hadoop.codec.CodecConfig: Compression: SNAPPY about 70 lines Parquet Log . scala> spark.range(10).write.format("parquet").mode("overwrite").save("/tmp/p") about 70 lines Parquet Log . ``` **After** ```scala scala> spark.range(10).write.format("parquet").mode("overwrite").save("/tmp/p") SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". SLF4J: Defaulting to no-operation (NOP) logger implementation SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. scala> spark.range(10).write.format("parquet").mode("overwrite").save("/tmp/p") ``` This PR also fixes some typos. ## How was this patch tested? Manual. Author: Dongjoon Hyun <dongj...@apache.org> Closes #13918 from dongjoon-hyun/SPARK-16221. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0da854f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0da854f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0da854f Branch: refs/heads/master Commit: a0da854fb3748aca0128377f0955600cb7a2b5bc Parents: 50fdd86 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Tue Jun 28 13:01:18 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jun 28 13:01:18 2016 +0800 -- .../datasources/parquet/ParquetFileFormat.scala| 17 - 1 file changed, 12 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a0da854f/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 2cce3db..80002d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -129,6 +129,8 @@ private[sql] class ParquetFileFormat conf.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) } +ParquetFileFormat.redirectParquetLogs() + new OutputWriterFactory { override def newInstance( path: String, @@ -468,9 +470,9 @@ private[sql] class ParquetOutputWriterFactory( override private[sql] def newWriter(path: String): OutputWriter = new OutputWriter { // Create TaskAttemptContext that is used to pass on Configuration to the ParquetRecordWriter -private val hadoopTaskAttempId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0) +private val hadoopTaskAttemptId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0) private val hadoopAttemptContext = new TaskAttemptContextImpl( - 
serializableConf.value, hadoopTaskAttempId) + serializableConf.value, hadoopTaskAttemptId) // Instance of ParquetRecordWriter that does not use OutputCommitter private val recordWriter = createNoCommitterRecordWriter(path, hadoopAttemptContext) @@ -505,7 +507,7 @@ private[sql] class ParquetOutputWriterFactory( dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { throw new UnsupportedOperationException( - "this verison of newInstance not supported for " + + "this version of newInstance not supported for " + "ParquetOutputWriterFactory") } } @@ -665,7 +667,7 @@ private[sql] object ParquetFileFormat extends Logging { Some(Try(DataType.fromJson(serializedSchema.get)) .recover { case _: Throwable =>
spark git commit: [SPARK-10591][SQL][TEST] Add a testcase to ensure if `checkAnswer` handles map correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 ea8d419c1 -> 664426e00 [SPARK-10591][SQL][TEST] Add a testcase to ensure if `checkAnswer` handles map correctly ## What changes were proposed in this pull request? This PR adds a testcase to ensure if `checkAnswer` handles Map type correctly. ## How was this patch tested? Pass the jenkins tests. Author: Dongjoon Hyun <dongj...@apache.org> Closes #13913 from dongjoon-hyun/SPARK-10591. (cherry picked from commit 11f420b4bbcd607346204fb6fd7db7efe948cdac) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/664426e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/664426e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/664426e0 Branch: refs/heads/branch-2.0 Commit: 664426e003bb83e020800798013cf5d8a68051f2 Parents: ea8d419 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Mon Jun 27 19:04:50 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 27 19:05:18 2016 +0800 -- .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala| 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/664426e0/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index bd8479b..43cbc03 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -34,6 +34,13 @@ class DatasetSuite extends QueryTest with SharedSQLContext { private implicit val ordering = Ordering.by((c: ClassData) => c.a -> c.b) + test("checkAnswer should compare map correctly") { +val data = Seq((1, "2", Map(1 -> 2, 2 -> 1))) +checkAnswer( + data.toDF(), + Seq(Row(1, "2", Map(2 -> 1, 1 -> 2 + } + test("toDS") { val data = Seq(("a", 1), 
("b", 2), ("c", 3)) checkDataset( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10591][SQL][TEST] Add a testcase to ensure if `checkAnswer` handles map correctly
Repository: spark Updated Branches: refs/heads/master 52d4fe057 -> 11f420b4b [SPARK-10591][SQL][TEST] Add a testcase to ensure if `checkAnswer` handles map correctly ## What changes were proposed in this pull request? This PR adds a testcase to ensure if `checkAnswer` handles Map type correctly. ## How was this patch tested? Pass the jenkins tests. Author: Dongjoon Hyun <dongj...@apache.org> Closes #13913 from dongjoon-hyun/SPARK-10591. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/11f420b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/11f420b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/11f420b4 Branch: refs/heads/master Commit: 11f420b4bbcd607346204fb6fd7db7efe948cdac Parents: 52d4fe0 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Mon Jun 27 19:04:50 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 27 19:04:50 2016 +0800 -- .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala| 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/11f420b4/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index bd8479b..43cbc03 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -34,6 +34,13 @@ class DatasetSuite extends QueryTest with SharedSQLContext { private implicit val ordering = Ordering.by((c: ClassData) => c.a -> c.b) + test("checkAnswer should compare map correctly") { +val data = Seq((1, "2", Map(1 -> 2, 2 -> 1))) +checkAnswer( + data.toDF(), + Seq(Row(1, "2", Map(2 -> 1, 1 -> 2 + } + test("toDS") { val data = Seq(("a", 1), ("b", 2), ("c", 3)) checkDataset( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16165][SQL] Fix the update logic for InMemoryTableScanExec.readBatches
Repository: spark Updated Branches: refs/heads/branch-2.0 05677bb5a -> e8d1bf60a [SPARK-16165][SQL] Fix the update logic for InMemoryTableScanExec.readBatches ## What changes were proposed in this pull request? Currently, `readBatches` accumulator of `InMemoryTableScanExec` is updated only when `spark.sql.inMemoryColumnarStorage.partitionPruning` is true. Although this metric is used for only testing purpose, we had better have correct metric without considering SQL options. ## How was this patch tested? Pass the Jenkins tests (including a new testcase). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13870 from dongjoon-hyun/SPARK-16165. (cherry picked from commit 264bc63623b20529abcf84abcb333e7c16ad1ef9) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8d1bf60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8d1bf60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8d1bf60 Branch: refs/heads/branch-2.0 Commit: e8d1bf60a45c6662c955b5a3618ff7299713b6d8 Parents: 05677bb Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Jun 24 07:19:20 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 24 07:19:44 2016 +0800 -- .../execution/columnar/InMemoryTableScanExec.scala | 6 +++--- .../columnar/PartitionBatchPruningSuite.scala| 15 +++ 2 files changed, 18 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8d1bf60/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 2695f35..183e494 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -147,9 +147,6 @@ private[sql] case class InMemoryTableScanExec( logInfo(s"Skipping partition based on stats $statsString") false } else { - if (enableAccumulators) { -readBatches.add(1) - } true } } @@ -159,6 +156,9 @@ private[sql] case class InMemoryTableScanExec( // update SQL metrics val withMetrics = cachedBatchesToScan.map { batch => +if (enableAccumulators) { + readBatches.add(1) +} numOutputRows += batch.numRows batch } http://git-wip-us.apache.org/repos/asf/spark/blob/e8d1bf60/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala index a118cec..7ca8e04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala @@ -119,6 +119,21 @@ class PartitionBatchPruningSuite } } + // With disable IN_MEMORY_PARTITION_PRUNING option + test("disable IN_MEMORY_PARTITION_PRUNING") { +spark.conf.set(SQLConf.IN_MEMORY_PARTITION_PRUNING.key, false) + +val df = sql("SELECT key FROM pruningData WHERE key = 1") +val result = df.collect().map(_(0)).toArray +assert(result.length === 1) + +val (readPartitions, readBatches) = df.queryExecution.sparkPlan.collect { +case in: InMemoryTableScanExec => (in.readPartitions.value, in.readBatches.value) + }.head +assert(readPartitions === 5) +assert(readBatches === 10) + } + def checkBatchPruning( query: String, expectedReadPartitions: Int, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16165][SQL] Fix the update logic for InMemoryTableScanExec.readBatches
Repository: spark Updated Branches: refs/heads/master 0e4bdebec -> 264bc6362 [SPARK-16165][SQL] Fix the update logic for InMemoryTableScanExec.readBatches ## What changes were proposed in this pull request? Currently, `readBatches` accumulator of `InMemoryTableScanExec` is updated only when `spark.sql.inMemoryColumnarStorage.partitionPruning` is true. Although this metric is used for only testing purpose, we had better have correct metric without considering SQL options. ## How was this patch tested? Pass the Jenkins tests (including a new testcase). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13870 from dongjoon-hyun/SPARK-16165. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/264bc636 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/264bc636 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/264bc636 Branch: refs/heads/master Commit: 264bc63623b20529abcf84abcb333e7c16ad1ef9 Parents: 0e4bdeb Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Jun 24 07:19:20 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 24 07:19:20 2016 +0800 -- .../execution/columnar/InMemoryTableScanExec.scala | 6 +++--- .../columnar/PartitionBatchPruningSuite.scala| 15 +++ 2 files changed, 18 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/264bc636/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 2695f35..183e494 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -147,9 +147,6 @@ private[sql] case class InMemoryTableScanExec( 
logInfo(s"Skipping partition based on stats $statsString") false } else { - if (enableAccumulators) { -readBatches.add(1) - } true } } @@ -159,6 +156,9 @@ private[sql] case class InMemoryTableScanExec( // update SQL metrics val withMetrics = cachedBatchesToScan.map { batch => +if (enableAccumulators) { + readBatches.add(1) +} numOutputRows += batch.numRows batch } http://git-wip-us.apache.org/repos/asf/spark/blob/264bc636/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala index a118cec..7ca8e04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala @@ -119,6 +119,21 @@ class PartitionBatchPruningSuite } } + // With disable IN_MEMORY_PARTITION_PRUNING option + test("disable IN_MEMORY_PARTITION_PRUNING") { +spark.conf.set(SQLConf.IN_MEMORY_PARTITION_PRUNING.key, false) + +val df = sql("SELECT key FROM pruningData WHERE key = 1") +val result = df.collect().map(_(0)).toArray +assert(result.length === 1) + +val (readPartitions, readBatches) = df.queryExecution.sparkPlan.collect { +case in: InMemoryTableScanExec => (in.readPartitions.value, in.readBatches.value) + }.head +assert(readPartitions === 5) +assert(readBatches === 10) + } + def checkBatchPruning( query: String, expectedReadPartitions: Int, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 60bd704b5 -> 503eb882c [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top level input object can not be null, but when we use `Encoders.tuple` to combine 2 encoders, their input objects are not top level anymore and can be null. We should handle this case. ## How was this patch tested? new test in DatasetSuite Author: Wenchen Fan <wenc...@databricks.com> Closes #13807 from cloud-fan/bug. (cherry picked from commit 01277d4b259dcf9cad25eece1377162b7a8c946d) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/503eb882 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/503eb882 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/503eb882 Branch: refs/heads/branch-2.0 Commit: 503eb882c14eac9681981199ccf8f699cab23bf0 Parents: 60bd704 Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 18:37:36 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/503eb882/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import 
org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g. for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. 
However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Li
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/master 39ad53f7f -> 01277d4b2 [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top level input object can not be null, but when we use `Encoders.tuple` to combine 2 encoders, their input objects are not top level anymore and can be null. We should handle this case. ## How was this patch tested? new test in DatasetSuite Author: Wenchen Fan <wenc...@databricks.com> Closes #13807 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01277d4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01277d4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01277d4b Branch: refs/heads/master Commit: 01277d4b259dcf9cad25eece1377162b7a8c946d Parents: 39ad53f Author: Wenchen Fan <wenc...@databricks.com> Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 18:32:14 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01277d4b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import 
org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g. for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. 
So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Literal.create(null, struct.dataType), struct) } } @@ -203,8 +221,12 @@ case class ExpressionEncoder[T]( // (intermediate
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/branch-2.0 838143a2a -> 60bd704b5 [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai <yh...@databricks.com> Closes #13830 from yhuai/SPARK-16121. (cherry picked from commit 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60bd704b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60bd704b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60bd704b Branch: refs/heads/branch-2.0 Commit: 60bd704b541c4d1991922ffd3dd5b47de9bd5821 Parents: 838143a Author: Yin Huai <yh...@databricks.com> Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 18:07:27 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60bd704b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import 
scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. + // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. 
+ val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/master d281b0baf -> 39ad53f7f [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai <yh...@databricks.com> Closes #13830 from yhuai/SPARK-16121. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39ad53f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39ad53f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39ad53f7 Branch: refs/heads/master Commit: 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8 Parents: d281b0b Author: Yin Huai <yh...@databricks.com> Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 18:07:07 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39ad53f7/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, 
LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. + // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. 
+ val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +if (f.isDirectory ) { + // If f is a directory, we do not need to call getFileBlockLocations (SPAR
spark git commit: [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0
Repository: spark Updated Branches: refs/heads/branch-2.0 5a4fce456 -> 77d8226df [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0 ## What changes were proposed in this pull request? Doc changes ## How was this patch tested? manual liancheng Author: Felix Cheung <felixcheun...@hotmail.com> Closes #13827 from felixcheung/sqldocdeprecate. (cherry picked from commit 79aa1d82ca56eb847cbf4ff81de0564b339988f6) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77d8226d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77d8226d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77d8226d Branch: refs/heads/branch-2.0 Commit: 77d8226dfc93fc5f7cde3cc601984fc1a1a54be5 Parents: 5a4fce4 Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Wed Jun 22 10:37:13 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 10:41:15 2016 +0800 -- docs/sql-programming-guide.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/77d8226d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index ddf8f70..4b52c94 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2143,7 +2143,7 @@ options. ## Upgrading From Spark SQL 1.6 to 2.0 - `SparkSession` is now the new entry point of Spark that replaces the old `SQLContext` and - `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. + `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. A new `catalog` interface is accessible from `SparkSession` - existing API on databases and tables access such as `listTables`, `createExternalTable`, `dropTempView`, `cacheTable` are moved here. - Dataset API and DataFrame API are unified. 
In Scala, `DataFrame` becomes a type alias for `Dataset[Row]`, while Java API users must replace `DataFrame` with `Dataset`. Both the typed @@ -2153,6 +2153,10 @@ options. APIs. Instead, `DataFrame` remains the primary programing abstraction, which is analogous to the single-node data frame notion in these languages. + - Dataset and DataFrame API `unionAll` has been deprecated and replaced by `union` + - Dataset and DataFrame API `explode` has been deprecated, alternatively, use `functions.explode()` with `select` or `flatMap` + - Dataset and DataFrame API `registerTempTable` has been deprecated and replaced by `createOrReplaceTempView` + ## Upgrading From Spark SQL 1.5 to 1.6 - From Spark 1.6, by default the Thrift server runs in multi-session mode. Which means each JDBC/ODBC - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0
Repository: spark Updated Branches: refs/heads/master 9493b079a -> 79aa1d82c [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0 ## What changes were proposed in this pull request? Doc changes ## How was this patch tested? manual liancheng Author: Felix Cheung <felixcheun...@hotmail.com> Closes #13827 from felixcheung/sqldocdeprecate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79aa1d82 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79aa1d82 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79aa1d82 Branch: refs/heads/master Commit: 79aa1d82ca56eb847cbf4ff81de0564b339988f6 Parents: 9493b07 Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Wed Jun 22 10:37:13 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 22 10:37:13 2016 +0800 -- docs/sql-programming-guide.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79aa1d82/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index ddf8f70..4b52c94 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2143,7 +2143,7 @@ options. ## Upgrading From Spark SQL 1.6 to 2.0 - `SparkSession` is now the new entry point of Spark that replaces the old `SQLContext` and - `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. + `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. A new `catalog` interface is accessible from `SparkSession` - existing API on databases and tables access such as `listTables`, `createExternalTable`, `dropTempView`, `cacheTable` are moved here. - Dataset API and DataFrame API are unified. In Scala, `DataFrame` becomes a type alias for `Dataset[Row]`, while Java API users must replace `DataFrame` with `Dataset`. 
Both the typed @@ -2153,6 +2153,10 @@ options. APIs. Instead, `DataFrame` remains the primary programing abstraction, which is analogous to the single-node data frame notion in these languages. + - Dataset and DataFrame API `unionAll` has been deprecated and replaced by `union` + - Dataset and DataFrame API `explode` has been deprecated, alternatively, use `functions.explode()` with `select` or `flatMap` + - Dataset and DataFrame API `registerTempTable` has been deprecated and replaced by `createOrReplaceTempView` + ## Upgrading From Spark SQL 1.5 to 1.6 - From Spark 1.6, by default the Thrift server runs in multi-session mode. Which means each JDBC/ODBC - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15894][SQL][DOC] Update docs for controlling #partitions
Repository: spark Updated Branches: refs/heads/master 58f6e27dd -> 41e0ffb19 [SPARK-15894][SQL][DOC] Update docs for controlling #partitions ## What changes were proposed in this pull request? Update docs for two parameters `spark.sql.files.maxPartitionBytes` and `spark.sql.files.openCostInBytes ` in Other Configuration Options. ## How was this patch tested? N/A Author: Takeshi YAMAMURO <linguin@gmail.com> Closes #13797 from maropu/SPARK-15894-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/41e0ffb1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/41e0ffb1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/41e0ffb1 Branch: refs/heads/master Commit: 41e0ffb19f678e9b1e87f747a5e4e3d44964e39a Parents: 58f6e27 Author: Takeshi YAMAMURO <linguin@gmail.com> Authored: Tue Jun 21 14:27:16 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jun 21 14:27:16 2016 +0800 -- docs/sql-programming-guide.md | 17 + 1 file changed, 17 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/41e0ffb1/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 4206f73..ddf8f70 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2016,6 +2016,23 @@ that these options will be deprecated in future release as more optimizations ar Property NameDefaultMeaning +spark.sql.files.maxPartitionBytes +134217728 (128 MB) + + The maximum number of bytes to pack into a single partition when reading files. + + + +spark.sql.files.openCostInBytes +4194304 (4 MB) + + The estimated cost to open a file, measured by the number of bytes could be scanned in the same + time. This is used when putting multiple files into a partition. It is better to over estimated, + then the partitions with small files will be faster than partitions with bigger files (which is + scheduled first). 
+ + + spark.sql.autoBroadcastJoinThreshold 10485760 (10 MB) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15894][SQL][DOC] Update docs for controlling #partitions
Repository: spark Updated Branches: refs/heads/branch-2.0 dbf7f48b6 -> 4e193d3da [SPARK-15894][SQL][DOC] Update docs for controlling #partitions ## What changes were proposed in this pull request? Update docs for two parameters `spark.sql.files.maxPartitionBytes` and `spark.sql.files.openCostInBytes ` in Other Configuration Options. ## How was this patch tested? N/A Author: Takeshi YAMAMURO <linguin@gmail.com> Closes #13797 from maropu/SPARK-15894-2. (cherry picked from commit 41e0ffb19f678e9b1e87f747a5e4e3d44964e39a) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e193d3d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e193d3d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e193d3d Branch: refs/heads/branch-2.0 Commit: 4e193d3daf5bdfb38d7df6da5b7abdd53888ec99 Parents: dbf7f48 Author: Takeshi YAMAMURO <linguin@gmail.com> Authored: Tue Jun 21 14:27:16 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jun 21 14:27:31 2016 +0800 -- docs/sql-programming-guide.md | 17 + 1 file changed, 17 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e193d3d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 4206f73..ddf8f70 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2016,6 +2016,23 @@ that these options will be deprecated in future release as more optimizations ar Property NameDefaultMeaning +spark.sql.files.maxPartitionBytes +134217728 (128 MB) + + The maximum number of bytes to pack into a single partition when reading files. + + + +spark.sql.files.openCostInBytes +4194304 (4 MB) + + The estimated cost to open a file, measured by the number of bytes could be scanned in the same + time. This is used when putting multiple files into a partition. 
It is better to over-estimate, + then the partitions with small files will be faster than partitions with bigger files (which is + scheduled first). + + + spark.sql.autoBroadcastJoinThreshold 10485760 (10 MB) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15863][SQL][DOC][SPARKR] sql programming guide updates to include sparkSession in R
Repository: spark Updated Branches: refs/heads/branch-2.0 4fc4eb943 -> dbf7f48b6 [SPARK-15863][SQL][DOC][SPARKR] sql programming guide updates to include sparkSession in R ## What changes were proposed in this pull request? Update doc as per discussion in PR #13592 ## How was this patch tested? manual shivaram liancheng Author: Felix Cheung <felixcheun...@hotmail.com> Closes #13799 from felixcheung/rsqlprogrammingguide. (cherry picked from commit 58f6e27dd70f476f99ac8204e6b405bced4d6de1) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbf7f48b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbf7f48b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbf7f48b Branch: refs/heads/branch-2.0 Commit: dbf7f48b6e73f3500b0abe9055ac204a3f756418 Parents: 4fc4eb9 Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Tue Jun 21 13:56:37 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jun 21 13:57:03 2016 +0800 -- docs/sparkr.md| 2 +- docs/sql-programming-guide.md | 34 -- 2 files changed, 17 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dbf7f48b/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 023bbcd..f018901 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -152,7 +152,7 @@ write.df(people, path="people.parquet", source="parquet", mode="overwrite") ### From Hive tables -You can also create SparkDataFrames from Hive tables. To do this we will need to create a SparkSession with Hive support which can access tables in the Hive MetaStore. Note that Spark should have been built with [Hive support](building-spark.html#building-with-hive-and-jdbc-support) and more details can be found in the [SQL programming guide](sql-programming-guide.html#starting-point-sqlcontext). 
In SparkR, by default it will attempt to create a SparkSession with Hive support enabled (`enableHiveSupport = TRUE`). +You can also create SparkDataFrames from Hive tables. To do this we will need to create a SparkSession with Hive support which can access tables in the Hive MetaStore. Note that Spark should have been built with [Hive support](building-spark.html#building-with-hive-and-jdbc-support) and more details can be found in the [SQL programming guide](sql-programming-guide.html#starting-point-sparksession). In SparkR, by default it will attempt to create a SparkSession with Hive support enabled (`enableHiveSupport = TRUE`). {% highlight r %} http://git-wip-us.apache.org/repos/asf/spark/blob/dbf7f48b/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d93f30b..4206f73 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -107,19 +107,17 @@ spark = SparkSession.build \ -Unlike Scala, Java, and Python API, we haven't finished migrating `SQLContext` to `SparkSession` for SparkR yet, so -the entry point into all relational functionality in SparkR is still the -`SQLContext` class in Spark 2.0. To create a basic `SQLContext`, all you need is a `SparkContext`. +The entry point into all functionality in Spark is the [`SparkSession`](api/R/sparkR.session.html) class. To initialize a basic `SparkSession`, just call `sparkR.session()`: {% highlight r %} -spark <- sparkRSQL.init(sc) +sparkR.session() {% endhighlight %} -Note that when invoked for the first time, `sparkRSQL.init()` initializes a global `SQLContext` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SQLContext` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SQLContext` instance around. 
+Note that when invoked for the first time, `sparkR.session()` initializes a global `SparkSession` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SparkSession` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SparkSession` instance around. -`SparkSession` (or `SQLContext` for SparkR) in Spark 2.0 provides builtin support for Hive features including the ability to +`SparkSession` in Spark 2.0 provides builtin support for Hive features including the ability to write queries using HiveQL, access to Hive U
spark git commit: [SPARK-15863][SQL][DOC][SPARKR] sql programming guide updates to include sparkSession in R
Repository: spark Updated Branches: refs/heads/master 07367533d -> 58f6e27dd [SPARK-15863][SQL][DOC][SPARKR] sql programming guide updates to include sparkSession in R ## What changes were proposed in this pull request? Update doc as per discussion in PR #13592 ## How was this patch tested? manual shivaram liancheng Author: Felix Cheung <felixcheun...@hotmail.com> Closes #13799 from felixcheung/rsqlprogrammingguide. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58f6e27d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58f6e27d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58f6e27d Branch: refs/heads/master Commit: 58f6e27dd70f476f99ac8204e6b405bced4d6de1 Parents: 0736753 Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Tue Jun 21 13:56:37 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Tue Jun 21 13:56:37 2016 +0800 -- docs/sparkr.md| 2 +- docs/sql-programming-guide.md | 34 -- 2 files changed, 17 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58f6e27d/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 023bbcd..f018901 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -152,7 +152,7 @@ write.df(people, path="people.parquet", source="parquet", mode="overwrite") ### From Hive tables -You can also create SparkDataFrames from Hive tables. To do this we will need to create a SparkSession with Hive support which can access tables in the Hive MetaStore. Note that Spark should have been built with [Hive support](building-spark.html#building-with-hive-and-jdbc-support) and more details can be found in the [SQL programming guide](sql-programming-guide.html#starting-point-sqlcontext). In SparkR, by default it will attempt to create a SparkSession with Hive support enabled (`enableHiveSupport = TRUE`). +You can also create SparkDataFrames from Hive tables. 
To do this we will need to create a SparkSession with Hive support which can access tables in the Hive MetaStore. Note that Spark should have been built with [Hive support](building-spark.html#building-with-hive-and-jdbc-support) and more details can be found in the [SQL programming guide](sql-programming-guide.html#starting-point-sparksession). In SparkR, by default it will attempt to create a SparkSession with Hive support enabled (`enableHiveSupport = TRUE`). {% highlight r %} http://git-wip-us.apache.org/repos/asf/spark/blob/58f6e27d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d93f30b..4206f73 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -107,19 +107,17 @@ spark = SparkSession.build \ -Unlike Scala, Java, and Python API, we haven't finished migrating `SQLContext` to `SparkSession` for SparkR yet, so -the entry point into all relational functionality in SparkR is still the -`SQLContext` class in Spark 2.0. To create a basic `SQLContext`, all you need is a `SparkContext`. +The entry point into all functionality in Spark is the [`SparkSession`](api/R/sparkR.session.html) class. To initialize a basic `SparkSession`, just call `sparkR.session()`: {% highlight r %} -spark <- sparkRSQL.init(sc) +sparkR.session() {% endhighlight %} -Note that when invoked for the first time, `sparkRSQL.init()` initializes a global `SQLContext` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SQLContext` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SQLContext` instance around. +Note that when invoked for the first time, `sparkR.session()` initializes a global `SparkSession` singleton instance, and always returns a reference to this instance for successive invocations. 
In this way, users only need to initialize the `SparkSession` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SparkSession` instance around. -`SparkSession` (or `SQLContext` for SparkR) in Spark 2.0 provides builtin support for Hive features including the ability to +`SparkSession` in Spark 2.0 provides builtin support for Hive features including the ability to write queries using HiveQL, access to Hive UDFs, and the ability to read data from Hive tables. To use these features, you do not need to have an existing Hive setup. @
spark git commit: [SPARK-16030][SQL] Allow specifying static partitions when inserting to data source tables
Repository: spark Updated Branches: refs/heads/branch-2.0 19397caab -> 0b0b5fe54 [SPARK-16030][SQL] Allow specifying static partitions when inserting to data source tables ## What changes were proposed in this pull request? This PR adds the static partition support to INSERT statement when the target table is a data source table. ## How was this patch tested? New tests in InsertIntoHiveTableSuite and DataSourceAnalysisSuite. **Note: This PR is based on https://github.com/apache/spark/pull/13766. The last commit is the actual change.** Author: Yin Huai <yh...@databricks.com> Closes #13769 from yhuai/SPARK-16030-1. (cherry picked from commit 905f774b71f4b814d5a2412c7c35bd023c3dfdf8) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b0b5fe5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b0b5fe5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b0b5fe5 Branch: refs/heads/branch-2.0 Commit: 0b0b5fe549086171d851d7c4458d48be9409380f Parents: 19397ca Author: Yin Huai <yh...@databricks.com> Authored: Mon Jun 20 20:17:47 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 20 20:18:17 2016 +0800 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 19 ++ .../datasources/DataSourceStrategy.scala| 127 +++- .../spark/sql/execution/datasources/rules.scala | 7 - .../spark/sql/internal/SessionState.scala | 2 +- .../sql/sources/DataSourceAnalysisSuite.scala | 202 +++ .../spark/sql/hive/HiveSessionState.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 3 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 97 - .../sql/hive/execution/HiveQuerySuite.scala | 2 +- 9 files changed, 436 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b0b5fe5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 7b451ba..8992276 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -313,6 +313,8 @@ trait CheckAnalysis extends PredicateHelper { |${s.catalogTable.identifier} """.stripMargin) + // TODO: We need to consolidate this kind of checks for InsertIntoTable + // with the rule of PreWriteCheck defined in extendedCheckRules. case InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) => failAnalysis( s""" @@ -320,6 +322,23 @@ trait CheckAnalysis extends PredicateHelper { |${s.catalogTable.identifier} """.stripMargin) + case InsertIntoTable(t, _, _, _, _) +if !t.isInstanceOf[LeafNode] || + t == OneRowRelation || + t.isInstanceOf[LocalRelation] => +failAnalysis(s"Inserting into an RDD-based table is not allowed.") + + case i @ InsertIntoTable(table, partitions, query, _, _) => +val numStaticPartitions = partitions.values.count(_.isDefined) +if (table.output.size != (query.output.size + numStaticPartitions)) { + failAnalysis( +s"$table requires that the data to be inserted have the same number of " + + s"columns as the target table: target table has ${table.output.size} " + + s"column(s) but the inserted data has " + + s"${query.output.size + numStaticPartitions} column(s), including " + + s"$numStaticPartitions partition column(s) having constant value(s).") +} + case o if !o.resolved => failAnalysis( s"unresolved operator ${operator.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/0b0b5fe5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2b47865..27133f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/executio
spark git commit: [SPARK-16030][SQL] Allow specifying static partitions when inserting to data source tables
Repository: spark Updated Branches: refs/heads/master 6d0f921ae -> 905f774b7 [SPARK-16030][SQL] Allow specifying static partitions when inserting to data source tables ## What changes were proposed in this pull request? This PR adds the static partition support to INSERT statement when the target table is a data source table. ## How was this patch tested? New tests in InsertIntoHiveTableSuite and DataSourceAnalysisSuite. **Note: This PR is based on https://github.com/apache/spark/pull/13766. The last commit is the actual change.** Author: Yin Huai <yh...@databricks.com> Closes #13769 from yhuai/SPARK-16030-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/905f774b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/905f774b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/905f774b Branch: refs/heads/master Commit: 905f774b71f4b814d5a2412c7c35bd023c3dfdf8 Parents: 6d0f921 Author: Yin Huai <yh...@databricks.com> Authored: Mon Jun 20 20:17:47 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 20 20:17:47 2016 +0800 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 19 ++ .../datasources/DataSourceStrategy.scala| 127 +++- .../spark/sql/execution/datasources/rules.scala | 7 - .../spark/sql/internal/SessionState.scala | 2 +- .../sql/sources/DataSourceAnalysisSuite.scala | 202 +++ .../spark/sql/hive/HiveSessionState.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 3 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 97 - .../sql/hive/execution/HiveQuerySuite.scala | 2 +- 9 files changed, 436 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/905f774b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala 
index 7b451ba..8992276 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -313,6 +313,8 @@ trait CheckAnalysis extends PredicateHelper { |${s.catalogTable.identifier} """.stripMargin) + // TODO: We need to consolidate this kind of checks for InsertIntoTable + // with the rule of PreWriteCheck defined in extendedCheckRules. case InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) => failAnalysis( s""" @@ -320,6 +322,23 @@ trait CheckAnalysis extends PredicateHelper { |${s.catalogTable.identifier} """.stripMargin) + case InsertIntoTable(t, _, _, _, _) +if !t.isInstanceOf[LeafNode] || + t == OneRowRelation || + t.isInstanceOf[LocalRelation] => +failAnalysis(s"Inserting into an RDD-based table is not allowed.") + + case i @ InsertIntoTable(table, partitions, query, _, _) => +val numStaticPartitions = partitions.values.count(_.isDefined) +if (table.output.size != (query.output.size + numStaticPartitions)) { + failAnalysis( +s"$table requires that the data to be inserted have the same number of " + + s"columns as the target table: target table has ${table.output.size} " + + s"column(s) but the inserted data has " + + s"${query.output.size + numStaticPartitions} column(s), including " + + s"$numStaticPartitions partition column(s) having constant value(s).") +} + case o if !o.resolved => failAnalysis( s"unresolved operator ${operator.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/905f774b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2b47865..27133f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrat
spark git commit: [SPARK-15916][SQL] JDBC filter push down should respect operator precedence
Repository: spark Updated Branches: refs/heads/master 7d65a0db4 -> ebb9a3b6f [SPARK-15916][SQL] JDBC filter push down should respect operator precedence ## What changes were proposed in this pull request? This PR fixes the problem that the precedence order is messed when pushing where-clause expression to JDBC layer. **Case 1:** For sql `select * from table where (a or b) and c`, the where-clause is wrongly converted to JDBC where-clause `a or (b and c)` after filter push down. The consequence is that JDBC may returns less or more rows than expected. **Case 2:** For sql `select * from table where always_false_condition`, the result table may not be empty if the JDBC RDD is partitioned using where-clause: ``` spark.read.jdbc(url, table, predicates = Array("partition 1 where clause", "partition 2 where clause"...) ``` ## How was this patch tested? Unit test. This PR also close #13640 Author: hyukjinkwon <gurwls...@gmail.com> Author: Sean Zhong <seanzh...@databricks.com> Closes #13743 from clockfly/SPARK-15916. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebb9a3b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebb9a3b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebb9a3b6 Branch: refs/heads/master Commit: ebb9a3b6fd834e2c856a192b4455aab83e9c4dc8 Parents: 7d65a0d Author: hyukjinkwon <gurwls...@gmail.com> Authored: Fri Jun 17 17:11:38 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 17 17:11:38 2016 -0700 -- .../execution/datasources/jdbc/JDBCRDD.scala| 4 +-- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 26 2 files changed, 28 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ebb9a3b6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 8d0906e..44cfbb9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -305,14 +305,14 @@ private[sql] class JDBCRDD( * `filters`, but as a WHERE clause suitable for injection into a SQL query. */ private val filterWhereClause: String = -filters.flatMap(JDBCRDD.compileFilter).mkString(" AND ") +filters.flatMap(JDBCRDD.compileFilter).map(p => s"($p)").mkString(" AND ") /** * A WHERE clause representing both `filters`, if any, and the current partition. 
*/ private def getWhereClause(part: JDBCPartition): String = { if (part.whereClause != null && filterWhereClause.length > 0) { - "WHERE " + filterWhereClause + " AND " + part.whereClause + "WHERE " + s"($filterWhereClause)" + " AND " + s"(${part.whereClause})" } else if (part.whereClause != null) { "WHERE " + part.whereClause } else if (filterWhereClause.length > 0) { http://git-wip-us.apache.org/repos/asf/spark/blob/ebb9a3b6/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index abb7918..d6ec40c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -661,4 +661,30 @@ class JDBCSuite extends SparkFunSuite assert(oracleDialect.getJDBCType(StringType). map(_.databaseTypeDefinition).get == "VARCHAR2(255)") } + + private def assertEmptyQuery(sqlString: String): Unit = { +assert(sql(sqlString).collect().isEmpty) + } + + test("SPARK-15916: JDBC filter operator push down should respect operator precedence") { +val TRUE = "NAME != 'non_exists'" +val FALSE1 = "THEID > 10" +val FALSE2 = "THEID < -10" + +assertEmptyQuery(s"SELECT * FROM foobar WHERE ($TRUE OR $FALSE1) AND $FALSE2") +assertEmptyQuery(s"SELECT * FROM foobar WHERE $FALSE1 AND ($FALSE2 OR $TRUE)") + +// Tests JDBCPartition whereClause clause push down. +withTempTable("tempFrame") { + val jdbcPartitionWhereClause = s"$FALSE1 OR $TRUE" + val df = spark.read
spark git commit: [SPARK-15916][SQL] JDBC filter push down should respect operator precedence
Repository: spark Updated Branches: refs/heads/branch-2.0 ca0802fd5 -> b22b20db6 [SPARK-15916][SQL] JDBC filter push down should respect operator precedence ## What changes were proposed in this pull request? This PR fixes the problem that the precedence order is messed when pushing where-clause expression to JDBC layer. **Case 1:** For sql `select * from table where (a or b) and c`, the where-clause is wrongly converted to JDBC where-clause `a or (b and c)` after filter push down. The consequence is that JDBC may returns less or more rows than expected. **Case 2:** For sql `select * from table where always_false_condition`, the result table may not be empty if the JDBC RDD is partitioned using where-clause: ``` spark.read.jdbc(url, table, predicates = Array("partition 1 where clause", "partition 2 where clause"...) ``` ## How was this patch tested? Unit test. This PR also close #13640 Author: hyukjinkwon <gurwls...@gmail.com> Author: Sean Zhong <seanzh...@databricks.com> Closes #13743 from clockfly/SPARK-15916. 
(cherry picked from commit ebb9a3b6fd834e2c856a192b4455aab83e9c4dc8) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b22b20db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b22b20db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b22b20db Branch: refs/heads/branch-2.0 Commit: b22b20db640e9fac20c5d54cc83964dc74393821 Parents: ca0802f Author: hyukjinkwon <gurwls...@gmail.com> Authored: Fri Jun 17 17:11:38 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 17 17:11:50 2016 -0700 -- .../execution/datasources/jdbc/JDBCRDD.scala| 4 +-- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 26 2 files changed, 28 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b22b20db/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 8d0906e..44cfbb9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -305,14 +305,14 @@ private[sql] class JDBCRDD( * `filters`, but as a WHERE clause suitable for injection into a SQL query. */ private val filterWhereClause: String = -filters.flatMap(JDBCRDD.compileFilter).mkString(" AND ") +filters.flatMap(JDBCRDD.compileFilter).map(p => s"($p)").mkString(" AND ") /** * A WHERE clause representing both `filters`, if any, and the current partition. 
*/ private def getWhereClause(part: JDBCPartition): String = { if (part.whereClause != null && filterWhereClause.length > 0) { - "WHERE " + filterWhereClause + " AND " + part.whereClause + "WHERE " + s"($filterWhereClause)" + " AND " + s"(${part.whereClause})" } else if (part.whereClause != null) { "WHERE " + part.whereClause } else if (filterWhereClause.length > 0) { http://git-wip-us.apache.org/repos/asf/spark/blob/b22b20db/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index abb7918..d6ec40c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -661,4 +661,30 @@ class JDBCSuite extends SparkFunSuite assert(oracleDialect.getJDBCType(StringType). map(_.databaseTypeDefinition).get == "VARCHAR2(255)") } + + private def assertEmptyQuery(sqlString: String): Unit = { +assert(sql(sqlString).collect().isEmpty) + } + + test("SPARK-15916: JDBC filter operator push down should respect operator precedence") { +val TRUE = "NAME != 'non_exists'" +val FALSE1 = "THEID > 10" +val FALSE2 = "THEID < -10" + +assertEmptyQuery(s"SELECT * FROM foobar WHERE ($TRUE OR $FALSE1) AND $FALSE2") +assertEmptyQuery(s"SELECT * FROM foobar WHERE $FALSE1 AND ($FALSE2 OR $TRUE)") + +// Tests JDBCPartition whereClause clause push down. +withTempTable
spark git commit: [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT
Repository: spark Updated Branches: refs/heads/branch-2.0 52cb1ad38 -> 26359d27c [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT What changes were proposed in this pull request? ~~If the temp table already exists, we should not silently replace it when doing `CACHE TABLE AS SELECT`. This is inconsistent with the behavior of `CREATE VIEW` or `CREATE TABLE`. This PR is to fix this silent drop.~~ ~~Maybe, we also can introduce new syntax for replacing the existing one. For example, in Hive, to replace a view, the syntax should be like `ALTER VIEW AS SELECT` or `CREATE OR REPLACE VIEW AS SELECT`~~ The table name in `CACHE TABLE AS SELECT` should NOT contain database prefix like "database.table". Thus, this PR captures this in Parser and outputs a better error message, instead of reporting the view already exists. In addition, refactoring the `Parser` to generate table identifiers instead of returning the table name string. How was this patch tested? - Added a test case for caching and uncaching qualified table names - Fixed a few test cases that do not drop temp table at the end - Added the related test case for the issue resolved in this PR Author: gatorsmile <gatorsm...@gmail.com> Author: xiaoli <lixiao1...@gmail.com> Author: Xiao Li <xiaoli@Xiaos-MacBook-Pro.local> Closes #13572 from gatorsmile/cacheTableAsSelect. 
(cherry picked from commit 6451cf9270b55465d8ecea4c4031329a1058561a) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26359d27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26359d27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26359d27 Branch: refs/heads/branch-2.0 Commit: 26359d27c47ae3ec53e442de3884ec9245d15cee Parents: 52cb1ad Author: gatorsmile <gatorsm...@gmail.com> Authored: Thu Jun 16 10:01:59 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 16 10:02:12 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 4 +- .../spark/sql/execution/SparkSqlParser.scala| 10 ++- .../spark/sql/execution/command/cache.scala | 20 ++--- .../spark/sql/execution/command/views.scala | 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 68 + .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../spark/sql/hive/CachedTableSuite.scala | 79 +++- 7 files changed, 121 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26359d27/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 044f910..b603196 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -114,8 +114,8 @@ statement tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable | REFRESH .*? #refreshResource -| CACHE LAZY? TABLE identifier (AS? query)? #cacheTable -| UNCACHE TABLE identifier #uncacheTable +| CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable +| UNCACHE TABLE tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? 
INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? #loadData http://git-wip-us.apache.org/repos/asf/spark/blob/26359d27/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a0508ad..154c25a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -221,14 +221,20 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { */ override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { val query = Option(ctx.query).map(plan) -CacheTab
spark git commit: [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT
Repository: spark Updated Branches: refs/heads/master 7c6c69263 -> 6451cf927 [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT What changes were proposed in this pull request? ~~If the temp table already exists, we should not silently replace it when doing `CACHE TABLE AS SELECT`. This is inconsistent with the behavior of `CREATE VIEW` or `CREATE TABLE`. This PR is to fix this silent drop.~~ ~~Maybe, we can also introduce new syntax for replacing the existing one. For example, in Hive, to replace a view, the syntax should be like `ALTER VIEW AS SELECT` or `CREATE OR REPLACE VIEW AS SELECT`~~ The table name in `CACHE TABLE AS SELECT` should NOT contain a database prefix like "database.table". Thus, this PR captures this in the Parser and outputs a better error message, instead of reporting that the view already exists. In addition, refactoring the `Parser` to generate table identifiers instead of returning the table name string. How was this patch tested? - Added a test case for caching and uncaching qualified table names - Fixed a few test cases that do not drop temp table at the end - Added the related test case for the issue resolved in this PR Author: gatorsmile <gatorsm...@gmail.com> Author: xiaoli <lixiao1...@gmail.com> Author: Xiao Li <xiaoli@Xiaos-MacBook-Pro.local> Closes #13572 from gatorsmile/cacheTableAsSelect. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6451cf92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6451cf92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6451cf92 Branch: refs/heads/master Commit: 6451cf9270b55465d8ecea4c4031329a1058561a Parents: 7c6c692 Author: gatorsmile <gatorsm...@gmail.com> Authored: Thu Jun 16 10:01:59 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Thu Jun 16 10:01:59 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 4 +- .../spark/sql/execution/SparkSqlParser.scala| 10 ++- .../spark/sql/execution/command/cache.scala | 20 ++--- .../spark/sql/execution/command/views.scala | 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 68 + .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../spark/sql/hive/CachedTableSuite.scala | 79 +++- 7 files changed, 121 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6451cf92/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 044f910..b603196 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -114,8 +114,8 @@ statement tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable | REFRESH .*? #refreshResource -| CACHE LAZY? TABLE identifier (AS? query)? #cacheTable -| UNCACHE TABLE identifier #uncacheTable +| CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable +| UNCACHE TABLE tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? 
#loadData http://git-wip-us.apache.org/repos/asf/spark/blob/6451cf92/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a0508ad..154c25a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -221,14 +221,20 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { */ override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { val query = Option(ctx.query).map(plan) -CacheTableCommand(ctx.identifier.getText, query, ctx.LAZY != null) +val tableIdent = visitTableIdentifier(ctx.tableIdentifier) +if (query
spark git commit: [SPARK-15901][SQL][TEST] Verification of CONVERT_METASTORE_ORC and CONVERT_METASTORE_PARQUET
Repository: spark Updated Branches: refs/heads/branch-2.0 f1e9d2d92 -> 4253ba71b [SPARK-15901][SQL][TEST] Verification of CONVERT_METASTORE_ORC and CONVERT_METASTORE_PARQUET What changes were proposed in this pull request? So far, we do not have test cases for verifying whether the external parameters `HiveUtils .CONVERT_METASTORE_ORC` and `HiveUtils.CONVERT_METASTORE_PARQUET` properly works when users use non-default values. This PR is to add such test cases for avoiding potential regression. How was this patch tested? N/A Author: gatorsmile <gatorsm...@gmail.com> Closes #13622 from gatorsmile/addTestCase4parquetOrcConversion. (cherry picked from commit 09925735b5e53db61ed12abae58864670a3a5f98) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4253ba71 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4253ba71 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4253ba71 Branch: refs/heads/branch-2.0 Commit: 4253ba71b6d291f0fcb3b67048ff915595c83c9e Parents: f1e9d2d Author: gatorsmile <gatorsm...@gmail.com> Authored: Wed Jun 15 14:08:55 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 15 14:09:05 2016 -0700 -- .../spark/sql/hive/orc/OrcQuerySuite.scala | 75 +++- .../apache/spark/sql/hive/parquetSuites.scala | 40 +++ 2 files changed, 83 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4253ba71/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index e6c9c5d..cd41da7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.hive.orc -import 
java.io.File import java.nio.charset.StandardCharsets import org.scalatest.BeforeAndAfterAll @@ -25,7 +24,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.{HiveUtils, MetastoreRelation} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.internal.SQLConf @@ -401,36 +400,48 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("SPARK-14070 Use ORC data source for SQL queries on ORC tables") { -withTempPath { dir => - withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true", -HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { -val path = dir.getCanonicalPath - -withTable("dummy_orc") { - withTempTable("single") { -spark.sql( - s"""CREATE TABLE dummy_orc(key INT, value STRING) - |STORED AS ORC - |LOCATION '$path' - """.stripMargin) - -val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1) -singleRowDF.createOrReplaceTempView("single") - -spark.sql( - s"""INSERT INTO TABLE dummy_orc - |SELECT key, value FROM single - """.stripMargin) - -val df = spark.sql("SELECT * FROM dummy_orc WHERE key=0") -checkAnswer(df, singleRowDF) - -val queryExecution = df.queryExecution -queryExecution.analyzed.collectFirst { - case _: LogicalRelation => () -}.getOrElse { - fail(s"Expecting the query plan to have LogicalRelation, but got:\n$queryExecution") + test("Verify the ORC conversion parameter: CONVERT_METASTORE_ORC") { +withTempTable("single") { + val singleRowDF = Seq((0, "foo")).toDF("key", "value") + singleRowDF.createOrReplaceTempView("single") + + Seq("true", "false").foreach { orcConversion => +withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> orcConversion) { + withTable("dummy_orc") { +withTempPath { dir => + val path = 
dir.getCanonicalPath +
spark git commit: [SPARK-15901][SQL][TEST] Verification of CONVERT_METASTORE_ORC and CONVERT_METASTORE_PARQUET
Repository: spark Updated Branches: refs/heads/master 4df8df5c2 -> 09925735b [SPARK-15901][SQL][TEST] Verification of CONVERT_METASTORE_ORC and CONVERT_METASTORE_PARQUET What changes were proposed in this pull request? So far, we do not have test cases for verifying whether the external parameters `HiveUtils .CONVERT_METASTORE_ORC` and `HiveUtils.CONVERT_METASTORE_PARQUET` properly works when users use non-default values. This PR is to add such test cases for avoiding potential regression. How was this patch tested? N/A Author: gatorsmile <gatorsm...@gmail.com> Closes #13622 from gatorsmile/addTestCase4parquetOrcConversion. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/09925735 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/09925735 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/09925735 Branch: refs/heads/master Commit: 09925735b5e53db61ed12abae58864670a3a5f98 Parents: 4df8df5 Author: gatorsmile <gatorsm...@gmail.com> Authored: Wed Jun 15 14:08:55 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Wed Jun 15 14:08:55 2016 -0700 -- .../spark/sql/hive/orc/OrcQuerySuite.scala | 75 +++- .../apache/spark/sql/hive/parquetSuites.scala | 40 +++ 2 files changed, 83 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/09925735/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index e6c9c5d..cd41da7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.hive.orc -import java.io.File import java.nio.charset.StandardCharsets import org.scalatest.BeforeAndAfterAll @@ -25,7 +24,7 @@ import 
org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.{HiveUtils, MetastoreRelation} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.internal.SQLConf @@ -401,36 +400,48 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("SPARK-14070 Use ORC data source for SQL queries on ORC tables") { -withTempPath { dir => - withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true", -HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { -val path = dir.getCanonicalPath - -withTable("dummy_orc") { - withTempTable("single") { -spark.sql( - s"""CREATE TABLE dummy_orc(key INT, value STRING) - |STORED AS ORC - |LOCATION '$path' - """.stripMargin) - -val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1) -singleRowDF.createOrReplaceTempView("single") - -spark.sql( - s"""INSERT INTO TABLE dummy_orc - |SELECT key, value FROM single - """.stripMargin) - -val df = spark.sql("SELECT * FROM dummy_orc WHERE key=0") -checkAnswer(df, singleRowDF) - -val queryExecution = df.queryExecution -queryExecution.analyzed.collectFirst { - case _: LogicalRelation => () -}.getOrElse { - fail(s"Expecting the query plan to have LogicalRelation, but got:\n$queryExecution") + test("Verify the ORC conversion parameter: CONVERT_METASTORE_ORC") { +withTempTable("single") { + val singleRowDF = Seq((0, "foo")).toDF("key", "value") + singleRowDF.createOrReplaceTempView("single") + + Seq("true", "false").foreach { orcConversion => +withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> orcConversion) { + withTable("dummy_orc") { +withTempPath { dir => + val path = dir.getCanonicalPath + spark.sql( +s""" + |CREATE TABLE dummy_orc(key INT, value STRING) + |STORED AS ORC +
spark git commit: [SPARK-15929] Fix portability of DataFrameSuite path globbing tests
Repository: spark Updated Branches: refs/heads/master ced8d669b -> a6babca1b [SPARK-15929] Fix portability of DataFrameSuite path globbing tests The DataFrameSuite regression tests for SPARK-13774 fail in my environment because they attempt to glob over all of `/mnt` and some of the subdirectories restrictive permissions which cause the test to fail. This patch rewrites those tests to remove all environment-specific assumptions; the tests now create their own unique temporary paths for use in the tests. Author: Josh Rosen <joshro...@databricks.com> Closes #13649 from JoshRosen/SPARK-15929. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6babca1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6babca1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6babca1 Branch: refs/heads/master Commit: a6babca1bf76e70488ce6005ec3b8b53afc7edfd Parents: ced8d66 Author: Josh Rosen <joshro...@databricks.com> Authored: Mon Jun 13 17:06:22 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 13 17:06:22 2016 -0700 -- .../org/apache/spark/sql/DataFrameSuite.scala | 45 1 file changed, 36 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a6babca1/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6bb0ce9..c8a0f71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.io.File import java.nio.charset.StandardCharsets +import java.util.UUID import scala.language.postfixOps import scala.util.Random @@ -35,6 +36,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, 
SharedSQLContext} import org.apache.spark.sql.test.SQLTestData.TestData2 import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class DataFrameSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -1495,18 +1497,43 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("SPARK-13774: Check error message for non existent path without globbed paths") { -val e = intercept[AnalysisException] (spark.read.format("csv"). - load("/xyz/file2", "/xyz/file21", "/abc/files555", "a")).getMessage() -assert(e.startsWith("Path does not exist")) +val uuid = UUID.randomUUID().toString +val baseDir = Utils.createTempDir() +try { + val e = intercept[AnalysisException] { +spark.read.format("csv").load( + new File(baseDir, "file").getAbsolutePath, + new File(baseDir, "file2").getAbsolutePath, + new File(uuid, "file3").getAbsolutePath, + uuid).rdd + } + assert(e.getMessage.startsWith("Path does not exist")) +} finally { + +} + } test("SPARK-13774: Check error message for not existent globbed paths") { -val e = intercept[AnalysisException] (spark.read.format("text"). - load( "/xyz/*")).getMessage() -assert(e.startsWith("Path does not exist")) +// Non-existent initial path component: +val nonExistentBasePath = "/" + UUID.randomUUID().toString +assert(!new File(nonExistentBasePath).exists()) +val e = intercept[AnalysisException] { + spark.read.format("text").load(s"$nonExistentBasePath/*") +} +assert(e.getMessage.startsWith("Path does not exist")) -val e1 = intercept[AnalysisException] (spark.read.json("/mnt/*/*-xyz.json").rdd). 
- getMessage() -assert(e1.startsWith("Path does not exist")) +// Existent initial path component, but no matching files: +val baseDir = Utils.createTempDir() +val childDir = Utils.createTempDir(baseDir.getAbsolutePath) +assert(childDir.exists()) +try { + val e1 = intercept[AnalysisException] { +spark.read.json(s"${baseDir.getAbsolutePath}/*/*-xyz.json").rdd + } + assert(e1.getMessage.startsWith("Path does not exist")) +} finally { + Utils.deleteRecursively(baseDir) +} } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15929] Fix portability of DataFrameSuite path globbing tests
Repository: spark Updated Branches: refs/heads/branch-2.0 1f3b5a5ac -> ab9a304a7 [SPARK-15929] Fix portability of DataFrameSuite path globbing tests The DataFrameSuite regression tests for SPARK-13774 fail in my environment because they attempt to glob over all of `/mnt` and some of the subdirectories restrictive permissions which cause the test to fail. This patch rewrites those tests to remove all environment-specific assumptions; the tests now create their own unique temporary paths for use in the tests. Author: Josh Rosen <joshro...@databricks.com> Closes #13649 from JoshRosen/SPARK-15929. (cherry picked from commit a6babca1bf76e70488ce6005ec3b8b53afc7edfd) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab9a304a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab9a304a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab9a304a Branch: refs/heads/branch-2.0 Commit: ab9a304a7c690c748d3d99f1d933116e25ea0a73 Parents: 1f3b5a5 Author: Josh Rosen <joshro...@databricks.com> Authored: Mon Jun 13 17:06:22 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 13 17:06:36 2016 -0700 -- .../org/apache/spark/sql/DataFrameSuite.scala | 45 1 file changed, 36 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab9a304a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6bb0ce9..c8a0f71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.io.File import java.nio.charset.StandardCharsets +import java.util.UUID import scala.language.postfixOps import scala.util.Random @@ -35,6 
+36,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSQLContext} import org.apache.spark.sql.test.SQLTestData.TestData2 import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class DataFrameSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -1495,18 +1497,43 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("SPARK-13774: Check error message for non existent path without globbed paths") { -val e = intercept[AnalysisException] (spark.read.format("csv"). - load("/xyz/file2", "/xyz/file21", "/abc/files555", "a")).getMessage() -assert(e.startsWith("Path does not exist")) +val uuid = UUID.randomUUID().toString +val baseDir = Utils.createTempDir() +try { + val e = intercept[AnalysisException] { +spark.read.format("csv").load( + new File(baseDir, "file").getAbsolutePath, + new File(baseDir, "file2").getAbsolutePath, + new File(uuid, "file3").getAbsolutePath, + uuid).rdd + } + assert(e.getMessage.startsWith("Path does not exist")) +} finally { + +} + } test("SPARK-13774: Check error message for not existent globbed paths") { -val e = intercept[AnalysisException] (spark.read.format("text"). - load( "/xyz/*")).getMessage() -assert(e.startsWith("Path does not exist")) +// Non-existent initial path component: +val nonExistentBasePath = "/" + UUID.randomUUID().toString +assert(!new File(nonExistentBasePath).exists()) +val e = intercept[AnalysisException] { + spark.read.format("text").load(s"$nonExistentBasePath/*") +} +assert(e.getMessage.startsWith("Path does not exist")) -val e1 = intercept[AnalysisException] (spark.read.json("/mnt/*/*-xyz.json").rdd). 
- getMessage() -assert(e1.startsWith("Path does not exist")) +// Existent initial path component, but no matching files: +val baseDir = Utils.createTempDir() +val childDir = Utils.createTempDir(baseDir.getAbsolutePath) +assert(childDir.exists()) +try { + val e1 = intercept[AnalysisException] { +spark.read.json(s"${baseDir.getAbsolutePath}/*/*-xyz.json").rdd + } + assert(e1.getMessage.startsWith("Path does not exist")) +} finally { + Utils.deleteRecursively(baseDir) +} } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-15639][SQL] Try to push down filter at RowGroups level for parquet reader"
Repository: spark Updated Branches: refs/heads/branch-2.0 a08715c7a -> 91dffcabd Revert "[SPARK-15639][SQL] Try to push down filter at RowGroups level for parquet reader" This reverts commit 7d6bd1196410563bd1fccc10e7bff6e75b5c9f22. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91dffcab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91dffcab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91dffcab Branch: refs/heads/branch-2.0 Commit: 91dffcabdecd4ab651024c027cf9716664084e1e Parents: a08715c Author: Cheng Lian <l...@databricks.com> Authored: Fri Jun 10 20:45:27 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 20:45:27 2016 -0700 -- .../catalyst/expressions/namedExpressions.scala | 8 --- .../datasources/FileSourceStrategy.scala| 9 +-- .../datasources/parquet/ParquetFileFormat.scala | 61 ++-- 3 files changed, 57 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91dffcab/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index c06a1ea..306a99d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -292,14 +292,6 @@ case class AttributeReference( } } - def withMetadata(newMetadata: Metadata): AttributeReference = { -if (metadata == newMetadata) { - this -} else { - AttributeReference(name, dataType, nullable, newMetadata)(exprId, qualifier, isGenerated) -} - } - override protected final def otherCopyArgs: Seq[AnyRef] = { exprId :: qualifier :: isGenerated :: Nil } 
http://git-wip-us.apache.org/repos/asf/spark/blob/91dffcab/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 7fc842f..13a86bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -84,14 +84,7 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { logInfo(s"Pruning directories with: ${partitionKeyFilters.mkString(",")}") val dataColumns = -l.resolve(files.dataSchema, files.sparkSession.sessionState.analyzer.resolver).map { c => - files.dataSchema.find(_.name == c.name).map { f => -c match { - case a: AttributeReference => a.withMetadata(f.metadata) - case _ => c -} - }.getOrElse(c) -} +l.resolve(files.dataSchema, files.sparkSession.sessionState.analyzer.resolver) // Partition keys are not available in the statistics of the files. 
val dataFilters = normalizedFilters.filter(_.references.intersect(partitionSet).isEmpty) http://git-wip-us.apache.org/repos/asf/spark/blob/91dffcab/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index bc4a9de..3735c94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -357,11 +357,6 @@ private[sql] class ParquetFileFormat val hadoopAttemptContext = new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) - // Try to push down filters when filter push-down is enabled. - // Notice: This push-down is RowGroups level, not individual records. - pushed.foreach { - ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, _) - } val parquetReader = if (enableVectorizedReader) { val vectorizedReader = new VectorizedParquet
spark git commit: Revert "[SPARK-15639][SQL] Try to push down filter at RowGroups level for parquet reader"
Repository: spark Updated Branches: refs/heads/master 99f3c8277 -> 8e7b56f3d Revert "[SPARK-15639][SQL] Try to push down filter at RowGroups level for parquet reader" This reverts commit bba5d7999f7b3ae9d816ea552ba9378fea1615a6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e7b56f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e7b56f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e7b56f3 Branch: refs/heads/master Commit: 8e7b56f3d4917692d3ff44d91aa264738a6fc2ed Parents: 99f3c82 Author: Cheng Lian <l...@databricks.com> Authored: Fri Jun 10 20:41:48 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 20:41:48 2016 -0700 -- .../catalyst/expressions/namedExpressions.scala | 8 --- .../datasources/FileSourceStrategy.scala| 9 +-- .../datasources/parquet/ParquetFileFormat.scala | 61 ++-- 3 files changed, 57 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e7b56f3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index c06a1ea..306a99d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -292,14 +292,6 @@ case class AttributeReference( } } - def withMetadata(newMetadata: Metadata): AttributeReference = { -if (metadata == newMetadata) { - this -} else { - AttributeReference(name, dataType, nullable, newMetadata)(exprId, qualifier, isGenerated) -} - } - override protected final def otherCopyArgs: Seq[AnyRef] = { exprId :: qualifier :: isGenerated :: Nil } 
http://git-wip-us.apache.org/repos/asf/spark/blob/8e7b56f3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 7fc842f..13a86bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -84,14 +84,7 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { logInfo(s"Pruning directories with: ${partitionKeyFilters.mkString(",")}") val dataColumns = -l.resolve(files.dataSchema, files.sparkSession.sessionState.analyzer.resolver).map { c => - files.dataSchema.find(_.name == c.name).map { f => -c match { - case a: AttributeReference => a.withMetadata(f.metadata) - case _ => c -} - }.getOrElse(c) -} +l.resolve(files.dataSchema, files.sparkSession.sessionState.analyzer.resolver) // Partition keys are not available in the statistics of the files. 
val dataFilters = normalizedFilters.filter(_.references.intersect(partitionSet).isEmpty) http://git-wip-us.apache.org/repos/asf/spark/blob/8e7b56f3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index bc4a9de..3735c94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -357,11 +357,6 @@ private[sql] class ParquetFileFormat val hadoopAttemptContext = new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) - // Try to push down filters when filter push-down is enabled. - // Notice: This push-down is RowGroups level, not individual records. - pushed.foreach { - ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, _) - } val parquetReader = if (enableVectorizedReader) { val vectorizedReader = new VectorizedParquet
spark git commit: [SPARK-15884][SPARKR][SQL] Overriding stringArgs in MapPartitionsInR
Repository: spark Updated Branches: refs/heads/branch-2.0 f41f433b1 -> 0a450cfff [SPARK-15884][SPARKR][SQL] Overriding stringArgs in MapPartitionsInR ## What changes were proposed in this pull request? As discussed in https://github.com/apache/spark/pull/12836 we need to override stringArgs method in MapPartitionsInR in order to avoid too large strings generated by "stringArgs" method based on the input arguments. In this case exclude some of the input arguments: serialized R objects. ## How was this patch tested? Existing test cases Author: Narine Kokhlikyan <narine.kokhlik...@gmail.com> Closes #13610 from NarineK/dapply_MapPartitionsInR_stringArgs. (cherry picked from commit 54f758b5fc60ecb0da6b191939a72ef5829be38c) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a450cff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a450cff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a450cff Branch: refs/heads/branch-2.0 Commit: 0a450cfffada67f841795a09af3bf6320343b358 Parents: f41f433 Author: Narine Kokhlikyan <narine.kokhlik...@gmail.com> Authored: Fri Jun 10 17:17:47 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 17:17:57 2016 -0700 -- .../org/apache/spark/sql/catalyst/plans/logical/object.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a450cff/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index 55d8adf..78e8822 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -141,6 +141,9 @@ case class 
MapPartitionsInR( outputObjAttr: Attribute, child: LogicalPlan) extends ObjectConsumer with ObjectProducer { override lazy val schema = outputSchema + + override protected def stringArgs: Iterator[Any] = Iterator(inputSchema, outputSchema, +outputObjAttr, child) } object MapElements { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15884][SPARKR][SQL] Overriding stringArgs in MapPartitionsInR
Repository: spark Updated Branches: refs/heads/master 2022afe57 -> 54f758b5f [SPARK-15884][SPARKR][SQL] Overriding stringArgs in MapPartitionsInR ## What changes were proposed in this pull request? As discussed in https://github.com/apache/spark/pull/12836 we need to override stringArgs method in MapPartitionsInR in order to avoid too large strings generated by "stringArgs" method based on the input arguments. In this case exclude some of the input arguments: serialized R objects. ## How was this patch tested? Existing test cases Author: Narine Kokhlikyan <narine.kokhlik...@gmail.com> Closes #13610 from NarineK/dapply_MapPartitionsInR_stringArgs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54f758b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54f758b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54f758b5 Branch: refs/heads/master Commit: 54f758b5fc60ecb0da6b191939a72ef5829be38c Parents: 2022afe Author: Narine Kokhlikyan <narine.kokhlik...@gmail.com> Authored: Fri Jun 10 17:17:47 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 17:17:47 2016 -0700 -- .../org/apache/spark/sql/catalyst/plans/logical/object.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54f758b5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index 55d8adf..78e8822 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -141,6 +141,9 @@ case class MapPartitionsInR( outputObjAttr: Attribute, child: LogicalPlan) extends ObjectConsumer with ObjectProducer { override lazy val 
schema = outputSchema + + override protected def stringArgs: Iterator[Any] = Iterator(inputSchema, outputSchema, +outputObjAttr, child) } object MapElements { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15753][SQL] Move Analyzer stuff to Analyzer from DataFrameWriter
Repository: spark Updated Branches: refs/heads/branch-2.0 47c2a265f -> 55a837246 [SPARK-15753][SQL] Move Analyzer stuff to Analyzer from DataFrameWriter ## What changes were proposed in this pull request? This patch moves some codes in `DataFrameWriter.insertInto` that belongs to `Analyzer`. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Closes #13496 from viirya/move-analyzer-stuff. (cherry picked from commit 0ec279ffdf92853965e327a9f0f6956cacb7a23e) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55a83724 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55a83724 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55a83724 Branch: refs/heads/branch-2.0 Commit: 55a83724632aa54e49aedbab8ddd21d010eca26d Parents: 47c2a26 Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Authored: Fri Jun 10 11:05:04 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 11:05:14 2016 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 17 ++--- .../org/apache/spark/sql/DataFrameWriter.scala | 12 +--- .../spark/sql/hive/execution/HiveQuerySuite.scala | 4 ++-- 3 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/55a83724/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 4446140..a081357 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -452,6 +452,17 @@ class Analyzer( def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, 
_, _) if child.resolved => +// A partitioned relation's schema can be different from the input logicalPlan, since +// partition columns are all moved after data columns. We Project to adjust the ordering. +val input = if (parts.nonEmpty) { + val (inputPartCols, inputDataCols) = child.output.partition { attr => +parts.contains(attr.name) + } + Project(inputDataCols ++ inputPartCols, child) +} else { + child +} + val table = lookupTableFromCatalog(u) // adding the table's partitions or validate the query's partition info table match { @@ -467,8 +478,8 @@ class Analyzer( |Requested partitions: ${parts.keys.mkString(",")} |Table partitions: ${tablePartitionNames.mkString(",")}""".stripMargin) } - // Assume partition columns are correctly placed at the end of the child's output - i.copy(table = EliminateSubqueryAliases(table)) + // Partition columns are already correctly placed at the end of the child's output + i.copy(table = EliminateSubqueryAliases(table), child = input) } else { // Set up the table's partition scheme with all dynamic partitions by moving partition // columns to the end of the column list, in partition order. 
@@ -486,7 +497,7 @@ class Analyzer( child = Project(columns ++ partColumns, child)) } case _ => -i.copy(table = EliminateSubqueryAliases(table)) +i.copy(table = EliminateSubqueryAliases(table), child = input) } case u: UnresolvedRelation => val table = u.tableIdentifier http://git-wip-us.apache.org/repos/asf/spark/blob/55a83724/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 32e2fdc..6ce59e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -505,21 +505,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val partitions = normalizedParCols.map(_.map(col => col -> (None: Option[String])).toMap) val overwrite = mode == SaveMode.Overwrite -// A partitioned relation's schema can be different from the input logicalPlan, since -// partit
spark git commit: [SPARK-15753][SQL] Move Analyzer stuff to Analyzer from DataFrameWriter
Repository: spark Updated Branches: refs/heads/master abdb5d42c -> 0ec279ffd [SPARK-15753][SQL] Move Analyzer stuff to Analyzer from DataFrameWriter ## What changes were proposed in this pull request? This patch moves some codes in `DataFrameWriter.insertInto` that belongs to `Analyzer`. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Closes #13496 from viirya/move-analyzer-stuff. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0ec279ff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0ec279ff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0ec279ff Branch: refs/heads/master Commit: 0ec279ffdf92853965e327a9f0f6956cacb7a23e Parents: abdb5d4 Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Authored: Fri Jun 10 11:05:04 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Fri Jun 10 11:05:04 2016 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 17 ++--- .../org/apache/spark/sql/DataFrameWriter.scala | 12 +--- .../spark/sql/hive/execution/HiveQuerySuite.scala | 4 ++-- 3 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0ec279ff/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index d1ca99f..58f3904 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -452,6 +452,17 @@ class Analyzer( def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) if child.resolved => +// A partitioned relation's schema can be different from the input logicalPlan, since +// 
partition columns are all moved after data columns. We Project to adjust the ordering. +val input = if (parts.nonEmpty) { + val (inputPartCols, inputDataCols) = child.output.partition { attr => +parts.contains(attr.name) + } + Project(inputDataCols ++ inputPartCols, child) +} else { + child +} + val table = lookupTableFromCatalog(u) // adding the table's partitions or validate the query's partition info table match { @@ -467,8 +478,8 @@ class Analyzer( |Requested partitions: ${parts.keys.mkString(",")} |Table partitions: ${tablePartitionNames.mkString(",")}""".stripMargin) } - // Assume partition columns are correctly placed at the end of the child's output - i.copy(table = EliminateSubqueryAliases(table)) + // Partition columns are already correctly placed at the end of the child's output + i.copy(table = EliminateSubqueryAliases(table), child = input) } else { // Set up the table's partition scheme with all dynamic partitions by moving partition // columns to the end of the column list, in partition order. 
@@ -486,7 +497,7 @@ class Analyzer( child = Project(columns ++ partColumns, child)) } case _ => -i.copy(table = EliminateSubqueryAliases(table)) +i.copy(table = EliminateSubqueryAliases(table), child = input) } case u: UnresolvedRelation => val table = u.tableIdentifier http://git-wip-us.apache.org/repos/asf/spark/blob/0ec279ff/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 32e2fdc..6ce59e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -505,21 +505,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val partitions = normalizedParCols.map(_.map(col => col -> (None: Option[String])).toMap) val overwrite = mode == SaveMode.Overwrite -// A partitioned relation's schema can be different from the input logicalPlan, since -// partition columns are all moved after data columns. We Project to adjust the ordering. -// TODO: this belongs to the analyzer. -
spark git commit: [SPARK-15792][SQL] Allows operator to change the verbosity in explain output
Repository: spark Updated Branches: refs/heads/branch-2.0 a5bec5b81 -> 57dd4efcd [SPARK-15792][SQL] Allows operator to change the verbosity in explain output ## What changes were proposed in this pull request? This PR allows customization of verbosity in explain output. After change, `dataframe.explain()` and `dataframe.explain(true)` has different verbosity output for physical plan. Currently, this PR only enables verbosity string for operator `HashAggregateExec` and `SortAggregateExec`. We will gradually enable verbosity string for more operators in future. **Less verbose mode:** dataframe.explain(extended = false) `output=[count(a)#85L]` is **NOT** displayed for HashAggregate. ``` scala> Seq((1,2,3)).toDF("a", "b", "c").createTempView("df2") scala> spark.sql("select count(a) from df2").explain() == Physical Plan == *HashAggregate(key=[], functions=[count(1)]) +- Exchange SinglePartition +- *HashAggregate(key=[], functions=[partial_count(1)]) +- LocalTableScan ``` **Verbose mode:** dataframe.explain(extended = true) `output=[count(a)#85L]` is displayed for HashAggregate. ``` scala> spark.sql("select count(a) from df2").explain(true) // "output=[count(a)#85L]" is added ... == Physical Plan == *HashAggregate(key=[], functions=[count(1)], output=[count(a)#85L]) +- Exchange SinglePartition +- *HashAggregate(key=[], functions=[partial_count(1)], output=[count#87L]) +- LocalTableScan ``` ## How was this patch tested? Manual test. Author: Sean Zhong <seanzh...@databricks.com> Closes #13535 from clockfly/verbose_breakdown_2. 
(cherry picked from commit 5f731d6859c4516941e5f90c99c966ef76268864) Signed-off-by: Cheng Lian <l...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57dd4efc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57dd4efc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57dd4efc Branch: refs/heads/branch-2.0 Commit: 57dd4efcda9158646df41ea8d70754dc110ecd6f Parents: a5bec5b Author: Sean Zhong <seanzh...@databricks.com> Authored: Mon Jun 6 22:59:25 2016 -0700 Committer: Cheng Lian <l...@databricks.com> Committed: Mon Jun 6 22:59:34 2016 -0700 -- .../sql/catalyst/expressions/Expression.scala | 4 .../spark/sql/catalyst/plans/QueryPlan.scala| 2 ++ .../spark/sql/catalyst/trees/TreeNode.scala | 23 +++- .../spark/sql/execution/QueryExecution.scala| 14 +++- .../sql/execution/WholeStageCodegenExec.scala | 6 +++-- .../execution/aggregate/HashAggregateExec.scala | 12 -- .../execution/aggregate/SortAggregateExec.scala | 12 -- 7 files changed, 55 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57dd4efc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 2ec4621..efe592d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -190,6 +190,10 @@ abstract class Expression extends TreeNode[Expression] { case single => single :: Nil } + // Marks this as final, Expression.verboseString should never be called, and thus shouldn't be + // overridden by concrete classes. 
+ final override def verboseString: String = simpleString + override def simpleString: String = toString override def toString: String = prettyName + flatArguments.mkString("(", ", ", ")") http://git-wip-us.apache.org/repos/asf/spark/blob/57dd4efc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 19a66cf..cf34f4b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -257,6 +257,8 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT override def simpleString: String = statePrefix + super.simpleString + override def verboseString: String = simpleString + /** * All the subqueries of current plan.