spark git commit: [SPARK-25019][BUILD] Fix orc dependency to use the same exclusion rules
Repository: spark
Updated Branches:
  refs/heads/master 51e2b38d9 -> 278984d5a

[SPARK-25019][BUILD] Fix orc dependency to use the same exclusion rules

## What changes were proposed in this pull request?

While upgrading Apache ORC to 1.5.2 ([SPARK-24576](https://issues.apache.org/jira/browse/SPARK-24576)), the `sql/core` module overrode the exclusion rules of the parent pom file, which causes the published `spark-sql_2.1X` artifacts to have incomplete exclusion rules ([SPARK-25019](https://issues.apache.org/jira/browse/SPARK-25019)). This PR fixes it by moving the newly added exclusion rule to the parent pom. It also fixes the sbt build hack introduced at that time.

## How was this patch tested?

Pass the existing dependency check and the tests.

Author: Dongjoon Hyun

Closes #22003 from dongjoon-hyun/SPARK-25019.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/278984d5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/278984d5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/278984d5
Branch: refs/heads/master
Commit: 278984d5a5e56136c9f940f2d0e3d2040fad180b
Parents: 51e2b38
Author: Dongjoon Hyun
Authored: Mon Aug 6 12:00:39 2018 -0700
Committer: Yin Huai
Committed: Mon Aug 6 12:00:39 2018 -0700

--
 pom.xml          |  4
 sql/core/pom.xml | 28
 2 files changed, 4 insertions(+), 28 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/278984d5/pom.xml
--
diff --git a/pom.xml b/pom.xml
index c46eb31..8abdb70 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1744,6 +1744,10 @@
 hadoop-common
+org.apache.hadoop
+hadoop-hdfs
+
+
 org.apache.hive
 hive-storage-api

http://git-wip-us.apache.org/repos/asf/spark/blob/278984d5/sql/core/pom.xml
--
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 68b42a4..ba17f5f 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -90,39 +90,11 @@
 org.apache.orc
 orc-core
 ${orc.classifier}
-
-
- org.apache.hadoop
- hadoop-hdfs
-
-
-
- org.apache.hive
- hive-storage-api
-
 org.apache.orc
 orc-mapreduce
 ${orc.classifier}
-
-
- org.apache.hadoop
- hadoop-hdfs
-
-
-
- org.apache.hive
- hive-storage-api
-
 org.apache.parquet
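For readers who follow the sbt side of the build rather than Maven, the exclusion rules being centralized above translate roughly into per-dependency excludes like the sketch below. This is an illustrative build.sbt fragment, not Spark's actual build definition; the version string and the `nohive` classifier are assumptions read off the diff.

```scala
// Hypothetical build.sbt sketch of the same exclusion rules the parent
// pom.xml now applies to the ORC artifacts: hadoop-hdfs and hive-storage-api
// are kept out of the transitive dependency graph of the published module.
val orcVersion = "1.5.2"  // assumed; see SPARK-24576

libraryDependencies ++= Seq(
  ("org.apache.orc" % "orc-core" % orcVersion classifier "nohive")
    .exclude("org.apache.hadoop", "hadoop-hdfs")
    .exclude("org.apache.hive", "hive-storage-api"),
  ("org.apache.orc" % "orc-mapreduce" % orcVersion classifier "nohive")
    .exclude("org.apache.hadoop", "hadoop-hdfs")
    .exclude("org.apache.hive", "hive-storage-api")
)
```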
spark git commit: [SPARK-24895] Remove spotbugs plugin
Repository: spark
Updated Branches:
  refs/heads/master d4a277f0c -> fc21f192a

[SPARK-24895] Remove spotbugs plugin

## What changes were proposed in this pull request?

The spotbugs Maven plugin was added recently, just before the 2.4.0 snapshot artifacts broke. To ensure it does not interfere with the maven deploy plugin, this change removes it.

## How was this patch tested?

A local build was run, but this patch will actually be tested by monitoring the Apache repo artifacts and making sure the metadata is uploaded correctly after this job runs: https://amplab.cs.berkeley.edu/jenkins/view/Spark%20Packaging/job/spark-master-maven-snapshots/

Author: Eric Chang

Closes #21865 from ericfchang/SPARK-24895.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc21f192
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc21f192
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc21f192
Branch: refs/heads/master
Commit: fc21f192a302e48e5c321852e2a25639c5a182b5
Parents: d4a277f
Author: Eric Chang
Authored: Tue Jul 24 15:53:50 2018 -0700
Committer: Yin Huai
Committed: Tue Jul 24 15:53:50 2018 -0700

--
 pom.xml | 22
 1 file changed, 22 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/fc21f192/pom.xml
--
diff --git a/pom.xml b/pom.xml
index 81a53ee..d75db0f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2610,28 +2610,6 @@
-
-com.github.spotbugs
-spotbugs-maven-plugin
-3.1.3
-
- ${basedir}/target/scala-${scala.binary.version}/classes
- ${basedir}/target/scala-${scala.binary.version}/test-classes
- Max
- Low
- true
- FindPuzzlers
- true
-
-
-
- check
-
-compile
-
-
-
svn commit: r25568 - /release/spark/KEYS
Author: yhuai Date: Wed Mar 7 17:53:32 2018 New Revision: 25568 Log: Update KEYS for Sameer Agarwal Modified: release/spark/KEYS Modified: release/spark/KEYS == --- release/spark/KEYS (original) +++ release/spark/KEYS Wed Mar 7 17:53:32 2018 @@ -460,3 +460,61 @@ dcqbOYBLINwxIMZA6N9qCGrST4DfqbAzGSvZ08oe =et2/ -END PGP PUBLIC KEY BLOCK- +pub rsa4096 2018-01-17 [SC] + F2C64242EC1BEC69EA8FBE35DCE4BFD807461E96 +uid [ultimate] Sameer Agarwal (CODE SIGNING KEY) +sub rsa4096 2018-01-17 [E] + +-BEGIN PGP PUBLIC KEY BLOCK- + +mQINBFpftRMBEADEsiDSnSg7EBdFoWdRhVrjePjsYyEq4Sxt61vkkwhrH/pZ8r07 +4kVSZV0hdc+7PLa27X400re6OgULDtQ7c3F1hcrcl72VLNo7iE5FcQITSRvXXsf0 +Lb6eHmkUjCrZW8FF5WLdr/XA/aC2YpuXYszCWH3f7It9864M8OjzKznGfR/Q+9kd +jq2l2d1gLhdMnBwOjxMlyDvU3N3wr1bGNf/s7QAltv5V3yNTPvH9I+iy9FbTuseE +vnMo3KnopEivmF0yqz2qlN3joVg7yAcMPWG92lRQzkUAkrQXcPvcsEvu22kipcOQ +SQQMcMQZFQh8E/dLzp4+DA2bRcshHnM5bWG9NZNMnXKRmcJrHmjJDstEN7LR+zwt +cRj9d0RwCFtS7M9YUX4eCc9Dqgtgg31GVNUZdUcZ1/OHqv+NJUOSZipoKJmAfcBN +OyEGhlWOGidd/3xJtK1GUtTd9iLqjcbcxHapeTOS3kNdXbAwuvX1ADkQ+CTYw5cd +jx2CAEKsBCz1r++/sApRPLIWSRBaGoF2HgGv89/33R66EVSmNhGkS3g6W6ICqrdY +vwhK92NJpapQFwhzk4U3ZrcRwXXktv7PlMFywuSXNbOT7XwkrGOUYqzzi7esV4uF +TDllNmwuVG7q3K7cvGDn69mbgYH8vULzEfuZQYhT9zYPaRePKaILqWLf6wARAQAB +tDdTYW1lZXIgQWdhcndhbCAoQ09ERSBTSUdOSU5HIEtFWSkgPHNhbWVlcmFnQGFw +YWNoZS5vcmc+iQJOBBMBCAA4FiEE8sZCQuwb7Gnqj7413OS/2AdGHpYFAlpftRMC +GwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AACgkQ3OS/2AdGHpYqtg/+IrcrH66c +8A6+LurGr0ZDxQzI3Ka016UOkruLGI4oitqyzgJ/j6quGTxLNEcBToeh8IUqQDN0 +VriV9iPntIUarf9b6Yx6aCxSvBwls9k9PMZqWVu0oIAecWGvvniGooxJlrelpp0M +PJaEPHswH80d8rBDGjktBOrQIq8bak7jLomsFK1zGH6pPkAL9GYo4XK2Ik5OiRs3 +H8bJA/FS4sx17GR0IBWumBvYXtHvAmvfwIEeGtcE+cPj/S438N+fwuXI82c6EGIH +ubFM7uqylbZMlmDgdKkG6YmEQMqK0Ka84iLzUOzqFyOj/aTrKj9GKLc8bBVLU1DP +/PfMmJQDiETJGwwcKhRm9tYYH1DiMhWp5j1jyhOKIEKGUVJ8IxgpAkFURyOQaA4e +5rnPoC65Pp1JzTKXWqmjDm7MRgcP77WqWis7SDgMq56/tdCbjZ2WzyfBQCUlfKU3 +7Iax5qKtdoczZRYhdZGzT8d2pMvQVu9zGuwhiPU/nwFybY1haneZhWpXTKbJkNpc +Gzi2gE7pqXasjA+fn40tuMa4WZlrlvNhTONatcfVuNv1hGS/G+UJjhJzOo40AX2w +2TCmaj4jiwiqByc4QZKM/iGfVCN6GlOI3+1O1KzybqoQG2Tg/ug5unmAvc23ZYw7 +uu+BnBSTsCODqQG8fPRiDlYRdZtDyQQC8M25Ag0EWl+1EwEQAJ82cuI/R4StkgBX +zn7loZmSRZUx08EgsB8vq0s1h8g/pLdBN1h22sj9dnfcW4tFUxIKiwpLK84/Rlj7 +o2W8ZynpaKzR6pelV6Cb3+SMgtWe6DQnKaBRKJ3hzdcdA7Fp6aIjuzMsakOEOx3V +wmtHkCn5MgN/xQBAB3T65thTOFryYqcmEoKWkd5FegJwG4sjHCCARPjgv8ucY/Vs +6lZ0cxOB6qMO0jxH+FSMCZ4xmy7gpvQSs7D0/aj73kJ0Xv1sPZYxacf+P9MnF8jr +mI7jKODvtKNbffRzIK/c2YCcYHvb0PtkLN8hhsmtXcmm4ezQwqA1QZWJhtI7oiCX +A7AYrDKqsLPY4sgzeIzVmz35P/Y0baFp6Qt2eiHQ58I3Eu2+PG6x897So5j6obKi +FEfprFKOewjefPmt+yNxhXITXUAuw57uXR7PeIcIb6bynZjyUcK+Rr8+vfI1JPaS +ZVFaUn6KNFueK/bxDo4dzHMdj4gF9kGE+hPNRGepO7ba90QeaZSA6Bk3EUhovu8H +eMmN/ZsdgMwIHOO3JZ9aWV7wkak7df6qbNVGDhp/QycBAm6J/iG2xYfncYp9nyw8 +UAkrht5EMAdG14Qm3Vq9GGihUsthl2ehPeD37d2/pitTMfnf2Ac6TieHbye0JgL0 +wC3WvL7cLXGmvtIRfXzNd4oDmjGtABEBAAGJAjYEGAEIACAWIQTyxkJC7BvsaeqP +vjXc5L/YB0YelgUCWl+1EwIbDAAKCRDc5L/YB0YelrVgEACjcrAN9bY+Kv8eNcn0 +TpRRKs6uoJc7DvWt7yYp3czbpTx92vo8zA7pnVBbzdwAkuBrdsfoBQMo5xRx066s +b8ZOxIFf9TcDRJOd8jZI89PcqBLPyBC+Jp1KgAe65Dg5Qev6yYgrJgEFKDDO0cHA +1LPpAz4xoRhjLyyPvvIhfxz0qtXw1P4i2E4M+wKg/dNebln5HnjgVFsxQhKLmqUl +QEup4GYUzGWouzlmcWGnwUy/oDuXswYrGkvH5ZkZNgyjYhSTYp/M4TVRs9BAxcPg +5SN+p9+9IxtmOLihoYh9am42nISoQzKXV64E5aB9rk4ux3VJez36KKQ1PJO057SZ +U//XI9kLTZgOZUmQFDI447AFti8h5AQKU1E6Iu4WrdnIjfRrZRqc2/xDqYEEWInM +Y7SkpXNZdAquT+HuiB182hdcUXPL7x2TQBwNwzxNKXZ69k+Rj15jJA+l2ENY7VAQ +FqbkvJjyGA+EudPjLmUEpROr/yewMwy3vfzXsO+sYv9aoXBW/B5e2tiGUxCm9e6W 
+WXc0h8oAa0aIwDEB50VjUMhS0mEaMbI5hdWvysVLXa6UtIIQCNAQefmuAhD4zZ/X +mo2VPaqhXaUmI8jEljMVgtzFZSAI+lSJO+Iv4Y4zeWSVE9KQJqquxqNaSIHh+LnJ +BqxvTcOR/5gnf89L6zOkcUE5Ig== +=UnR0 +-END PGP PUBLIC KEY BLOCK- + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r25324 - /dev/spark/v2.3.0-rc5-bin/ /release/spark/spark-2.3.0/
Author: yhuai
Date: Wed Feb 28 07:25:53 2018
New Revision: 25324

Log:
Releasing Apache Spark 2.3.0

Added:
    release/spark/spark-2.3.0/
      - copied from r25323, dev/spark/v2.3.0-rc5-bin/
Removed:
    dev/spark/v2.3.0-rc5-bin/
[3/3] spark-website git commit: Add the news about spark-summit-eu-2017 agenda
Add the news about spark-summit-eu-2017 agenda Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/35eb1471 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/35eb1471 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/35eb1471 Branch: refs/heads/asf-site Commit: 35eb1471704a97c18e96b46f2495a7117565466d Parents: cca972e Author: Yin Huai Authored: Mon Aug 28 22:40:10 2017 + Committer: Yin Huai Committed: Mon Aug 28 15:54:26 2017 -0700 -- ...-08-28-spark-summit-eu-2017-agenda-posted.md | 17 ++ site/committers.html| 6 +- site/community.html | 6 +- site/contributing.html | 6 +- site/developer-tools.html | 6 +- site/documentation.html | 6 +- site/downloads.html | 6 +- site/examples.html | 6 +- site/faq.html | 6 +- site/graphx/index.html | 6 +- site/improvement-proposals.html | 6 +- site/index.html | 6 +- site/mailing-lists.html | 6 +- site/mllib/index.html | 6 +- site/news/amp-camp-2013-registration-ope.html | 6 +- .../news/announcing-the-first-spark-summit.html | 6 +- .../news/fourth-spark-screencast-published.html | 6 +- site/news/index.html| 16 +- site/news/nsdi-paper.html | 6 +- site/news/one-month-to-spark-summit-2015.html | 6 +- .../proposals-open-for-spark-summit-east.html | 6 +- ...registration-open-for-spark-summit-east.html | 6 +- .../news/run-spark-and-shark-on-amazon-emr.html | 6 +- site/news/spark-0-6-1-and-0-5-2-released.html | 6 +- site/news/spark-0-6-2-released.html | 6 +- site/news/spark-0-7-0-released.html | 6 +- site/news/spark-0-7-2-released.html | 6 +- site/news/spark-0-7-3-released.html | 6 +- site/news/spark-0-8-0-released.html | 6 +- site/news/spark-0-8-1-released.html | 6 +- site/news/spark-0-9-0-released.html | 6 +- site/news/spark-0-9-1-released.html | 6 +- site/news/spark-0-9-2-released.html | 6 +- site/news/spark-1-0-0-released.html | 6 +- site/news/spark-1-0-1-released.html | 6 +- site/news/spark-1-0-2-released.html | 6 +- site/news/spark-1-1-0-released.html | 6 +- site/news/spark-1-1-1-released.html | 6 +- site/news/spark-1-2-0-released.html | 6 +- site/news/spark-1-2-1-released.html | 6 +- site/news/spark-1-2-2-released.html | 6 +- site/news/spark-1-3-0-released.html | 6 +- site/news/spark-1-4-0-released.html | 6 +- site/news/spark-1-4-1-released.html | 6 +- site/news/spark-1-5-0-released.html | 6 +- site/news/spark-1-5-1-released.html | 6 +- site/news/spark-1-5-2-released.html | 6 +- site/news/spark-1-6-0-released.html | 6 +- site/news/spark-1-6-1-released.html | 6 +- site/news/spark-1-6-2-released.html | 6 +- site/news/spark-1-6-3-released.html | 6 +- site/news/spark-2-0-0-released.html | 6 +- site/news/spark-2-0-1-released.html | 6 +- site/news/spark-2-0-2-released.html | 6 +- site/news/spark-2-1-0-released.html | 6 +- site/news/spark-2-1-1-released.html | 6 +- site/news/spark-2-2-0-released.html | 6 +- site/news/spark-2.0.0-preview.html | 6 +- .../spark-accepted-into-apache-incubator.html | 6 +- site/news/spark-and-shark-in-the-news.html | 6 +- site/news/spark-becomes-tlp.html| 6 +- site/news/spark-featured-in-wired.html | 6 +- .../spark-mailing-lists-moving-to-apache.html | 6 +- site/news/spark-meetups.html| 6 +- site/news/spark-screencasts-published.html | 6 +- site/news/spark-summit-2013-is-a-wrap.html | 6 +- site/news/spark-summit-2014-videos-posted.html | 6 +- site/news/spark-summit-2015-videos-posted.html | 6 +- site/news/spark-summit-agenda-posted.html | 6 +- .../spark-summit-east-2015-videos-posted.html | 6 +- 
.../spark-summit-east-2016-cfp-closing.html | 6 +- .../spark-summit-east-2017-agenda-posted.html | 6 +- site/news/spark-summit-east-agenda-posted.html | 6 +- .../spark-summit-eu-2017-agenda-posted.html | 223 +++ .../news/spark-summit-europe-agenda-posted.html | 6 +- sit
[1/3] spark-website git commit: Add the news about spark-summit-eu-2017 agenda
Repository: spark-website Updated Branches: refs/heads/asf-site cca972e7f -> 35eb14717 http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-3-0.html -- diff --git a/site/releases/spark-release-1-3-0.html b/site/releases/spark-release-1-3-0.html index 10d934b..5e4d302 100644 --- a/site/releases/spark-release-1-3-0.html +++ b/site/releases/spark-release-1-3-0.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-3-1.html -- diff --git a/site/releases/spark-release-1-3-1.html b/site/releases/spark-release-1-3-1.html index 7df8028..116898f 100644 --- a/site/releases/spark-release-1-3-1.html +++ b/site/releases/spark-release-1-3-1.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-4-0.html -- diff --git a/site/releases/spark-release-1-4-0.html b/site/releases/spark-release-1-4-0.html index 143cc17..b75a496 100644 --- a/site/releases/spark-release-1-4-0.html +++ b/site/releases/spark-release-1-4-0.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-4-1.html -- diff --git a/site/releases/spark-release-1-4-1.html b/site/releases/spark-release-1-4-1.html index ccdd161..30b92fd 100644 --- a/site/releases/spark-release-1-4-1.html +++ b/site/releases/spark-release-1-4-1.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-5-0.html -- diff --git a/site/releases/spark-release-1-5-0.html b/site/releases/spark-release-1-5-0.html index f73ab5d..6e1411d 100644 --- a/site/releases/spark-release-1-5-0.html +++ b/site/releases/spark-release-1-5-0.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/releases/spark-release-1-5-1.html -- diff --git 
a/site/releases/spark-release-1-5-1.html b/site/releases/spark-release-1-5-1.html index 3af892e..b447dd7 100644 --- a/site/releases/spark-release-1-5-1.html +++ b/site/releases/spark-release-1-5
[2/3] spark-website git commit: Add the news about spark-summit-eu-2017 agenda
http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-accepted-into-apache-incubator.html -- diff --git a/site/news/spark-accepted-into-apache-incubator.html b/site/news/spark-accepted-into-apache-incubator.html index 62638f2..a4a913f 100644 --- a/site/news/spark-accepted-into-apache-incubator.html +++ b/site/news/spark-accepted-into-apache-incubator.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-and-shark-in-the-news.html -- diff --git a/site/news/spark-and-shark-in-the-news.html b/site/news/spark-and-shark-in-the-news.html index 4a0c4fc..55d2ade 100644 --- a/site/news/spark-and-shark-in-the-news.html +++ b/site/news/spark-and-shark-in-the-news.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-becomes-tlp.html -- diff --git a/site/news/spark-becomes-tlp.html b/site/news/spark-becomes-tlp.html index 6c76d20..0f17857 100644 --- a/site/news/spark-becomes-tlp.html +++ b/site/news/spark-becomes-tlp.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-featured-in-wired.html -- diff --git a/site/news/spark-featured-in-wired.html b/site/news/spark-featured-in-wired.html index 1d35e40..1c0b69a 100644 --- a/site/news/spark-featured-in-wired.html +++ b/site/news/spark-featured-in-wired.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-mailing-lists-moving-to-apache.html -- diff --git a/site/news/spark-mailing-lists-moving-to-apache.html b/site/news/spark-mailing-lists-moving-to-apache.html index b586b65..4e12162 100644 --- a/site/news/spark-mailing-lists-moving-to-apache.html +++ b/site/news/spark-mailing-lists-moving-to-apache.html @@ -161,6 +161,9 @@ Latest News + Spark Summit Europe (October 24-26th, 2017, Dublin, Ireland) agenda posted + (Aug 28, 2017) + Spark 2.2.0 released (Jul 11, 2017) @@ -170,9 +173,6 @@ Spark Summit (June 5-7th, 2017, San Francisco) agenda posted (Mar 31, 2017) - Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted - (Jan 04, 2017) - Archive 
http://git-wip-us.apache.org/repos/asf/spark-website/blob/35eb1471/site/news/spark-meetups.html -- diff --git a/site/news/spark-meetups.html b/site/news/spark-meetups.html index 4de6525..92da537 100644 --- a/site/news/spark-meetups.html +++ b/site/news/spark-meetups.html @@ -161,6 +161,
spark git commit: [SPARK-21111][TEST][2.2] Fix the test failure of describe.sql
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 76ee41fd7 -> a585c870a

[SPARK-21111][TEST][2.2] Fix the test failure of describe.sql

## What changes were proposed in this pull request?

The test `describe.sql` failed. We need to fix the related bug, introduced in https://github.com/apache/spark/pull/17649, in a follow-up PR to master.

## How was this patch tested?

N/A

Author: gatorsmile

Closes #18316 from gatorsmile/fix.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a585c870
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a585c870
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a585c870
Branch: refs/heads/branch-2.2
Commit: a585c870a066fa94d97462cefbaa4057a7a0ed44
Parents: 76ee41f
Author: gatorsmile
Authored: Thu Jun 15 18:25:39 2017 -0700
Committer: Yin Huai
Committed: Thu Jun 15 18:25:39 2017 -0700

--
 sql/core/src/test/resources/sql-tests/results/describe.sql.out | 2 ++
 1 file changed, 2 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a585c870/sql/core/src/test/resources/sql-tests/results/describe.sql.out
--
diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
index 329532c..ab9f278 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
@@ -127,6 +127,7 @@
 Provider            parquet
 Num Buckets         2
 Bucket Columns      [`a`]
 Sort Columns        [`b`]
+Comment             table_comment
 Table Properties    [e=3]
 Location [not included in comparison]sql/core/spark-warehouse/t
 Storage Properties  [a=1, b=2]
@@ -157,6 +158,7 @@
 Provider            parquet
 Num Buckets         2
 Bucket Columns      [`a`]
 Sort Columns        [`b`]
+Comment             table_comment
 Table Properties    [e=3]
 Location [not included in comparison]sql/core/spark-warehouse/t
 Storage Properties  [a=1, b=2]
spark git commit: Revert "[SPARK-20946][SQL] simplify the config setting logic in SparkSession.getOrCreate"
Repository: spark Updated Branches: refs/heads/branch-2.2 6c628e75e -> b560c975b Revert "[SPARK-20946][SQL] simplify the config setting logic in SparkSession.getOrCreate" This reverts commit e11d90bf8deb553fd41b8837e3856c11486c2503. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b560c975 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b560c975 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b560c975 Branch: refs/heads/branch-2.2 Commit: b560c975b7cdc8828fc9e27cbca740c5e550b9cd Parents: 6c628e7 Author: Yin Huai Authored: Fri Jun 2 15:36:21 2017 -0700 Committer: Yin Huai Committed: Fri Jun 2 15:37:38 2017 -0700 -- .../spark/ml/recommendation/ALSSuite.scala | 4 +++- .../apache/spark/ml/tree/impl/TreeTests.scala | 2 ++ .../org/apache/spark/sql/SparkSession.scala | 25 +--- 3 files changed, 21 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b560c975/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 23f2256..701040f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -820,13 +820,15 @@ class ALSCleanerSuite extends SparkFunSuite { FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet try { conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf) + val sc = new SparkContext("local[2]", "test", conf) try { sc.setCheckpointDir(checkpointDir.getAbsolutePath) // Generate test data val (training, _) = ALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0) // Implicitly test the cleaning of parents during ALS training val spark = SparkSession.builder + .master("local[2]") + .appName("ALSCleanerSuite") .sparkContext(sc) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/b560c975/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala index b6894b3..92a2369 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala @@ -43,6 +43,8 @@ private[ml] object TreeTests extends SparkFunSuite { categoricalFeatures: Map[Int, Int], numClasses: Int): DataFrame = { val spark = SparkSession.builder() + .master("local[2]") + .appName("TreeTests") .sparkContext(data.sparkContext) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/b560c975/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index bf37b76..d2bf350 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -757,8 +757,6 @@ object SparkSession { private[this] var userSuppliedContext: Option[SparkContext] = None -// The `SparkConf` inside the given `SparkContext` may get changed if you specify some options -// for this builder. 
private[spark] def sparkContext(sparkContext: SparkContext): Builder = synchronized { userSuppliedContext = Option(sparkContext) this @@ -856,7 +854,7 @@ object SparkSession { * * @since 2.2.0 */ -def withExtensions(f: SparkSessionExtensions => Unit): Builder = synchronized { +def withExtensions(f: SparkSessionExtensions => Unit): Builder = { f(extensions) this } @@ -901,14 +899,22 @@ object SparkSession { // No active nor global default session. Create a new one. val sparkContext = userSuppliedContext.getOrElse { + // set app name if not given + val randomAppName = java.util.UUID.randomUUID().toString val sparkConf = new SparkConf() - options.get("spark.master").foreach(sparkConf.setMaster) - // set a random app name if not given. - sparkConf.setAppNa
spark git commit: Revert "[SPARK-20946][SQL] simplify the config setting logic in SparkSession.getOrCreate"
Repository: spark Updated Branches: refs/heads/master 2a780ac7f -> 0eb1fc6cd Revert "[SPARK-20946][SQL] simplify the config setting logic in SparkSession.getOrCreate" This reverts commit e11d90bf8deb553fd41b8837e3856c11486c2503. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0eb1fc6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0eb1fc6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0eb1fc6c Branch: refs/heads/master Commit: 0eb1fc6cd512f19d94758643c512cd6db036aaab Parents: 2a780ac Author: Yin Huai Authored: Fri Jun 2 15:36:21 2017 -0700 Committer: Yin Huai Committed: Fri Jun 2 15:36:21 2017 -0700 -- .../spark/ml/recommendation/ALSSuite.scala | 4 +++- .../apache/spark/ml/tree/impl/TreeTests.scala | 2 ++ .../org/apache/spark/sql/SparkSession.scala | 25 +--- 3 files changed, 21 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0eb1fc6c/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 23f2256..701040f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -820,13 +820,15 @@ class ALSCleanerSuite extends SparkFunSuite { FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet try { conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf) + val sc = new SparkContext("local[2]", "test", conf) try { sc.setCheckpointDir(checkpointDir.getAbsolutePath) // Generate test data val (training, _) = ALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0) // Implicitly test the cleaning of parents during ALS training val spark = SparkSession.builder + .master("local[2]") + .appName("ALSCleanerSuite") .sparkContext(sc) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/0eb1fc6c/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala index b6894b3..92a2369 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala @@ -43,6 +43,8 @@ private[ml] object TreeTests extends SparkFunSuite { categoricalFeatures: Map[Int, Int], numClasses: Int): DataFrame = { val spark = SparkSession.builder() + .master("local[2]") + .appName("TreeTests") .sparkContext(data.sparkContext) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/0eb1fc6c/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index bf37b76..d2bf350 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -757,8 +757,6 @@ object SparkSession { private[this] var userSuppliedContext: Option[SparkContext] = None -// The `SparkConf` inside the given `SparkContext` may get changed if you specify some options -// for this builder. 
private[spark] def sparkContext(sparkContext: SparkContext): Builder = synchronized { userSuppliedContext = Option(sparkContext) this @@ -856,7 +854,7 @@ object SparkSession { * * @since 2.2.0 */ -def withExtensions(f: SparkSessionExtensions => Unit): Builder = synchronized { +def withExtensions(f: SparkSessionExtensions => Unit): Builder = { f(extensions) this } @@ -901,14 +899,22 @@ object SparkSession { // No active nor global default session. Create a new one. val sparkContext = userSuppliedContext.getOrElse { + // set app name if not given + val randomAppName = java.util.UUID.randomUUID().toString val sparkConf = new SparkConf() - options.get("spark.master").foreach(sparkConf.setMaster) - // set a random app name if not given. - sparkConf.setAppName(optio
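With the simplification reverted, the ML test suites above again pass `master` and `appName` explicitly to the builder even though they also hand it an existing `SparkContext`. For application code the `sparkContext(...)` hook is private to Spark anyway; the public pattern is simply the builder, as in this minimal sketch (master, app name, and the extra option are placeholders):

```scala
import org.apache.spark.sql.SparkSession

object SparkSessionBuilderSketch {
  def main(args: Array[String]): Unit = {
    // Public SparkSession construction: master and app name are set on the
    // builder itself rather than inferred from a pre-built SparkContext.
    val spark = SparkSession.builder()
      .master("local[2]")                    // placeholder master
      .appName("SparkSessionBuilderSketch")  // placeholder app name
      .config("spark.ui.enabled", "false")   // any extra option
      .getOrCreate()

    import spark.implicits._
    Seq((1, "a"), (2, "b")).toDF("id", "value").show()

    spark.stop()
  }
}
```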
spark git commit: Revert "[SPARK-20311][SQL] Support aliases for table value functions"
Repository: spark Updated Branches: refs/heads/branch-2.2 9e8d23b3a -> d191b962d Revert "[SPARK-20311][SQL] Support aliases for table value functions" This reverts commit 714811d0b5bcb5d47c39782ff74f898d276ecc59. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d191b962 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d191b962 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d191b962 Branch: refs/heads/branch-2.2 Commit: d191b962dc81c015fa92a38d882a8c7ea620ef06 Parents: 9e8d23b Author: Yin Huai Authored: Tue May 9 14:47:45 2017 -0700 Committer: Yin Huai Committed: Tue May 9 14:49:02 2017 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 20 ++ .../analysis/ResolveTableValuedFunctions.scala | 22 +++- .../sql/catalyst/analysis/unresolved.scala | 10 ++--- .../spark/sql/catalyst/parser/AstBuilder.scala | 17 --- .../sql/catalyst/analysis/AnalysisSuite.scala | 14 + .../sql/catalyst/parser/PlanParserSuite.scala | 13 +--- 6 files changed, 17 insertions(+), 79 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d191b962/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 15e4dd4..1ecb3d1 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -472,23 +472,15 @@ identifierComment ; relationPrimary -: tableIdentifier sample? (AS? strictIdentifier)? #tableName -| '(' queryNoWith ')' sample? (AS? strictIdentifier)? #aliasedQuery -| '(' relation ')' sample? (AS? strictIdentifier)? #aliasedRelation -| inlineTable #inlineTableDefault2 -| functionTable#tableValuedFunction +: tableIdentifier sample? (AS? strictIdentifier)? #tableName +| '(' queryNoWith ')' sample? (AS? strictIdentifier)? #aliasedQuery +| '(' relation ')' sample? (AS? strictIdentifier)? #aliasedRelation +| inlineTable #inlineTableDefault2 +| identifier '(' (expression (',' expression)*)? ')' #tableValuedFunction ; inlineTable -: VALUES expression (',' expression)* tableAlias -; - -functionTable -: identifier '(' (expression (',' expression)*)? ')' tableAlias -; - -tableAlias -: (AS? identifier identifierList?)? +: VALUES expression (',' expression)* (AS? identifier identifierList?)? 
; rowFormat http://git-wip-us.apache.org/repos/asf/spark/blob/d191b962/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala index dad1340..de6de24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.analysis import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} @@ -105,7 +105,7 @@ object ResolveTableValuedFunctions extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => - val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { + builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { @@ -125,21 +125,5 @@ object ResolveTableValuedFunctions extends Rule[LogicalPlan] { case _ => u.failAnalysi
spark git commit: Revert "[SPARK-20311][SQL] Support aliases for table value functions"
Repository: spark Updated Branches: refs/heads/master ac1ab6b9d -> f79aa285c Revert "[SPARK-20311][SQL] Support aliases for table value functions" This reverts commit 714811d0b5bcb5d47c39782ff74f898d276ecc59. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f79aa285 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f79aa285 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f79aa285 Branch: refs/heads/master Commit: f79aa285cf115963ba06a9cacb3dbd7e3cbf7728 Parents: ac1ab6b Author: Yin Huai Authored: Tue May 9 14:47:45 2017 -0700 Committer: Yin Huai Committed: Tue May 9 14:47:45 2017 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 20 ++ .../analysis/ResolveTableValuedFunctions.scala | 22 +++- .../sql/catalyst/analysis/unresolved.scala | 10 ++--- .../spark/sql/catalyst/parser/AstBuilder.scala | 17 --- .../sql/catalyst/analysis/AnalysisSuite.scala | 14 + .../sql/catalyst/parser/PlanParserSuite.scala | 13 +--- 6 files changed, 17 insertions(+), 79 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f79aa285/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 41daf58..14c511f 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -472,23 +472,15 @@ identifierComment ; relationPrimary -: tableIdentifier sample? (AS? strictIdentifier)? #tableName -| '(' queryNoWith ')' sample? (AS? strictIdentifier)? #aliasedQuery -| '(' relation ')' sample? (AS? strictIdentifier)? #aliasedRelation -| inlineTable #inlineTableDefault2 -| functionTable#tableValuedFunction +: tableIdentifier sample? (AS? strictIdentifier)? #tableName +| '(' queryNoWith ')' sample? (AS? strictIdentifier)? #aliasedQuery +| '(' relation ')' sample? (AS? strictIdentifier)? #aliasedRelation +| inlineTable #inlineTableDefault2 +| identifier '(' (expression (',' expression)*)? ')' #tableValuedFunction ; inlineTable -: VALUES expression (',' expression)* tableAlias -; - -functionTable -: identifier '(' (expression (',' expression)*)? ')' tableAlias -; - -tableAlias -: (AS? identifier identifierList?)? +: VALUES expression (',' expression)* (AS? identifier identifierList?)? 
; rowFormat http://git-wip-us.apache.org/repos/asf/spark/blob/f79aa285/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala index dad1340..de6de24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.analysis import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} @@ -105,7 +105,7 @@ object ResolveTableValuedFunctions extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => - val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { + builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { @@ -125,21 +125,5 @@ object ResolveTableValuedFunctions extends Rule[LogicalPlan] { case _ => u.failAnalysis(s"coul
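For context on what the reverted grammar covered: SPARK-20311 had let a table-valued function such as `range` take an alias with optional column names. The hypothetical spark-shell style snippet below shows the distinction; after this revert only the un-aliased form parses.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("tvf-alias").getOrCreate()

// Works with or without SPARK-20311: a plain table-valued function call.
spark.sql("SELECT * FROM range(3)").show()

// The syntax the reverted change had introduced: aliasing the function's
// output with column names. After the revert this is a parse error,
// so it is left commented out here.
// spark.sql("SELECT t.v FROM range(3) AS t(v)").show()
```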
spark git commit: [SPARK-20661][SPARKR][TEST] SparkR tableNames() test fails
Repository: spark Updated Branches: refs/heads/branch-2.2 23681e9ca -> 4179ffc03 [SPARK-20661][SPARKR][TEST] SparkR tableNames() test fails ## What changes were proposed in this pull request? Cleaning existing temp tables before running tableNames tests ## How was this patch tested? SparkR Unit tests Author: Hossein Closes #17903 from falaki/SPARK-20661. (cherry picked from commit 2abfee18b6511482b916c36f00bf3abf68a59e19) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4179ffc0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4179ffc0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4179ffc0 Branch: refs/heads/branch-2.2 Commit: 4179ffc031a0dbca6a93255c673de800ce7393fe Parents: 23681e9 Author: Hossein Authored: Mon May 8 14:48:11 2017 -0700 Committer: Yin Huai Committed: Mon May 8 14:48:29 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4179ffc0/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3f445e2..58cd259 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -668,6 +668,8 @@ test_that("jsonRDD() on a RDD with json string", { }) test_that("test tableNames and tables", { + # Making sure there are no registered temp tables from previous tests + suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) })) df <- read.json(jsonPath) createOrReplaceTempView(df, "table1") expect_equal(length(tableNames()), 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20661][SPARKR][TEST] SparkR tableNames() test fails
Repository: spark Updated Branches: refs/heads/master 829cd7b8b -> 2abfee18b [SPARK-20661][SPARKR][TEST] SparkR tableNames() test fails ## What changes were proposed in this pull request? Cleaning existing temp tables before running tableNames tests ## How was this patch tested? SparkR Unit tests Author: Hossein Closes #17903 from falaki/SPARK-20661. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2abfee18 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2abfee18 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2abfee18 Branch: refs/heads/master Commit: 2abfee18b6511482b916c36f00bf3abf68a59e19 Parents: 829cd7b Author: Hossein Authored: Mon May 8 14:48:11 2017 -0700 Committer: Yin Huai Committed: Mon May 8 14:48:11 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2abfee18/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index f517ce6..ab6888e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -677,6 +677,8 @@ test_that("jsonRDD() on a RDD with json string", { }) test_that("test tableNames and tables", { + # Making sure there are no registered temp tables from previous tests + suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) })) df <- read.json(jsonPath) createOrReplaceTempView(df, "table1") expect_equal(length(tableNames()), 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
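The R fix above clears any temp tables registered by earlier tests before asserting on `tableNames()`. A rough Scala analogue of the same cleanup idea, using the public catalog API (names are illustrative, not taken from the Spark test suites):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("catalog-cleanup").getOrCreate()

// Drop temporary views left behind by earlier tests so the count below
// starts from a clean catalog, mirroring the dropTempTable loop in the R test.
spark.catalog.listTables().collect()
  .filter(_.isTemporary)
  .foreach(table => spark.catalog.dropTempView(table.name))

spark.range(5).createOrReplaceTempView("table1")
assert(spark.catalog.listTables().filter(_.isTemporary).count() == 1)
```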
spark git commit: [SPARK-20358][CORE] Executors failing stage on interrupted exception thrown by cancelled tasks
Repository: spark Updated Branches: refs/heads/branch-2.2 32c5a105e -> e929cd767 [SPARK-20358][CORE] Executors failing stage on interrupted exception thrown by cancelled tasks ## What changes were proposed in this pull request? This was a regression introduced by my earlier PR here: https://github.com/apache/spark/pull/17531 It turns out NonFatal() does not in fact catch InterruptedException. ## How was this patch tested? Extended cancellation unit test coverage. The first test fails before this patch. cc JoshRosen mridulm Author: Eric Liang Closes #17659 from ericl/spark-20358. (cherry picked from commit b2ebadfd55283348b8a8b37e28075fca0798228a) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e929cd76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e929cd76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e929cd76 Branch: refs/heads/branch-2.2 Commit: e929cd76720f9f448f2774c33305a91318bce033 Parents: 32c5a10 Author: Eric Liang Authored: Thu Apr 20 09:55:10 2017 -0700 Committer: Yin Huai Committed: Thu Apr 20 09:55:22 2017 -0700 -- .../org/apache/spark/executor/Executor.scala| 3 ++- .../org/apache/spark/SparkContextSuite.scala| 26 +--- 2 files changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e929cd76/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 83469c5..18f0439 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -432,7 +432,8 @@ private[spark] class Executor( setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason))) -case NonFatal(_) if task != null && task.reasonIfKilled.isDefined => +case _: InterruptedException | NonFatal(_) if +task != null && task.reasonIfKilled.isDefined => val killReason = task.reasonIfKilled.getOrElse("unknown reason") logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason") setTaskFinishedAndClearInterruptStatus() http://git-wip-us.apache.org/repos/asf/spark/blob/e929cd76/core/src/test/scala/org/apache/spark/SparkContextSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 735f445..7e26139 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -540,10 +540,24 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } - // Launches one task that will run forever. Once the SparkListener detects the task has + testCancellingTasks("that raise interrupted exception on cancel") { +Thread.sleep(999) + } + + // SPARK-20217 should not fail stage if task throws non-interrupted exception + testCancellingTasks("that raise runtime exception on cancel") { +try { + Thread.sleep(999) +} catch { + case t: Throwable => +throw new RuntimeException("killed") +} + } + + // Launches one task that will block forever. Once the SparkListener detects the task has // started, kill and re-schedule it. The second run of the task will complete immediately. // If this test times out, then the first version of the task wasn't killed successfully. 
- test("Killing tasks") { + def testCancellingTasks(desc: String)(blockFn: => Unit): Unit = test(s"Killing tasks $desc") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) SparkContextSuite.isTaskStarted = false @@ -572,13 +586,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu // first attempt will hang if (!SparkContextSuite.isTaskStarted) { SparkContextSuite.isTaskStarted = true - try { -Thread.sleep(999) - } catch { -case t: Throwable => - // SPARK-20217 should not fail stage if task throws non-interrupted exception - throw new RuntimeException("killed") - } + blockFn } // second attempt succeeds immediately } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For ad
spark git commit: [SPARK-20358][CORE] Executors failing stage on interrupted exception thrown by cancelled tasks
Repository: spark Updated Branches: refs/heads/master c5a31d160 -> b2ebadfd5 [SPARK-20358][CORE] Executors failing stage on interrupted exception thrown by cancelled tasks ## What changes were proposed in this pull request? This was a regression introduced by my earlier PR here: https://github.com/apache/spark/pull/17531 It turns out NonFatal() does not in fact catch InterruptedException. ## How was this patch tested? Extended cancellation unit test coverage. The first test fails before this patch. cc JoshRosen mridulm Author: Eric Liang Closes #17659 from ericl/spark-20358. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b2ebadfd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b2ebadfd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b2ebadfd Branch: refs/heads/master Commit: b2ebadfd55283348b8a8b37e28075fca0798228a Parents: c5a31d1 Author: Eric Liang Authored: Thu Apr 20 09:55:10 2017 -0700 Committer: Yin Huai Committed: Thu Apr 20 09:55:10 2017 -0700 -- .../org/apache/spark/executor/Executor.scala| 3 ++- .../org/apache/spark/SparkContextSuite.scala| 26 +--- 2 files changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b2ebadfd/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 83469c5..18f0439 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -432,7 +432,8 @@ private[spark] class Executor( setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason))) -case NonFatal(_) if task != null && task.reasonIfKilled.isDefined => +case _: InterruptedException | NonFatal(_) if +task != null && task.reasonIfKilled.isDefined => val killReason = task.reasonIfKilled.getOrElse("unknown reason") logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason") setTaskFinishedAndClearInterruptStatus() http://git-wip-us.apache.org/repos/asf/spark/blob/b2ebadfd/core/src/test/scala/org/apache/spark/SparkContextSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 735f445..7e26139 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -540,10 +540,24 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } - // Launches one task that will run forever. Once the SparkListener detects the task has + testCancellingTasks("that raise interrupted exception on cancel") { +Thread.sleep(999) + } + + // SPARK-20217 should not fail stage if task throws non-interrupted exception + testCancellingTasks("that raise runtime exception on cancel") { +try { + Thread.sleep(999) +} catch { + case t: Throwable => +throw new RuntimeException("killed") +} + } + + // Launches one task that will block forever. Once the SparkListener detects the task has // started, kill and re-schedule it. The second run of the task will complete immediately. // If this test times out, then the first version of the task wasn't killed successfully. 
- test("Killing tasks") { + def testCancellingTasks(desc: String)(blockFn: => Unit): Unit = test(s"Killing tasks $desc") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) SparkContextSuite.isTaskStarted = false @@ -572,13 +586,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu // first attempt will hang if (!SparkContextSuite.isTaskStarted) { SparkContextSuite.isTaskStarted = true - try { -Thread.sleep(999) - } catch { -case t: Throwable => - // SPARK-20217 should not fail stage if task throws non-interrupted exception - throw new RuntimeException("killed") - } + blockFn } // second attempt succeeds immediately } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20217][CORE] Executor should not fail stage if killed task throws non-interrupted exception
Repository: spark Updated Branches: refs/heads/master 4000f128b -> 5142e5d4e [SPARK-20217][CORE] Executor should not fail stage if killed task throws non-interrupted exception ## What changes were proposed in this pull request? If tasks throw non-interrupted exceptions on kill (e.g. java.nio.channels.ClosedByInterruptException), their death is reported back as TaskFailed instead of TaskKilled. This causes stage failure in some cases. This is reproducible as follows. Run the following, and then use SparkContext.killTaskAttempt to kill one of the tasks. The entire stage will fail since we threw a RuntimeException instead of InterruptedException. ``` spark.range(100).repartition(100).foreach { i => try { Thread.sleep(1000) } catch { case t: InterruptedException => throw new RuntimeException(t) } } ``` Based on the code in TaskSetManager, I think this also affects kills of speculative tasks. However, since the number of speculated tasks is few, and usually you need to fail a task a few times before the stage is cancelled, it unlikely this would be noticed in production unless both speculation was enabled and the num allowed task failures was = 1. We should probably unconditionally return TaskKilled instead of TaskFailed if the task was killed by the driver, regardless of the actual exception thrown. ## How was this patch tested? Unit test. The test fails before the change in Executor.scala cc JoshRosen Author: Eric Liang Closes #17531 from ericl/fix-task-interrupt. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5142e5d4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5142e5d4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5142e5d4 Branch: refs/heads/master Commit: 5142e5d4e09c7cb36cf1d792934a21c5305c6d42 Parents: 4000f12 Author: Eric Liang Authored: Wed Apr 5 19:37:21 2017 -0700 Committer: Yin Huai Committed: Wed Apr 5 19:37:21 2017 -0700 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +- core/src/test/scala/org/apache/spark/SparkContextSuite.scala | 8 +++- 2 files changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5142e5d4/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 99b1608..83469c5 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -432,7 +432,7 @@ private[spark] class Executor( setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason))) -case _: InterruptedException if task.reasonIfKilled.isDefined => +case NonFatal(_) if task != null && task.reasonIfKilled.isDefined => val killReason = task.reasonIfKilled.getOrElse("unknown reason") logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason") setTaskFinishedAndClearInterruptStatus() http://git-wip-us.apache.org/repos/asf/spark/blob/5142e5d4/core/src/test/scala/org/apache/spark/SparkContextSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 2c94755..735f445 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -572,7 +572,13 @@ class SparkContextSuite 
extends SparkFunSuite with LocalSparkContext with Eventu // first attempt will hang if (!SparkContextSuite.isTaskStarted) { SparkContextSuite.isTaskStarted = true - Thread.sleep(999) + try { +Thread.sleep(999) + } catch { +case t: Throwable => + // SPARK-20217 should not fail stage if task throws non-interrupted exception + throw new RuntimeException("killed") + } } // second attempt succeeds immediately } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
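The reproduction described above needs a running task attempt id to feed to `SparkContext.killTaskAttempt`. One way to wire that up is sketched below as a hypothetical standalone driver; the listener plumbing and names are illustrative, not taken from Spark's test suite.

```scala
import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
import org.apache.spark.sql.SparkSession

object KillTaskAttemptRepro {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("KillTaskAttemptRepro")
      .getOrCreate()
    val sc = spark.sparkContext

    // Kill the first task attempt we observe starting. With the bug, the
    // RuntimeException thrown on interrupt was reported as TaskFailed and
    // could fail the stage; with the fix it is counted as TaskKilled.
    val killedOne = new AtomicBoolean(false)
    sc.addSparkListener(new SparkListener {
      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
        if (killedOne.compareAndSet(false, true)) {
          sc.killTaskAttempt(taskStart.taskInfo.taskId, interruptThread = true,
            reason = "demonstrating SPARK-20217")
        }
      }
    })

    // Same workload as in the commit message: tasks that wrap the interrupt
    // in a RuntimeException instead of letting InterruptedException escape.
    spark.range(100).repartition(100).foreach { _ =>
      try Thread.sleep(1000)
      catch { case t: InterruptedException => throw new RuntimeException(t) }
    }

    spark.stop()
  }
}
```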
spark git commit: [SPARK-19620][SQL] Fix incorrect exchange coordinator id in the physical plan
Repository: spark Updated Branches: refs/heads/master fcb68e0f5 -> dd9049e04 [SPARK-19620][SQL] Fix incorrect exchange coordinator id in the physical plan ## What changes were proposed in this pull request? When adaptive execution is enabled, an exchange coordinator is used in the Exchange operators. For Join, the same exchange coordinator is used for its two Exchanges. But the physical plan shows two different coordinator Ids which is confusing. This PR is to fix the incorrect exchange coordinator id in the physical plan. The coordinator object instead of the `Option[ExchangeCoordinator]` should be used to generate the identity hash code of the same coordinator. ## How was this patch tested? Before the patch, the physical plan shows two different exchange coordinator id for Join. ``` == Physical Plan == *Project [key1#3L, value2#12L] +- *SortMergeJoin [key1#3L], [key2#11L], Inner :- *Sort [key1#3L ASC NULLS FIRST], false, 0 : +- Exchange(coordinator id: 1804587700) hashpartitioning(key1#3L, 10), coordinator[target post-shuffle partition size: 67108864] : +- *Project [(id#0L % 500) AS key1#3L] :+- *Filter isnotnull((id#0L % 500)) : +- *Range (0, 1000, step=1, splits=Some(10)) +- *Sort [key2#11L ASC NULLS FIRST], false, 0 +- Exchange(coordinator id: 793927319) hashpartitioning(key2#11L, 10), coordinator[target post-shuffle partition size: 67108864] +- *Project [(id#8L % 500) AS key2#11L, id#8L AS value2#12L] +- *Filter isnotnull((id#8L % 500)) +- *Range (0, 1000, step=1, splits=Some(10)) ``` After the patch, two exchange coordinator id are the same. Author: Carson Wang Closes #16952 from carsonwang/FixCoordinatorId. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd9049e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd9049e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd9049e0 Branch: refs/heads/master Commit: dd9049e0492cc70b629518fee9b3d1632374c612 Parents: fcb68e0 Author: Carson Wang Authored: Fri Mar 10 11:13:26 2017 -0800 Committer: Yin Huai Committed: Fri Mar 10 11:13:26 2017 -0800 -- .../org/apache/spark/sql/execution/exchange/ShuffleExchange.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dd9049e0/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala index 125a493..f06544e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala @@ -46,7 +46,7 @@ case class ShuffleExchange( override def nodeName: String = { val extraInfo = coordinator match { case Some(exchangeCoordinator) => -s"(coordinator id: ${System.identityHashCode(coordinator)})" +s"(coordinator id: ${System.identityHashCode(exchangeCoordinator)})" case None => "" } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
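The one-line fix above matters because `System.identityHashCode` was being applied to the `Option` wrapper rather than to the coordinator inside it: each exchange wraps the shared coordinator in its own `Some`, so the wrappers hash to different ids even though the coordinator is one object. A minimal illustration:

```scala
object IdentityHashCodeDemo {
  def main(args: Array[String]): Unit = {
    // Stand-in for a shared ExchangeCoordinator instance.
    val coordinator = new Object

    // Each operator wraps the same coordinator in its own Option instance.
    val left: Option[AnyRef]  = Some(coordinator)
    val right: Option[AnyRef] = Some(coordinator)

    // Hashing the wrappers yields two unrelated ids (the bug)...
    println(System.identityHashCode(left))
    println(System.identityHashCode(right))

    // ...while hashing the wrapped value yields the same id for both (the fix).
    println(System.identityHashCode(left.get))
    println(System.identityHashCode(right.get))
  }
}
```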
spark git commit: [SPARK-19816][SQL][TESTS] Fix an issue that DataFrameCallbackSuite doesn't recover the log level
Repository: spark Updated Branches: refs/heads/branch-2.1 da04d45c2 -> 664c9795c [SPARK-19816][SQL][TESTS] Fix an issue that DataFrameCallbackSuite doesn't recover the log level ## What changes were proposed in this pull request? "DataFrameCallbackSuite.execute callback functions when a DataFrame action failed" sets the log level to "fatal" but doesn't recover it. Hence, tests running after it won't output any logs except fatal logs. This PR uses `testQuietly` instead to avoid changing the log level. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #17156 from zsxwing/SPARK-19816. (cherry picked from commit fbc4058037cf5b0be9f14a7dd28105f7f8151bed) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/664c9795 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/664c9795 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/664c9795 Branch: refs/heads/branch-2.1 Commit: 664c9795c94d3536ff9fe54af06e0fb6c0012862 Parents: da04d45 Author: Shixiong Zhu Authored: Fri Mar 3 19:00:35 2017 -0800 Committer: Yin Huai Committed: Fri Mar 3 19:09:38 2017 -0800 -- .../scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/664c9795/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala index 3ae5ce6..f372e94 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala @@ -58,7 +58,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSQLContext { spark.listenerManager.unregister(listener) } - test("execute callback functions when a DataFrame action failed") { + testQuietly("execute callback functions when a DataFrame action failed") { val metrics = ArrayBuffer.empty[(String, QueryExecution, Exception)] val listener = new QueryExecutionListener { override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { @@ -73,8 +73,6 @@ class DataFrameCallbackSuite extends QueryTest with SharedSQLContext { val errorUdf = udf[Int, Int] { _ => throw new RuntimeException("udf error") } val df = sparkContext.makeRDD(Seq(1 -> "a")).toDF("i", "j") -// Ignore the log when we are expecting an exception. -sparkContext.setLogLevel("FATAL") val e = intercept[SparkException](df.select(errorUdf($"i")).collect()) assert(metrics.length == 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
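`testQuietly` follows the usual "silence, run, restore" pattern, which avoids the leaked FATAL level this patch removes. A rough sketch of that pattern (illustrative only, not the exact Spark test utility):

```
import org.apache.log4j.{Level, LogManager}

object QuietLogging {
  // Raise the log threshold while running a block that is expected to log
  // errors, then always restore the previous level so later tests keep
  // their normal logging output.
  def quietly[T](body: => T): T = {
    val rootLogger = LogManager.getRootLogger
    val previousLevel = rootLogger.getLevel
    rootLogger.setLevel(Level.FATAL)
    try body
    finally rootLogger.setLevel(previousLevel)
  }
}
```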
spark git commit: [SPARK-19604][TESTS] Log the start of every Python test
Repository: spark Updated Branches: refs/heads/branch-2.1 88c43f4fb -> b9ab4c0e9 [SPARK-19604][TESTS] Log the start of every Python test ## What changes were proposed in this pull request? Right now, we only have info level log after we finish the tests of a Python test file. We should also log the start of a test. So, if a test is hanging, we can tell which test file is running. ## How was this patch tested? This is a change for python tests. Author: Yin Huai Closes #16935 from yhuai/SPARK-19604. (cherry picked from commit f6c3bba22501ee7753d85c6e51ffe851d43869c1) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b9ab4c0e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b9ab4c0e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b9ab4c0e Branch: refs/heads/branch-2.1 Commit: b9ab4c0e983df463232f1adbe6e5982b0d7d497d Parents: 88c43f4 Author: Yin Huai Authored: Wed Feb 15 14:41:15 2017 -0800 Committer: Yin Huai Committed: Wed Feb 15 18:43:57 2017 -0800 -- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b9ab4c0e/python/run-tests.py -- diff --git a/python/run-tests.py b/python/run-tests.py index 38b3bb8..53a0aef 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -72,7 +72,7 @@ def run_individual_python_test(test_name, pyspark_python): 'PYSPARK_PYTHON': which(pyspark_python), 'PYSPARK_DRIVER_PYTHON': which(pyspark_python) }) -LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name) +LOGGER.info("Starting test(%s): %s", pyspark_python, test_name) start_time = time.time() try: per_test_output = tempfile.TemporaryFile() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19295][SQL] IsolatedClientLoader's downloadVersion should log the location of downloaded metastore client jars
Repository: spark Updated Branches: refs/heads/master 640f94233 -> 63d839028 [SPARK-19295][SQL] IsolatedClientLoader's downloadVersion should log the location of downloaded metastore client jars ## What changes were proposed in this pull request? This will help the users to know the location of those downloaded jars when `spark.sql.hive.metastore.jars` is set to `maven`. ## How was this patch tested? jenkins Author: Yin Huai Closes #16649 from yhuai/SPARK-19295. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63d83902 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63d83902 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63d83902 Branch: refs/heads/master Commit: 63d839028a6e03644febc360519fa8e01c5534cf Parents: 640f942 Author: Yin Huai Authored: Thu Jan 19 14:23:36 2017 -0800 Committer: Yin Huai Committed: Thu Jan 19 14:23:36 2017 -0800 -- .../org/apache/spark/sql/hive/client/IsolatedClientLoader.scala | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63d83902/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 26b2de8..63fdd6b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -122,6 +122,7 @@ private[hive] object IsolatedClientLoader extends Logging { // TODO: Remove copy logic. val tempDir = Utils.createTempDir(namePrefix = s"hive-${version}") allFiles.foreach(f => FileUtils.copyFileToDirectory(f, tempDir)) +logInfo(s"Downloaded metastore jars to ${tempDir.getCanonicalPath}") tempDir.listFiles().map(_.toURI.toURL) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
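The code path touched here is only reached when the metastore client jars are fetched from Maven. A hedged example of the configuration that triggers it — the metastore version below is only an illustrative value:

```
import org.apache.spark.sql.SparkSession

// With spark.sql.hive.metastore.jars set to "maven", Spark downloads the Hive
// metastore client jars at startup; the patch above logs the temporary
// directory they are copied into.
val spark = SparkSession.builder()
  .appName("metastore-jars-from-maven")
  .config("spark.sql.hive.metastore.version", "1.2.1") // illustrative version only
  .config("spark.sql.hive.metastore.jars", "maven")
  .enableHiveSupport()
  .getOrCreate()
```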
spark git commit: Update known_translations for contributor names
Repository: spark Updated Branches: refs/heads/master fe409f31d -> 0c9231858 Update known_translations for contributor names ## What changes were proposed in this pull request? Update known_translations per https://github.com/apache/spark/pull/16423#issuecomment-269739634 Author: Yin Huai Closes #16628 from yhuai/known_translations. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c923185 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c923185 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c923185 Branch: refs/heads/master Commit: 0c9231858866eff16f97df073d22811176fb6b36 Parents: fe409f3 Author: Yin Huai Authored: Wed Jan 18 18:18:51 2017 -0800 Committer: Yin Huai Committed: Wed Jan 18 18:18:51 2017 -0800 -- dev/create-release/known_translations | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c923185/dev/create-release/known_translations -- diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations index 0f30990..87bf2f2 100644 --- a/dev/create-release/known_translations +++ b/dev/create-release/known_translations @@ -177,7 +177,7 @@ anabranch - Bill Chambers ashangit - Nicolas Fraison avulanov - Alexander Ulanov biglobster - Liang Ke -cenyuhai - Cen Yu Hai +cenyuhai - Yuhai Cen codlife - Jianfei Wang david-weiluo-ren - Weiluo (David) Ren dding3 - Ding Ding @@ -198,7 +198,8 @@ petermaxlee - Peter Lee phalodi - Sandeep Purohit pkch - pkch priyankagargnitk - Priyanka Garg -sharkdtu - Sharkd Tu +sharkdtu - Xiaogang Tu shenh062326 - Shen Hong aokolnychyi - Anton Okolnychyi linbojin - Linbo Jin +lw-lin - Liwei Lin - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18885][SQL] unify CREATE TABLE syntax for data source and hive serde tables
Repository: spark Updated Branches: refs/heads/master f5d18af6a -> cca945b6a [SPARK-18885][SQL] unify CREATE TABLE syntax for data source and hive serde tables ## What changes were proposed in this pull request? Today we have different syntax to create data source or hive serde tables, we should unify them to not confuse users and step forward to make hive a data source. Please read https://issues.apache.org/jira/secure/attachment/12843835/CREATE-TABLE.pdf for details. TODO(for follow-up PRs): 1. TBLPROPERTIES is not added to the new syntax, we should decide if we wanna add it later. 2. `SHOW CREATE TABLE` should be updated to use the new syntax. 3. we should decide if we wanna change the behavior of `SET LOCATION`. ## How was this patch tested? new tests Author: Wenchen Fan Closes #16296 from cloud-fan/create-table. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cca945b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cca945b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cca945b6 Branch: refs/heads/master Commit: cca945b6aa679e61864c1cabae91e6ae7703362e Parents: f5d18af Author: Wenchen Fan Authored: Thu Jan 5 17:40:27 2017 -0800 Committer: Yin Huai Committed: Thu Jan 5 17:40:27 2017 -0800 -- docs/sql-programming-guide.md | 60 +-- .../examples/sql/hive/JavaSparkHiveExample.java | 2 +- examples/src/main/python/sql/hive.py| 2 +- examples/src/main/r/RSparkSQLExample.R | 2 +- .../examples/sql/hive/SparkHiveExample.scala| 2 +- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 10 +- .../sql/catalyst/util/CaseInsensitiveMap.scala | 2 + .../spark/sql/execution/SparkSqlParser.scala| 57 +- .../spark/sql/execution/SparkStrategies.scala | 6 +- .../spark/sql/execution/command/ddl.scala | 6 +- .../spark/sql/execution/datasources/rules.scala | 7 +- .../apache/spark/sql/internal/HiveSerDe.scala | 84 +-- .../sql/execution/SparkSqlParserSuite.scala | 3 +- .../sql/execution/command/DDLCommandSuite.scala | 79 +++--- .../spark/sql/hive/HiveExternalCatalog.scala| 4 +- .../spark/sql/hive/HiveSessionState.scala | 1 + .../apache/spark/sql/hive/HiveStrategies.scala | 73 +++-- .../spark/sql/hive/execution/HiveOptions.scala | 102 ++ .../spark/sql/hive/orc/OrcFileOperator.scala| 2 +- .../spark/sql/hive/HiveDDLCommandSuite.scala| 107 +-- .../sql/hive/HiveExternalCatalogSuite.scala | 2 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 15 --- .../spark/sql/hive/execution/HiveDDLSuite.scala | 39 +++ .../sql/hive/orc/OrcHadoopFsRelationSuite.scala | 2 - 24 files changed, 526 insertions(+), 143 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cca945b6/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 4cd21ae..0f6e344 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -522,14 +522,11 @@ Hive metastore. Persistent tables will still exist even after your Spark program long as you maintain your connection to the same metastore. A DataFrame for a persistent table can be created by calling the `table` method on a `SparkSession` with the name of the table. -By default `saveAsTable` will create a "managed table", meaning that the location of the data will -be controlled by the metastore. Managed tables will also have their data deleted automatically -when a table is dropped. - -Currently, `saveAsTable` does not expose an API supporting the creation of an "external table" from a `DataFrame`. 
-However, this functionality can be achieved by providing a `path` option to the `DataFrameWriter` with `path` as the key -and location of the external table as its value (a string) when saving the table with `saveAsTable`. When an External table -is dropped only its metadata is removed. +For file-based data source, e.g. text, parquet, json, etc. you can specify a custom table path via the +`path` option, e.g. `df.write.option("path", "/some/path").saveAsTable("t")`. When the table is dropped, +the custom table path will not be removed and the table data is still there. If no custom table path is +specifed, Spark will write data to a default table path under the warehouse directory. When the table is +dropped, the default table path will be removed too. Starting from Spark 2.1, persistent datasource tables have per-partition metadata stored in the Hive metastore. This brings several benefits: @@ -954,6 +951,53 @@ adds support for finding tables in the MetaStore and writing queries using
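A hedged illustration of the `path` behavior described in the updated programming guide above — the table names and path are placeholders:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("table-path-example").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

// Custom table path: the data at /some/path is kept when the table is dropped.
df.write.option("path", "/some/path").saveAsTable("t_with_path")

// No custom path: data goes to the default location under the warehouse
// directory and is removed together with the table.
df.write.saveAsTable("t_default_path")
```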
[1/3] spark-website git commit: Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted
Repository: spark-website Updated Branches: refs/heads/asf-site 426a68ba8 -> 46a7a8027 http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/screencasts/1-first-steps-with-spark.html -- diff --git a/site/screencasts/1-first-steps-with-spark.html b/site/screencasts/1-first-steps-with-spark.html index ac30748..8bcd8bb 100644 --- a/site/screencasts/1-first-steps-with-spark.html +++ b/site/screencasts/1-first-steps-with-spark.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/screencasts/2-spark-documentation-overview.html -- diff --git a/site/screencasts/2-spark-documentation-overview.html b/site/screencasts/2-spark-documentation-overview.html index b331b25..6d8f46e 100644 --- a/site/screencasts/2-spark-documentation-overview.html +++ b/site/screencasts/2-spark-documentation-overview.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/screencasts/3-transformations-and-caching.html -- diff --git a/site/screencasts/3-transformations-and-caching.html b/site/screencasts/3-transformations-and-caching.html index 7ab50f5..f7aa6b4 100644 --- a/site/screencasts/3-transformations-and-caching.html +++ b/site/screencasts/3-transformations-and-caching.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/screencasts/4-a-standalone-job-in-spark.html -- diff --git a/site/screencasts/4-a-standalone-job-in-spark.html b/site/screencasts/4-a-standalone-job-in-spark.html index 35cf6f0..d6cb311 100644 --- a/site/screencasts/4-a-standalone-job-in-spark.html +++ b/site/screencasts/4-a-standalone-job-in-spark.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/screencasts/index.html -- diff --git a/site/screencasts/index.html b/site/screencasts/index.html index bd9d33e..df951b8 100644 --- a/site/screencasts/index.html +++ b/site/screencasts/index.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/sitemap.xml -- diff --git a/site/sitemap.xml b/site/sitemap.xml index 47ed71f..1ed4c74 100644 --- a/site/sitemap.xml +++ b/site/sitemap.xml @@ -139,6 +139,10 @@ + http://spark.apache.org/news/spark-summit-east-2017-agenda-posted.html + weekly + + http://spark.apache.org/releases/spark-release-2-1-0.html weekly 
http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/sql/index.html -- diff --git a/site/sql/index.html b/site/sql/index.h
[3/3] spark-website git commit: Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted
Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/46a7a802 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/46a7a802 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/46a7a802 Branch: refs/heads/asf-site Commit: 46a7a802762fa2428265b422170821fc3fec3563 Parents: 426a68b Author: Yin Huai Authored: Wed Jan 4 18:27:47 2017 -0800 Committer: Yin Huai Committed: Wed Jan 4 18:27:47 2017 -0800 -- ...1-04-spark-summit-east-2017-agenda-posted.md | 15 ++ site/committers.html| 6 +- site/community.html | 6 +- site/contributing.html | 6 +- site/developer-tools.html | 6 +- site/documentation.html | 6 +- site/downloads.html | 6 +- site/examples.html | 6 +- site/faq.html | 6 +- site/graphx/index.html | 6 +- site/index.html | 6 +- site/mailing-lists.html | 6 +- site/mllib/index.html | 6 +- site/news/amp-camp-2013-registration-ope.html | 6 +- .../news/announcing-the-first-spark-summit.html | 6 +- .../news/fourth-spark-screencast-published.html | 6 +- site/news/index.html| 15 +- site/news/nsdi-paper.html | 6 +- site/news/one-month-to-spark-summit-2015.html | 6 +- .../proposals-open-for-spark-summit-east.html | 6 +- ...registration-open-for-spark-summit-east.html | 6 +- .../news/run-spark-and-shark-on-amazon-emr.html | 6 +- site/news/spark-0-6-1-and-0-5-2-released.html | 6 +- site/news/spark-0-6-2-released.html | 6 +- site/news/spark-0-7-0-released.html | 6 +- site/news/spark-0-7-2-released.html | 6 +- site/news/spark-0-7-3-released.html | 6 +- site/news/spark-0-8-0-released.html | 6 +- site/news/spark-0-8-1-released.html | 6 +- site/news/spark-0-9-0-released.html | 6 +- site/news/spark-0-9-1-released.html | 6 +- site/news/spark-0-9-2-released.html | 6 +- site/news/spark-1-0-0-released.html | 6 +- site/news/spark-1-0-1-released.html | 6 +- site/news/spark-1-0-2-released.html | 6 +- site/news/spark-1-1-0-released.html | 6 +- site/news/spark-1-1-1-released.html | 6 +- site/news/spark-1-2-0-released.html | 6 +- site/news/spark-1-2-1-released.html | 6 +- site/news/spark-1-2-2-released.html | 6 +- site/news/spark-1-3-0-released.html | 6 +- site/news/spark-1-4-0-released.html | 6 +- site/news/spark-1-4-1-released.html | 6 +- site/news/spark-1-5-0-released.html | 6 +- site/news/spark-1-5-1-released.html | 6 +- site/news/spark-1-5-2-released.html | 6 +- site/news/spark-1-6-0-released.html | 6 +- site/news/spark-1-6-1-released.html | 6 +- site/news/spark-1-6-2-released.html | 6 +- site/news/spark-1-6-3-released.html | 6 +- site/news/spark-2-0-0-released.html | 6 +- site/news/spark-2-0-1-released.html | 6 +- site/news/spark-2-0-2-released.html | 6 +- site/news/spark-2-1-0-released.html | 6 +- site/news/spark-2.0.0-preview.html | 6 +- .../spark-accepted-into-apache-incubator.html | 6 +- site/news/spark-and-shark-in-the-news.html | 6 +- site/news/spark-becomes-tlp.html| 6 +- site/news/spark-featured-in-wired.html | 6 +- .../spark-mailing-lists-moving-to-apache.html | 6 +- site/news/spark-meetups.html| 6 +- site/news/spark-screencasts-published.html | 6 +- site/news/spark-summit-2013-is-a-wrap.html | 6 +- site/news/spark-summit-2014-videos-posted.html | 6 +- site/news/spark-summit-2015-videos-posted.html | 6 +- site/news/spark-summit-agenda-posted.html | 6 +- .../spark-summit-east-2015-videos-posted.html | 6 +- .../spark-summit-east-2016-cfp-closing.html | 6 +- .../spark-summit-east-2017-agenda-posted.html | 220 +++ 
site/news/spark-summit-east-agenda-posted.html | 6 +- .../news/spark-summit-europe-agenda-posted.html | 6 +- site/news/spark-summit-europe.html | 6 +- .../spark-summit-june-2016-agenda-posted.html | 6 +- site/news/spark-tips-from-quantifind.html | 6 +- .../spark-user-survey-and-powered-by-page.html |
[2/3] spark-website git commit: Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted
http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-2014-videos-posted.html -- diff --git a/site/news/spark-summit-2014-videos-posted.html b/site/news/spark-summit-2014-videos-posted.html index 3efd1db..03cbcd3 100644 --- a/site/news/spark-summit-2014-videos-posted.html +++ b/site/news/spark-summit-2014-videos-posted.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-2015-videos-posted.html -- diff --git a/site/news/spark-summit-2015-videos-posted.html b/site/news/spark-summit-2015-videos-posted.html index 8aed6ba..1a93256 100644 --- a/site/news/spark-summit-2015-videos-posted.html +++ b/site/news/spark-summit-2015-videos-posted.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-agenda-posted.html -- diff --git a/site/news/spark-summit-agenda-posted.html b/site/news/spark-summit-agenda-posted.html index 2697ece..354035f 100644 --- a/site/news/spark-summit-agenda-posted.html +++ b/site/news/spark-summit-agenda-posted.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-east-2015-videos-posted.html -- diff --git a/site/news/spark-summit-east-2015-videos-posted.html b/site/news/spark-summit-east-2015-videos-posted.html index 84771e8..962aa1e 100644 --- a/site/news/spark-summit-east-2015-videos-posted.html +++ b/site/news/spark-summit-east-2015-videos-posted.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-east-2016-cfp-closing.html -- diff --git a/site/news/spark-summit-east-2016-cfp-closing.html b/site/news/spark-summit-east-2016-cfp-closing.html index 45e6385..cc43c32 100644 --- a/site/news/spark-summit-east-2016-cfp-closing.html +++ b/site/news/spark-summit-east-2016-cfp-closing.html @@ -159,6 +159,9 @@ Latest News + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted + (Jan 04, 2017) + Spark 2.1.0 released (Dec 28, 2016) @@ -168,9 +171,6 @@ Spark 2.0.2 released (Nov 14, 2016) - Spark 1.6.3 released - (Nov 07, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/46a7a802/site/news/spark-summit-east-2017-agenda-posted.html -- diff --git a/site/news/spark-summit-east-2017-agenda-posted.html b/site/news/spark-summit-east-2017-agenda-posted.html new file mode 100644 index 000..58af016 --- /dev/null +++ b/site/news/spark-summit-east-2017-agenda-posted.html @@ -0,0 +1,220 @@ + + + + + + + + + Spark Summit East (Feb 7-9th, 2017, Boston) agenda posted | Apache Spark + 
+ + + + + + + + + + + + + + + + var _gaq = _gaq || []; + _gaq.push(['_setAccount', 'UA-32518208-2']); + _gaq.push
spark git commit: [SPARK-19072][SQL] codegen of Literal should not output boxed value
Repository: spark Updated Branches: refs/heads/master b67b35f76 -> cbd11d235 [SPARK-19072][SQL] codegen of Literal should not output boxed value ## What changes were proposed in this pull request? In https://github.com/apache/spark/pull/16402 we made a mistake that, when double/float is infinity, the `Literal` codegen will output boxed value and cause wrong result. This PR fixes this by special handling infinity to not output boxed value. ## How was this patch tested? new regression test Author: Wenchen Fan Closes #16469 from cloud-fan/literal. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cbd11d23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cbd11d23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cbd11d23 Branch: refs/heads/master Commit: cbd11d235752d0ab30cfdbf2351cb3e68a123606 Parents: b67b35f Author: Wenchen Fan Authored: Tue Jan 3 22:40:14 2017 -0800 Committer: Yin Huai Committed: Tue Jan 3 22:40:14 2017 -0800 -- .../sql/catalyst/expressions/literals.scala | 30 +--- .../catalyst/expressions/PredicateSuite.scala | 5 2 files changed, 24 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cbd11d23/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index ab45c41..cb0c4d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -266,33 +266,41 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { override def eval(input: InternalRow): Any = value override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +val javaType = ctx.javaType(dataType) // change the isNull and primitive to consts, to inline them if (value == null) { ev.isNull = "true" - ev.copy(s"final ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};") + ev.copy(s"final $javaType ${ev.value} = ${ctx.defaultValue(dataType)};") } else { ev.isNull = "false" - ev.value = dataType match { -case BooleanType | IntegerType | DateType => value.toString + dataType match { +case BooleanType | IntegerType | DateType => + ev.copy(code = "", value = value.toString) case FloatType => val v = value.asInstanceOf[Float] if (v.isNaN || v.isInfinite) { -ctx.addReferenceMinorObj(v) +val boxedValue = ctx.addReferenceMinorObj(v) +val code = s"final $javaType ${ev.value} = ($javaType) $boxedValue;" +ev.copy(code = code) } else { -s"${value}f" +ev.copy(code = "", value = s"${value}f") } case DoubleType => val v = value.asInstanceOf[Double] if (v.isNaN || v.isInfinite) { -ctx.addReferenceMinorObj(v) +val boxedValue = ctx.addReferenceMinorObj(v) +val code = s"final $javaType ${ev.value} = ($javaType) $boxedValue;" +ev.copy(code = code) } else { -s"${value}D" +ev.copy(code = "", value = s"${value}D") } -case ByteType | ShortType => s"(${ctx.javaType(dataType)})$value" -case TimestampType | LongType => s"${value}L" -case other => ctx.addReferenceMinorObj(value, ctx.javaType(dataType)) +case ByteType | ShortType => + ev.copy(code = "", value = s"($javaType)$value") +case TimestampType | LongType => + ev.copy(code = "", value = s"${value}L") +case other => + ev.copy(code = "", value = ctx.addReferenceMinorObj(value, 
ctx.javaType(dataType))) } - ev.copy("") } } http://git-wip-us.apache.org/repos/asf/spark/blob/cbd11d23/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 6fc3de1..6fe295c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -324,4 +324,9 @@ class PredicateSuite extends SparkFunSuite wit
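A hedged, regression-style example of the scenario described above (the added test body is not visible in the hunk): comparing a column against an infinite double literal exercises the special-cased codegen path.

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

// An infinite (or NaN) Double literal must be emitted as a primitive in the
// generated code, not as a boxed reference, or comparisons like this one can
// return wrong results.
val spark = SparkSession.builder().appName("infinite-literal").getOrCreate()
import spark.implicits._

val df = Seq(Double.PositiveInfinity, 1.0).toDF("d")
df.filter($"d" === lit(Double.PositiveInfinity)).show() // expect only the Infinity row
```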
spark git commit: Update known_translations for contributor names and also fix a small issue in translate-contributors.py
Repository: spark Updated Branches: refs/heads/master dba81e1dc -> 63036aee2 Update known_translations for contributor names and also fix a small issue in translate-contributors.py ## What changes were proposed in this pull request? This PR updates dev/create-release/known_translations to add more contributor name mapping. It also fixes a small issue in translate-contributors.py ## How was this patch tested? manually tested Author: Yin Huai Closes #16423 from yhuai/contributors. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63036aee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63036aee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63036aee Branch: refs/heads/master Commit: 63036aee2271cdbb7032b51b2ac67edbcb82389e Parents: dba81e1 Author: Yin Huai Authored: Thu Dec 29 14:20:56 2016 -0800 Committer: Yin Huai Committed: Thu Dec 29 14:20:56 2016 -0800 -- dev/create-release/known_translations| 37 +++ dev/create-release/translate-contributors.py | 4 ++- 2 files changed, 40 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63036aee/dev/create-release/known_translations -- diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations index 3563fe3..0f30990 100644 --- a/dev/create-release/known_translations +++ b/dev/create-release/known_translations @@ -165,3 +165,40 @@ stanzhai - Stan Zhai tien-dungle - Tien-Dung Le xuchenCN - Xu Chen zhangjiajin - Zhang JiaJin +ClassNotFoundExp - Fu Xing +KevinGrealish - Kevin Grealish +MasterDDT - Mitesh Patel +VinceShieh - Vincent Xie +WeichenXu123 - Weichen Xu +Yunni - Yun Ni +actuaryzhang - Wayne Zhang +alicegugu - Gu Huiqin Alice +anabranch - Bill Chambers +ashangit - Nicolas Fraison +avulanov - Alexander Ulanov +biglobster - Liang Ke +cenyuhai - Cen Yu Hai +codlife - Jianfei Wang +david-weiluo-ren - Weiluo (David) Ren +dding3 - Ding Ding +fidato13 - Tarun Kumar +frreiss - Fred Reiss +gatorsmile - Xiao Li +hayashidac - Chie Hayashida +invkrh - Hao Ren +jagadeesanas2 - Jagadeesan A S +jiangxb1987 - Jiang Xingbo +jisookim0513 - Jisoo Kim +junyangq - Junyang Qian +krishnakalyan3 - Krishna Kalyan +linbojin - Linbo Jin +mpjlu - Peng Meng +neggert - Nic Eggert +petermaxlee - Peter Lee +phalodi - Sandeep Purohit +pkch - pkch +priyankagargnitk - Priyanka Garg +sharkdtu - Sharkd Tu +shenh062326 - Shen Hong +aokolnychyi - Anton Okolnychyi +linbojin - Linbo Jin http://git-wip-us.apache.org/repos/asf/spark/blob/63036aee/dev/create-release/translate-contributors.py -- diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 86fa02d..2cc64e4 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -147,7 +147,9 @@ print "\n== Translating contributor list === lines = contributors_file.readlines() contributions = [] for i, line in enumerate(lines): -temp_author = line.strip(" * ").split(" -- ")[0] +# It is possible that a line in the contributor file only has the github name, e.g. yhuai. +# So, we need a strip() to remove the newline. +temp_author = line.strip(" * ").split(" -- ")[0].strip() print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) if not temp_author: error_msg = "ERROR: Expected the following format \" * -- \"\n" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark-website git commit: Fix the list of previous spark summits.
Repository: spark-website Updated Branches: refs/heads/asf-site e10180e67 -> 426a68ba8 Fix the list of previous spark summits. Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/426a68ba Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/426a68ba Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/426a68ba Branch: refs/heads/asf-site Commit: 426a68ba8cd6efeaffdacaa0d2b645c5c5ac6a5e Parents: e10180e Author: Yin Huai Authored: Thu Dec 29 13:07:47 2016 -0800 Committer: Yin Huai Committed: Thu Dec 29 13:07:47 2016 -0800 -- community.md| 19 ++- site/community.html | 19 ++- 2 files changed, 28 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/426a68ba/community.md -- diff --git a/community.md b/community.md index d887d31..d8ae250 100644 --- a/community.md +++ b/community.md @@ -88,19 +88,28 @@ Chat rooms are great for quick questions or discussions on specialized topics. T Conferences -https://spark-summit.org/";>Spark Summit Europe 2015. Oct 27 - Oct 29 in Amsterdam. +https://spark-summit.org/eu-2016/";>Spark Summit Europe 2016. Oct 25 - 27 in Brussels. -http://spark-summit.org/2015";>Spark Summit 2015. June 15 - 17 in San Francisco. +https://spark-summit.org/2016/";>Spark Summit 2016. June 6 - 8 in San Francisco. -http://spark-summit.org/east";>Spark Summit East 2015. March 18 - 19 in New York City. +https://spark-summit.org/east-2016/";>Spark Summit East 2016. Feb 16 - 18 in New York City. -http://spark-summit.org/2014";>Spark Summit 2014. June 30 - July 1 2014 in San Francisco. +https://spark-summit.org/eu-2015/";>Spark Summit Europe 2015. Oct 27 - 29 in Amsterdam. -http://spark-summit.org/2013";>Spark Summit 2013. December 2013 in San Francisco. +https://spark-summit.org/2015";>Spark Summit 2015. June 15 - 17 in San Francisco. + + +https://spark-summit.org/east-2015/";>Spark Summit East 2015. March 18 - 19 in New York City. + + +https://spark-summit.org/2014";>Spark Summit 2014. June 30 - July 1 2014 in San Francisco. + + +https://spark-summit.org/2013";>Spark Summit 2013. December 2013 in San Francisco. http://git-wip-us.apache.org/repos/asf/spark-website/blob/426a68ba/site/community.html -- diff --git a/site/community.html b/site/community.html index 7a38701..79604fb 100644 --- a/site/community.html +++ b/site/community.html @@ -286,19 +286,28 @@ and include only a few lines of the pertinent code / log within the email. Conferences -https://spark-summit.org/";>Spark Summit Europe 2015. Oct 27 - Oct 29 in Amsterdam. +https://spark-summit.org/eu-2016/";>Spark Summit Europe 2016. Oct 25 - 27 in Brussels. -http://spark-summit.org/2015";>Spark Summit 2015. June 15 - 17 in San Francisco. +https://spark-summit.org/2016/";>Spark Summit 2016. June 6 - 8 in San Francisco. -http://spark-summit.org/east";>Spark Summit East 2015. March 18 - 19 in New York City. +https://spark-summit.org/east-2016/";>Spark Summit East 2016. Feb 16 - 18 in New York City. -http://spark-summit.org/2014";>Spark Summit 2014. June 30 - July 1 2014 in San Francisco. +https://spark-summit.org/eu-2015/";>Spark Summit Europe 2015. Oct 27 - 29 in Amsterdam. -http://spark-summit.org/2013";>Spark Summit 2013. December 2013 in San Francisco. +https://spark-summit.org/2015";>Spark Summit 2015. June 15 - 17 in San Francisco. + + +https://spark-summit.org/east-2015/";>Spark Summit East 2015. March 18 - 19 in New York City. + + +https://spark-summit.org/2014";>Spark Summit 2014. 
June 30 - July 1 2014 in San Francisco. + + +https://spark-summit.org/2013";>Spark Summit 2013. December 2013 in San Francisco. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[2/5] spark-website git commit: Update Spark website for the release of Apache Spark 2.1.0
http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/releases/spark-release-0-9-1.html -- diff --git a/site/releases/spark-release-0-9-1.html b/site/releases/spark-release-0-9-1.html index 80401c4..5b08a0b 100644 --- a/site/releases/spark-release-0-9-1.html +++ b/site/releases/spark-release-0-9-1.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive @@ -210,9 +210,9 @@ Fixed hash collision bug in external spilling [https://issues.apache.org/jira/browse/SPARK-1113";>SPARK-1113] Fixed conflict with Sparkâs log4j for users relying on other logging backends [https://issues.apache.org/jira/browse/SPARK-1190";>SPARK-1190] Fixed Graphx missing from Spark assembly jar in maven builds - Fixed silent failures due to map output status exceeding Akka frame size [https://issues.apache.org/jira/browse/SPARK-1244";>SPARK-1244] - Removed Sparkâs unnecessary direct dependency on ASM [https://issues.apache.org/jira/browse/SPARK-782";>SPARK-782] - Removed metrics-ganglia from default build due to LGPL license conflict [https://issues.apache.org/jira/browse/SPARK-1167";>SPARK-1167] + Fixed silent failures due to map output status exceeding Akka frame size [https://issues.apache.org/jira/browse/SPARK-1244";>SPARK-1244] + Removed Sparkâs unnecessary direct dependency on ASM [https://issues.apache.org/jira/browse/SPARK-782";>SPARK-782] + Removed metrics-ganglia from default build due to LGPL license conflict [https://issues.apache.org/jira/browse/SPARK-1167";>SPARK-1167] Fixed bug in distribution tarball not containing spark assembly jar [https://issues.apache.org/jira/browse/SPARK-1184";>SPARK-1184] Fixed bug causing infinite NullPointerException failures due to a null in map output locations [https://issues.apache.org/jira/browse/SPARK-1124";>SPARK-1124] Fixed bugs in post-job cleanup of schedulerâs data structures @@ -228,7 +228,7 @@ Fixed bug making Spark application stall when YARN registration fails [https://issues.apache.org/jira/browse/SPARK-1032";>SPARK-1032] Race condition in getting HDFS delegation tokens in yarn-client mode [https://issues.apache.org/jira/browse/SPARK-1203";>SPARK-1203] Fixed bug in yarn-client mode not exiting properly [https://issues.apache.org/jira/browse/SPARK-1049";>SPARK-1049] - Fixed regression bug in ADD_JAR environment variable not correctly adding custom jars [https://issues.apache.org/jira/browse/SPARK-1089";>SPARK-1089] + Fixed regression bug in ADD_JAR environment variable not correctly adding custom jars [https://issues.apache.org/jira/browse/SPARK-1089";>SPARK-1089] Improvements to other deployment scenarios @@ -239,19 +239,19 @@ Optimizations to MLLib - Optimized memory usage of ALS [https://issues.apache.org/jira/browse/MLLIB-25";>MLLIB-25] + Optimized memory usage of ALS [https://issues.apache.org/jira/browse/MLLIB-25";>MLLIB-25] Optimized computation of YtY for implicit ALS [https://issues.apache.org/jira/browse/SPARK-1237";>SPARK-1237] Support for negative implicit input in ALS [https://issues.apache.org/jira/browse/MLLIB-22";>MLLIB-22] Setting of a random seed in ALS [https://issues.apache.org/jira/browse/SPARK-1238";>SPARK-1238] - Faster construction of features with intercept 
[https://issues.apache.org/jira/browse/SPARK-1260";>SPARK-1260] + Faster construction of features with intercept [https://issues.apache.org/jira/browse/SPARK-1260";>SPARK-1260] Check for intercept and weight in GLM’s addIntercept [https://issues.apache.org/jira/browse/SPARK-1327";>SPARK-1327] Bug fixes and better API parity for PySpark Fixed bug in Python de-pickling [https://issues.apache.org/jira/browse/SPARK-1135";>SPARK-1135] - Fixed bug in serialization of strings longer than 64K [https://issues.apache.org/jira/browse/SPARK-1043";>SPARK-1043] - Fixed bug that made jobs hang when base file is not available [https://issues.apache.org/jira/browse/SPARK-1025";>SPARK-1025] Added Missing RDD operations to PySpark - top, zip, foldByKey, repartition, coalesce, getStorage
[4/5] spark-website git commit: Update Spark website for the release of Apache Spark 2.1.0
http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/mailing-lists.html -- diff --git a/site/mailing-lists.html b/site/mailing-lists.html index c113cdd..3e2334f 100644 --- a/site/mailing-lists.html +++ b/site/mailing-lists.html @@ -109,7 +109,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -162,6 +162,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -171,9 +174,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/mllib/index.html -- diff --git a/site/mllib/index.html b/site/mllib/index.html index e29228b..08f5dc4 100644 --- a/site/mllib/index.html +++ b/site/mllib/index.html @@ -109,7 +109,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -162,6 +162,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -171,9 +174,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/amp-camp-2013-registration-ope.html -- diff --git a/site/news/amp-camp-2013-registration-ope.html b/site/news/amp-camp-2013-registration-ope.html index b9d1aba..88d6d7d 100644 --- a/site/news/amp-camp-2013-registration-ope.html +++ b/site/news/amp-camp-2013-registration-ope.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/announcing-the-first-spark-summit.html -- diff --git a/site/news/announcing-the-first-spark-summit.html b/site/news/announcing-the-first-spark-summit.html index 2215895..0c013dc 100644 --- a/site/news/announcing-the-first-spark-summit.html +++ b/site/news/announcing-the-first-spark-summit.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/fourth-spark-screencast-published.html -- diff --git a/site/news/fourth-spark-screencast-published.html b/site/news/fourth-spark-screencast-published.html index fe28ecf..efa74d0 100644 --- a/site/news/fourth-spark-screencast-published.html +++ b/site/news/fourth-spark-screencast-published.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the mo
[1/5] spark-website git commit: Update Spark website for the release of Apache Spark 2.1.0
Repository: spark-website Updated Branches: refs/heads/asf-site d2bcf1854 -> e10180e67 http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/releases/spark-release-2-1-0.html -- diff --git a/site/releases/spark-release-2-1-0.html b/site/releases/spark-release-2-1-0.html new file mode 100644 index 000..53017ff --- /dev/null +++ b/site/releases/spark-release-2-1-0.html @@ -0,0 +1,370 @@ + + + + + + + + + Spark Release 2.1.0 | Apache Spark + + + + + + + + + + + + + + + + + var _gaq = _gaq || []; + _gaq.push(['_setAccount', 'UA-32518208-2']); + _gaq.push(['_trackPageview']); + (function() { +var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; +ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; +var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); + })(); + + + function trackOutboundLink(link, category, action) { +try { + _gaq.push(['_trackEvent', category , action]); +} catch(err){} + +setTimeout(function() { + document.location.href = link.href; +}, 100); + } + + + + + + + + +https://code.jquery.com/jquery.js";> +https://netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js";> + + + + + + + + + + + Lightning-fast cluster computing + + + + + + + + + + Toggle navigation + + + + + + + + + + Download + + + Libraries + + + SQL and DataFrames + Spark Streaming + MLlib (machine learning) + GraphX (graph) + + Third-Party Projects + + + + + Documentation + + + Latest Release (Spark 2.1.0) + Older Versions and Other Resources + Frequently Asked Questions + + + Examples + + + Community + + + Mailing Lists & Resources + Contributing to Spark + https://issues.apache.org/jira/browse/SPARK";>Issue Tracker + Powered By + Project Committers + + + + + Developers + + + Useful Developer Tools + Versioning Policy + Release Process + + + + + +http://www.apache.org/"; class="dropdown-toggle" data-toggle="dropdown"> + Apache Software Foundation + + http://www.apache.org/";>Apache Homepage + http://www.apache.org/licenses/";>License + http://www.apache.org/foundation/sponsorship.html";>Sponsorship + http://www.apache.org/foundation/thanks.html";>Thanks + http://www.apache.org/security/";>Security + + + + + + + + + + + + Latest News + + + Spark 2.1.0 released + (Dec 28, 2016) + + Spark wins CloudSort Benchmark as the most efficient engine + (Nov 15, 2016) + + Spark 2.0.2 released + (Nov 14, 2016) + + Spark 1.6.3 released + (Nov 07, 2016) + + + Archive + + + +Download Spark + + +Built-in Libraries: + + +SQL and DataFrames +Spark Streaming +MLlib (machine learning) +GraphX (graph) + + Third-Party Projects + + + + +Spark Release 2.1.0 + + +Apache Spark 2.1.0 is the second release on the 2.x line. This release makes significant strides in the production readiness of Structured Streaming, with added support for event time watermarks and Kafka 0.10 support. In addition, this release focuses more on usability, stability, and polish, resolving over 1200 tickets. + +To download Apache Spark 2.1.0, visit the downloads page. You can consult JIRA for the https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12315420&version=12335644";>detailed changes. We have curated a list of high level changes here, grouped by major modules. + + + Core and Spark SQL + Structured Streaming + MLlib + SparkR + GraphX + Deprecations + Changes of behavior + Known Issues + Credits + + +Core and Spark SQL + + + API updates + + SPARK-17864: Data type APIs are stable APIs. 
+ SPARK-18351: from_json and to_json for parsing JSON for string columns + SPARK-16700: When creating a DataFrame in PySpark, Python dictionaries can be used as values of a StructType. + + + Performance and stability + +
[5/5] spark-website git commit: Update Spark website for the release of Apache Spark 2.1.0
Update Spark website for the release of Apache Spark 2.1.0 Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/e10180e6 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/e10180e6 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/e10180e6 Branch: refs/heads/asf-site Commit: e10180e6784e1d9d8771ef42481687aec0a423a2 Parents: d2bcf18 Author: Yin Huai Authored: Wed Dec 28 18:00:05 2016 -0800 Committer: Yin Huai Committed: Thu Dec 29 07:46:11 2016 -0800 -- _layouts/global.html| 2 +- documentation.md| 1 + downloads.md| 6 +- js/downloads.js | 1 + news/_posts/2016-12-28-spark-2-1-0-released.md | 14 + .../_posts/2016-12-28-spark-release-2-1-0.md| 120 ++ site/committers.html| 48 ++- site/community.html | 16 +- site/contributing.html | 28 +- site/developer-tools.html | 22 +- site/docs/latest| 2 +- site/documentation.html | 14 +- site/downloads.html | 14 +- site/examples.html | 100 ++--- site/faq.html | 8 +- site/graphx/index.html | 8 +- site/index.html | 8 +- site/js/downloads.js| 1 + site/mailing-lists.html | 8 +- site/mllib/index.html | 8 +- site/news/amp-camp-2013-registration-ope.html | 8 +- .../news/announcing-the-first-spark-summit.html | 8 +- .../news/fourth-spark-screencast-published.html | 8 +- site/news/index.html| 27 +- site/news/nsdi-paper.html | 8 +- site/news/one-month-to-spark-summit-2015.html | 8 +- .../proposals-open-for-spark-summit-east.html | 8 +- ...registration-open-for-spark-summit-east.html | 8 +- .../news/run-spark-and-shark-on-amazon-emr.html | 8 +- site/news/spark-0-6-1-and-0-5-2-released.html | 8 +- site/news/spark-0-6-2-released.html | 8 +- site/news/spark-0-7-0-released.html | 8 +- site/news/spark-0-7-2-released.html | 8 +- site/news/spark-0-7-3-released.html | 8 +- site/news/spark-0-8-0-released.html | 8 +- site/news/spark-0-8-1-released.html | 8 +- site/news/spark-0-9-0-released.html | 8 +- site/news/spark-0-9-1-released.html | 10 +- site/news/spark-0-9-2-released.html | 10 +- site/news/spark-1-0-0-released.html | 8 +- site/news/spark-1-0-1-released.html | 8 +- site/news/spark-1-0-2-released.html | 8 +- site/news/spark-1-1-0-released.html | 10 +- site/news/spark-1-1-1-released.html | 8 +- site/news/spark-1-2-0-released.html | 8 +- site/news/spark-1-2-1-released.html | 8 +- site/news/spark-1-2-2-released.html | 10 +- site/news/spark-1-3-0-released.html | 8 +- site/news/spark-1-4-0-released.html | 8 +- site/news/spark-1-4-1-released.html | 8 +- site/news/spark-1-5-0-released.html | 8 +- site/news/spark-1-5-1-released.html | 8 +- site/news/spark-1-5-2-released.html | 8 +- site/news/spark-1-6-0-released.html | 8 +- site/news/spark-1-6-1-released.html | 8 +- site/news/spark-1-6-2-released.html | 8 +- site/news/spark-1-6-3-released.html | 8 +- site/news/spark-2-0-0-released.html | 8 +- site/news/spark-2-0-1-released.html | 8 +- site/news/spark-2-0-2-released.html | 8 +- site/news/spark-2-1-0-released.html | 220 +++ site/news/spark-2.0.0-preview.html | 8 +- .../spark-accepted-into-apache-incubator.html | 8 +- site/news/spark-and-shark-in-the-news.html | 10 +- site/news/spark-becomes-tlp.html| 8 +- site/news/spark-featured-in-wired.html | 8 +- .../spark-mailing-lists-moving-to-apache.html | 8 +- site/news/spark-meetups.html| 8 +- site/news/spark-screencasts-published.html | 8 +- site/news/spark-summit-2013-is-a-wrap.html | 8 +- site/news/spark-summit-2014-videos-posted.html | 8 +- site/news/spark-summit-2015-videos-posted.html | 8 +- 
site/news/spark-summit-agenda-posted.html | 8 +- .../spark-summit-east-2015-videos-posted.html | 10 +- .../spark-summit-east-2016-cfp-closing.html | 8
[3/5] spark-website git commit: Update Spark website for the release of Apache Spark 2.1.0
http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/spark-2.0.0-preview.html -- diff --git a/site/news/spark-2.0.0-preview.html b/site/news/spark-2.0.0-preview.html index 64acf16..f135bf2 100644 --- a/site/news/spark-2.0.0-preview.html +++ b/site/news/spark-2.0.0-preview.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/spark-accepted-into-apache-incubator.html -- diff --git a/site/news/spark-accepted-into-apache-incubator.html b/site/news/spark-accepted-into-apache-incubator.html index 57e4881..257e17e 100644 --- a/site/news/spark-accepted-into-apache-incubator.html +++ b/site/news/spark-accepted-into-apache-incubator.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/spark-and-shark-in-the-news.html -- diff --git a/site/news/spark-and-shark-in-the-news.html b/site/news/spark-and-shark-in-the-news.html index 3994fe0..730a667 100644 --- a/site/news/spark-and-shark-in-the-news.html +++ b/site/news/spark-and-shark-in-the-news.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released + (Dec 28, 2016) + Spark wins CloudSort Benchmark as the most efficient engine (Nov 15, 2016) @@ -168,9 +171,6 @@ Spark 1.6.3 released (Nov 07, 2016) - Spark 2.0.1 released - (Oct 03, 2016) - Archive @@ -205,7 +205,7 @@ http://data-informed.com/spark-an-open-source-engine-for-iterative-data-mining/";>DataInformed interviewed two Spark users and wrote about their applications in anomaly detection, predictive analytics and data mining. -In other news, there will be a full day of tutorials on Spark and Shark at the http://strataconf.com/strata2013";>O’Reilly Strata conference in February. They include a three-hour http://strataconf.com/strata2013/public/schedule/detail/27438";>introduction to Spark, Shark and BDAS Tuesday morning, and a three-hour http://strataconf.com/strata2013/public/schedule/detail/27440";>hands-on exercise session. +In other news, there will be a full day of tutorials on Spark and Shark at the http://strataconf.com/strata2013";>O’Reilly Strata conference in February. They include a three-hour http://strataconf.com/strata2013/public/schedule/detail/27438";>introduction to Spark, Shark and BDAS Tuesday morning, and a three-hour http://strataconf.com/strata2013/public/schedule/detail/27440";>hands-on exercise session. 
http://git-wip-us.apache.org/repos/asf/spark-website/blob/e10180e6/site/news/spark-becomes-tlp.html -- diff --git a/site/news/spark-becomes-tlp.html b/site/news/spark-becomes-tlp.html index 803c919..7f6d730 100644 --- a/site/news/spark-becomes-tlp.html +++ b/site/news/spark-becomes-tlp.html @@ -106,7 +106,7 @@ Documentation - Latest Release (Spark 2.0.2) + Latest Release (Spark 2.1.0) Older Versions and Other Resources Frequently Asked Questions @@ -159,6 +159,9 @@ Latest News + Spark 2.1.0 released +
spark git commit: [SPARK-18567][SQL] Simplify CreateDataSourceTableAsSelectCommand
Repository: spark Updated Branches: refs/heads/master 93f35569f -> 7d19b6ab7 [SPARK-18567][SQL] Simplify CreateDataSourceTableAsSelectCommand ## What changes were proposed in this pull request? The `CreateDataSourceTableAsSelectCommand` is quite complex now, as it has a lot of work to do if the table already exists: 1. throw exception if we don't want to ignore it. 2. do some check and adjust the schema if we want to append data. 3. drop the table and create it again if we want to overwrite. The work 2 and 3 should be done by analyzer, so that we can also apply it to hive tables. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15996 from cloud-fan/append. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d19b6ab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d19b6ab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d19b6ab Branch: refs/heads/master Commit: 7d19b6ab7d75b95d9eb1c7e1f228d23fd482306e Parents: 93f3556 Author: Wenchen Fan Authored: Wed Dec 28 21:50:21 2016 -0800 Committer: Yin Huai Committed: Wed Dec 28 21:50:21 2016 -0800 -- .../org/apache/spark/sql/DataFrameWriter.scala | 78 + .../command/createDataSourceTables.scala| 167 +-- .../spark/sql/execution/datasources/rules.scala | 164 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 2 +- 4 files changed, 213 insertions(+), 198 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d19b6ab/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 9c5660a..405f38a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -23,11 +23,12 @@ import scala.collection.JavaConverters._ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource} +import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} +import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.StructType /** @@ -364,7 +365,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { throw new AnalysisException("Cannot create hive serde table with saveAsTable API") } -val tableExists = df.sparkSession.sessionState.catalog.tableExists(tableIdent) +val catalog = df.sparkSession.sessionState.catalog +val tableExists = catalog.tableExists(tableIdent) +val db = tableIdent.database.getOrElse(catalog.getCurrentDatabase) +val tableIdentWithDB = tableIdent.copy(database = Some(db)) +val tableName = tableIdentWithDB.unquotedString (tableExists, mode) match { case (true, SaveMode.Ignore) => @@ -373,39 +378,48 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { case (true, SaveMode.ErrorIfExists) => throw new AnalysisException(s"Table $tableIdent already exists.") - case _ => -val existingTable = if (tableExists) { - 
Some(df.sparkSession.sessionState.catalog.getTableMetadata(tableIdent)) -} else { - None + case (true, SaveMode.Overwrite) => +// Get all input data source relations of the query. +val srcRelations = df.logicalPlan.collect { + case LogicalRelation(src: BaseRelation, _, _) => src } -val storage = if (tableExists) { - existingTable.get.storage -} else { - DataSource.buildStorageFormatFromOptions(extraOptions.toMap) -} -val tableType = if (tableExists) { - existingTable.get.tableType -} else if (storage.locationUri.isDefined) { - CatalogTableType.EXTERNAL -} else { - CatalogTableType.MANAGED +EliminateSubqueryAliases(catalog.lookupRelation(tableIdentWithDB)) match { + // Only do the check if the table is a data source table (the relation is a BaseRelation). + case LogicalRelation(dest: BaseRelation, _, _) if srcRelations.contains(dest) => +throw new AnalysisException( +
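The hunk above is cut off at the new `SaveMode.Overwrite` branch. Below is a minimal sketch of the guard that branch implements, following the pattern visible in the diff; the helper name `assertNotSelfOverwrite` and the exact error-message text are placeholders, and `df.logicalPlan` is a `private[sql]` API, so this is an illustration of the check rather than user-facing code.

import org.apache.spark.sql.{AnalysisException, DataFrame}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation

// Refuse to overwrite a table that is also read by the query being written.
def assertNotSelfOverwrite(
    df: DataFrame,
    catalog: SessionCatalog,
    tableIdentWithDB: TableIdentifier): Unit = {
  // Every base relation that feeds the query.
  val srcRelations = df.logicalPlan.collect {
    case LogicalRelation(src: BaseRelation, _, _) => src
  }
  // Resolve the destination table and compare it with the sources.
  EliminateSubqueryAliases(catalog.lookupRelation(tableIdentWithDB)) match {
    case LogicalRelation(dest: BaseRelation, _, _) if srcRelations.contains(dest) =>
      throw new AnalysisException(
        s"Cannot overwrite table ${tableIdentWithDB.unquotedString} that is also being read from")
    case _ => // not a data source table, or no overlap: safe to overwrite
  }
}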
[01/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
Repository: spark-website Updated Branches: refs/heads/asf-site ecf94f284 -> d2bcf1854 http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/submitting-applications.html -- diff --git a/site/docs/2.1.0/submitting-applications.html b/site/docs/2.1.0/submitting-applications.html index fc18fa9..0c91739 100644 --- a/site/docs/2.1.0/submitting-applications.html +++ b/site/docs/2.1.0/submitting-applications.html @@ -151,14 +151,14 @@ packaging them into a .zip or .egg. This script takes care of setting up the classpath with Spark and its dependencies, and can support different cluster managers and deploy modes that Spark supports: -./bin/spark-submit \ +./bin/spark-submit \ --class\ --master \ --deploy-mode \ --conf = \ - ... # other options + ... # other options \ - [application-arguments] + [application-arguments] Some of the commonly used options are: @@ -194,23 +194,23 @@ you can also specify --supervise to make sure that the driver is au fails with non-zero exit code. To enumerate all such options available to spark-submit, run it with --help. Here are a few examples of common options: -# Run application locally on 8 cores +# Run application locally on 8 cores ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ - --master local[8] \ + --master local[8] \ /path/to/examples.jar \ - 100 + 100 -# Run on a Spark standalone cluster in client deploy mode +# Run on a Spark standalone cluster in client deploy mode ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master spark://207.184.161.138:7077 \ --executor-memory 20G \ --total-executor-cores 100 \ /path/to/examples.jar \ - 1000 + 1000 -# Run on a Spark standalone cluster in cluster deploy mode with supervise +# Run on a Spark standalone cluster in cluster deploy mode with supervise ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master spark://207.184.161.138:7077 \ @@ -219,26 +219,26 @@ run it with --help. Here are a few examples of common options: --executor-memory 20G \ --total-executor-cores 100 \ /path/to/examples.jar \ - 1000 + 1000 -# Run on a YARN cluster -export HADOOP_CONF_DIR=XXX +# Run on a YARN cluster +export HADOOP_CONF_DIR=XXX ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master yarn \ - --deploy-mode cluster \ # can be client for client mode + --deploy-mode cluster \ # can be client for client mode --executor-memory 20G \ --num-executors 50 \ /path/to/examples.jar \ - 1000 + 1000 -# Run a Python application on a Spark standalone cluster +# Run a Python application on a Spark standalone cluster ./bin/spark-submit \ --master spark://207.184.161.138:7077 \ examples/src/main/python/pi.py \ - 1000 + 1000 -# Run on a Mesos cluster in cluster deploy mode with supervise +# Run on a Mesos cluster in cluster deploy mode with supervise ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master mesos://207.184.161.138:7077 \ @@ -247,7 +247,7 @@ run it with --help. 
Here are a few examples of common options: --executor-memory 20G \ --total-executor-cores 100 \ http://path/to/examples.jar \ - 1000 + 1000 Master URLs http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/tuning.html -- diff --git a/site/docs/2.1.0/tuning.html b/site/docs/2.1.0/tuning.html index ca4ad9f..33a6316 100644 --- a/site/docs/2.1.0/tuning.html +++ b/site/docs/2.1.0/tuning.html @@ -129,23 +129,23 @@ - Data Serialization - Memory Tuning - Memory Management Overview - Determining Memory Consumption - Tuning Data Structures - Serialized RDD Storage - Garbage Collection Tuning + Data Serialization + Memory Tuning + Memory Management Overview + Determining Memory Consumption + Tuning Data Structures + Serialized RDD Storage + Garbage Collection Tuning - Other Considerations - Level of Parallelism - Memory Usage of Reduce Tasks - Broadcasting Large Variables - Data Locality + Other Considerations + Level of Parallelism + Memory Usage of Reduce Tasks + Broadcasting Large Variables + Data Locality - Summary + Summary Because of the in-memory nature of most Spark computations, Spark programs can be bottlenecked @@ -194,9 +194,9 @@ in the AllScalaRegistrar from the https://github.com/twitter/chill";>Twi To register your own custom classes with Kryo, use the registerKryoClasses method. -val conf = new SparkConf().setMaster(...).setAppName(...) +val conf = new SparkConf().setMaster(...).setAppNa
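The Kryo registration snippet at the end of the tuning.html hunk above is truncated; the pattern it refers to looks roughly like the sketch below. `MyClass1`, `MyClass2`, the master URL, and the application name are placeholders, not values from the diff.

import org.apache.spark.{SparkConf, SparkContext}

// Stand-in user classes for whatever types the job actually serializes.
class MyClass1
class MyClass2

val conf = new SparkConf()
  .setMaster("local[*]")            // placeholder master URL
  .setAppName("KryoRegistration")   // placeholder application name
// Registering classes up front lets Kryo write compact class identifiers
// instead of fully qualified class names in the serialized output.
conf.registerKryoClasses(Array(classOf[MyClass1], classOf[MyClass2]))
val sc = new SparkContext(conf)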
[25/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294 This version is built from the docs source code generated by applying https://github.com/apache/spark/pull/16294 to v2.1.0 (so, other changes in branch 2.1 will not affect the doc). Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/d2bcf185 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/d2bcf185 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/d2bcf185 Branch: refs/heads/asf-site Commit: d2bcf1854b0e0409495e2f1d3c6beaad923f6e6b Parents: ecf94f2 Author: Yin Huai Authored: Wed Dec 28 14:32:43 2016 -0800 Committer: Yin Huai Committed: Wed Dec 28 14:32:43 2016 -0800 -- site/docs/2.1.0/building-spark.html | 46 +- site/docs/2.1.0/building-with-maven.html| 14 +- site/docs/2.1.0/configuration.html | 52 +- site/docs/2.1.0/ec2-scripts.html| 174 site/docs/2.1.0/graphx-programming-guide.html | 198 ++--- site/docs/2.1.0/hadoop-provided.html| 14 +- .../img/structured-streaming-watermark.png | Bin 0 -> 252000 bytes site/docs/2.1.0/img/structured-streaming.pptx | Bin 1105413 -> 1113902 bytes site/docs/2.1.0/job-scheduling.html | 40 +- site/docs/2.1.0/ml-advanced.html| 10 +- .../2.1.0/ml-classification-regression.html | 838 +- site/docs/2.1.0/ml-clustering.html | 124 +-- site/docs/2.1.0/ml-collaborative-filtering.html | 56 +- site/docs/2.1.0/ml-features.html| 764 site/docs/2.1.0/ml-migration-guides.html| 16 +- site/docs/2.1.0/ml-pipeline.html| 178 ++-- site/docs/2.1.0/ml-tuning.html | 172 ++-- site/docs/2.1.0/mllib-clustering.html | 186 ++-- .../2.1.0/mllib-collaborative-filtering.html| 48 +- site/docs/2.1.0/mllib-data-types.html | 208 ++--- site/docs/2.1.0/mllib-decision-tree.html| 94 +- .../2.1.0/mllib-dimensionality-reduction.html | 28 +- site/docs/2.1.0/mllib-ensembles.html| 182 ++-- site/docs/2.1.0/mllib-evaluation-metrics.html | 302 +++ site/docs/2.1.0/mllib-feature-extraction.html | 122 +-- .../2.1.0/mllib-frequent-pattern-mining.html| 28 +- site/docs/2.1.0/mllib-isotonic-regression.html | 38 +- site/docs/2.1.0/mllib-linear-methods.html | 174 ++-- site/docs/2.1.0/mllib-naive-bayes.html | 24 +- site/docs/2.1.0/mllib-optimization.html | 50 +- site/docs/2.1.0/mllib-pmml-model-export.html| 35 +- site/docs/2.1.0/mllib-statistics.html | 180 ++-- site/docs/2.1.0/programming-guide.html | 302 +++ site/docs/2.1.0/quick-start.html| 166 ++-- site/docs/2.1.0/running-on-mesos.html | 52 +- site/docs/2.1.0/running-on-yarn.html| 27 +- site/docs/2.1.0/spark-standalone.html | 30 +- site/docs/2.1.0/sparkr.html | 145 ++-- site/docs/2.1.0/sql-programming-guide.html | 819 +- site/docs/2.1.0/storage-openstack-swift.html| 12 +- site/docs/2.1.0/streaming-custom-receivers.html | 26 +- .../2.1.0/streaming-kafka-0-10-integration.html | 52 +- .../docs/2.1.0/streaming-programming-guide.html | 416 - .../structured-streaming-kafka-integration.html | 44 +- .../structured-streaming-programming-guide.html | 864 --- site/docs/2.1.0/submitting-applications.html| 36 +- site/docs/2.1.0/tuning.html | 30 +- 47 files changed, 3926 insertions(+), 3490 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/building-spark.html -- diff --git a/site/docs/2.1.0/building-spark.html b/site/docs/2.1.0/building-spark.html index b3a720c..5c20245 100644 --- a/site/docs/2.1.0/building-spark.html +++ b/site/docs/2.1.0/building-spark.html @@ -127,33 +127,33 @@ - Building Apache Spark - Apache Maven - Setting up Maven’s Memory 
Usage - build/mvn + Building Apache Spark + Apache Maven + Setting up Maven’s Memory Usage + build/mvn - Building a Runnable Distribution - Specifying the Hadoop Version - Building With Hive and JDBC Support - Packaging without Hadoop Dependencies for YARN - Building with Mesos support - Building for Scala 2.10 - Building submodules individually - Continuous Compilation - Speeding up Compilation with Zinc - Building with SBT - Encrypted Filesystems - IntelliJ IDEA or Eclipse + Bu
[11/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-linear-methods.html -- diff --git a/site/docs/2.1.0/mllib-linear-methods.html b/site/docs/2.1.0/mllib-linear-methods.html index 46a1a25..428d778 100644 --- a/site/docs/2.1.0/mllib-linear-methods.html +++ b/site/docs/2.1.0/mllib-linear-methods.html @@ -307,23 +307,23 @@ - Mathematical formulation - Loss functions - Regularizers - Optimization + Mathematical formulation + Loss functions + Regularizers + Optimization - Classification - Linear Support Vector Machines (SVMs) - Logistic regression + Classification + Linear Support Vector Machines (SVMs) + Logistic regression - Regression - Linear least squares, Lasso, and ridge regression - Streaming linear regression + Regression + Linear least squares, Lasso, and ridge regression + Streaming linear regression - Implementation (developer) + Implementation (developer) \[ @@ -489,7 +489,7 @@ error. Refer to the SVMWithSGD Scala docs and SVMModel Scala docs for details on the API. -import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} +import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.util.MLUtils @@ -534,14 +534,14 @@ this way as well. For example, the following code produces an L1 regularized variant of SVMs with regularization parameter set to 0.1, and runs the training algorithm for 200 iterations. -import org.apache.spark.mllib.optimization.L1Updater +import org.apache.spark.mllib.optimization.L1Updater val svmAlg = new SVMWithSGD() svmAlg.optimizer .setNumIterations(200) .setRegParam(0.1) .setUpdater(new L1Updater) -val modelL1 = svmAlg.run(training) +val modelL1 = svmAlg.run(training) @@ -554,7 +554,7 @@ that is equivalent to the provided example in Scala is given below: Refer to the SVMWithSGD Java docs and SVMModel Java docs for details on the API. -import scala.Tuple2; +import scala.Tuple2; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; @@ -591,7 +591,7 @@ that is equivalent to the provided example in Scala is given below: // Get evaluation metrics. BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels)); + new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels)); double auROC = metrics.areaUnderROC(); System.out.println("Area under ROC = " + auROC); @@ -610,14 +610,14 @@ this way as well. For example, the following code produces an L1 regularized variant of SVMs with regularization parameter set to 0.1, and runs the training algorithm for 200 iterations. -import org.apache.spark.mllib.optimization.L1Updater; +import org.apache.spark.mllib.optimization.L1Updater; -SVMWithSGD svmAlg = new SVMWithSGD(); +SVMWithSGD svmAlg = new SVMWithSGD(); svmAlg.optimizer() .setNumIterations(200) .setRegParam(0.1) - .setUpdater(new L1Updater()); -final SVMModel modelL1 = svmAlg.run(training.rdd()); + .setUpdater(new L1Updater()); +final SVMModel modelL1 = svmAlg.run(training.rdd()); In order to run the above application, follow the instructions provided in the Self-Contained @@ -632,28 +632,28 @@ and make predictions with the resulting model to compute the training error. Refer to the SVMWithSGD Python docs and SVMModel Python docs for more details on the API. 
-from pyspark.mllib.classification import SVMWithSGD, SVMModel +from pyspark.mllib.classification import SVMWithSGD, SVMModel from pyspark.mllib.regression import LabeledPoint -# Load and parse the data +# Load and parse the data def parsePoint(line): -values = [float(x) for x in line.split(' ')] +values = [float(x) for x in line.split(' ')] return LabeledPoint(values[0], values[1:]) -data = sc.textFile("data/mllib/sample_svm_data.txt") +data = sc.textFile("data/mllib/sample_svm_data.txt") parsedData = data.map(parsePoint) -# Build the model +# Build the model model = SVMWithSGD.train(parsedData, iterations=100) -# Evaluating the model on training data +# Evaluating the model on training data labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) -print("Training Error = " + str(trainErr)) +print("Training Error = " + str(trainErr)) -# Save and load model -model.save(sc, "target/tmp/pythonSVMWithSGDModel") -sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel") +# Save and load model +model.save(sc, "target/tmp/pythonSVMWithSGDModel") +sameMo
[06/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/sql-programming-guide.html -- diff --git a/site/docs/2.1.0/sql-programming-guide.html b/site/docs/2.1.0/sql-programming-guide.html index 17f5981..4534a98 100644 --- a/site/docs/2.1.0/sql-programming-guide.html +++ b/site/docs/2.1.0/sql-programming-guide.html @@ -127,95 +127,95 @@ - Overview - SQL - Datasets and DataFrames + Overview + SQL + Datasets and DataFrames - Getting Started - Starting Point: SparkSession - Creating DataFrames - Untyped Dataset Operations (aka DataFrame Operations) - Running SQL Queries Programmatically - Global Temporary View - Creating Datasets - Interoperating with RDDs - Inferring the Schema Using Reflection - Programmatically Specifying the Schema + Getting Started + Starting Point: SparkSession + Creating DataFrames + Untyped Dataset Operations (aka DataFrame Operations) + Running SQL Queries Programmatically + Global Temporary View + Creating Datasets + Interoperating with RDDs + Inferring the Schema Using Reflection + Programmatically Specifying the Schema - Data Sources - Generic Load/Save Functions - Manually Specifying Options - Run SQL on files directly - Save Modes - Saving to Persistent Tables + Data Sources + Generic Load/Save Functions + Manually Specifying Options + Run SQL on files directly + Save Modes + Saving to Persistent Tables - Parquet Files - Loading Data Programmatically - Partition Discovery - Schema Merging - Hive metastore Parquet table conversion - Hive/Parquet Schema Reconciliation - Metadata Refreshing + Parquet Files + Loading Data Programmatically + Partition Discovery + Schema Merging + Hive metastore Parquet table conversion + Hive/Parquet Schema Reconciliation + Metadata Refreshing - Configuration + Configuration - JSON Datasets - Hive Tables - Interacting with Different Versions of Hive Metastore + JSON Datasets + Hive Tables + Interacting with Different Versions of Hive Metastore - JDBC To Other Databases - Troubleshooting + JDBC To Other Databases + Troubleshooting - Performance Tuning - Caching Data In Memory - Other Configuration Options + Performance Tuning + Caching Data In Memory + Other Configuration Options - Distributed SQL Engine - Running the Thrift JDBC/ODBC server - Running the Spark SQL CLI + Distributed SQL Engine + Running the Thrift JDBC/ODBC server + Running the Spark SQL CLI - Migration Guide - Upgrading From Spark SQL 2.0 to 2.1 - Upgrading From Spark SQL 1.6 to 2.0 - Upgrading From Spark SQL 1.5 to 1.6 - Upgrading From Spark SQL 1.4 to 1.5 - Upgrading from Spark SQL 1.3 to 1.4 - DataFrame data reader/writer interface - DataFrame.groupBy retains grouping columns - Behavior change on DataFrame.withColumn + Migration Guide + Upgrading From Spark SQL 2.0 to 2.1 + Upgrading From Spark SQL 1.6 to 2.0 + Upgrading From Spark SQL 1.5 to 1.6 + Upgrading From Spark SQL 1.4 to 1.5 + Upgrading from Spark SQL 1.3 to 1.4 + DataFrame data reader/writer interface + DataFrame.groupBy retains grouping columns + Behavior change on DataFrame.withColumn - Upgrading from Spark SQL 1.0-1.2 to 1.3 - Rename of SchemaRDD to DataFrame - Unification of the Java and Scala APIs - Isolation of Implicit Conversions and Removal of dsl Package (Scala-only) - Removal of the type aliases in org.apache.spark.sql for DataType (Scala-only) - UDF Registration Moved to sqlContext.udf (Java & Scala) - Python DataTypes No Longer Singletons + Upgrading from Spark SQL 1.0-1.2 to 1.3 + Rename of SchemaRDD to DataFrame + Unification of the Java and Scala APIs + Isolation 
of Implicit Conversions and Removal of dsl Package (Scala-only) + Removal of the type aliases in org.apache.spark.sql for DataType (Scala-only) + UDF Registration Moved to sqlContext.udf (Java & Scala) + Python DataTypes No Longer Sing
[19/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-migration-guides.html -- diff --git a/site/docs/2.1.0/ml-migration-guides.html b/site/docs/2.1.0/ml-migration-guides.html index 5e8a913..24dfc31 100644 --- a/site/docs/2.1.0/ml-migration-guides.html +++ b/site/docs/2.1.0/ml-migration-guides.html @@ -344,21 +344,21 @@ for converting to mllib.linalg types. -import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.mllib.util.MLUtils // convert DataFrame columns val convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) val convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) // convert a single vector or matrix val mlVec: org.apache.spark.ml.linalg.Vector = mllibVec.asML -val mlMat: org.apache.spark.ml.linalg.Matrix = mllibMat.asML +val mlMat: org.apache.spark.ml.linalg.Matrix = mllibMat.asML Refer to the MLUtils Scala docs for further detail. -import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.Dataset; // convert DataFrame columns @@ -366,21 +366,21 @@ for converting to mllib.linalg types. DatasetconvertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF); // convert a single vector or matrix org.apache.spark.ml.linalg.Vector mlVec = mllibVec.asML(); -org.apache.spark.ml.linalg.Matrix mlMat = mllibMat.asML(); +org.apache.spark.ml.linalg.Matrix mlMat = mllibMat.asML(); Refer to the MLUtils Java docs for further detail. -from pyspark.mllib.util import MLUtils +from pyspark.mllib.util import MLUtils -# convert DataFrame columns +# convert DataFrame columns convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) -# convert a single vector or matrix +# convert a single vector or matrix mlVec = mllibVec.asML() -mlMat = mllibMat.asML() +mlMat = mllibMat.asML() Refer to the MLUtils Python docs for further detail. http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-pipeline.html -- diff --git a/site/docs/2.1.0/ml-pipeline.html b/site/docs/2.1.0/ml-pipeline.html index fe17564..b57afde 100644 --- a/site/docs/2.1.0/ml-pipeline.html +++ b/site/docs/2.1.0/ml-pipeline.html @@ -331,27 +331,27 @@ machine learning pipelines. Table of Contents - Main concepts in Pipelines - DataFrame - Pipeline components - Transformers - Estimators - Properties of pipeline components + Main concepts in Pipelines + DataFrame + Pipeline components + Transformers + Estimators + Properties of pipeline components - Pipeline - How it works - Details + Pipeline + How it works + Details - Parameters - Saving and Loading Pipelines + Parameters + Saving and Loading Pipelines - Code examples - Example: Estimator, Transformer, and Param - Example: Pipeline - Model selection (hyperparameter tuning) + Code examples + Example: Estimator, Transformer, and Param + Example: Pipeline + Model selection (hyperparameter tuning) @@ -541,7 +541,7 @@ Refer to the [`Estimator` Scala docs](api/scala/index.html#org.apache.spark.ml.E the [`Transformer` Scala docs](api/scala/index.html#org.apache.spark.ml.Transformer) and the [`Params` Scala docs](api/scala/index.html#org.apache.spark.ml.param.Params) for details on the API. 
-import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Row @@ -601,7 +601,7 @@ the [`Params` Scala docs](api/scala/index.html#org.apache.spark.ml.param.Params) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => -println(s"($features, $label) -> prob=$prob, prediction=$prediction") +println(s"($features, $label) -> prob=$prob, prediction=$prediction") } Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala" in the Spark repo. @@ -612,7 +612,7 @@ Refer to the [`Estimator` Java docs](api/java/org/apache/spark/ml/Estimator.html the [`Transformer` Java docs](api/java/org/apache/spark/ml/Transformer.html) and the [`Params` Java docs](api/java/org/apache/spark/ml/param/Params.html) for details on the API. -import java.util.Arrays; +import java.util.Arrays; import java.util.List; import org.apach
[24/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/graphx-programming-guide.html -- diff --git a/site/docs/2.1.0/graphx-programming-guide.html b/site/docs/2.1.0/graphx-programming-guide.html index 780d1ab..08b3380 100644 --- a/site/docs/2.1.0/graphx-programming-guide.html +++ b/site/docs/2.1.0/graphx-programming-guide.html @@ -129,42 +129,42 @@ - Overview - Getting Started - The Property Graph - Example Property Graph + Overview + Getting Started + The Property Graph + Example Property Graph - Graph Operators - Summary List of Operators - Property Operators - Structural Operators - Join Operators - Neighborhood Aggregation - Aggregate Messages (aggregateMessages) - Map Reduce Triplets Transition Guide (Legacy) - Computing Degree Information - Collecting Neighbors + Graph Operators + Summary List of Operators + Property Operators + Structural Operators + Join Operators + Neighborhood Aggregation + Aggregate Messages (aggregateMessages) + Map Reduce Triplets Transition Guide (Legacy) + Computing Degree Information + Collecting Neighbors - Caching and Uncaching + Caching and Uncaching - Pregel API - Graph Builders - Vertex and Edge RDDs - VertexRDDs - EdgeRDDs + Pregel API + Graph Builders + Vertex and Edge RDDs + VertexRDDs + EdgeRDDs - Optimized Representation - Graph Algorithms - PageRank - Connected Components - Triangle Counting + Optimized Representation + Graph Algorithms + PageRank + Connected Components + Triangle Counting - Examples + Examples @@ -188,10 +188,10 @@ operators (e.g., subgraph, import org.apache.spark._ +import org.apache.spark._ import org.apache.spark.graphx._ // To make some of the examples work we will also need RDD -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.RDD If you are not using the Spark shell you will also need a SparkContext. To learn more about getting started with Spark refer to the Spark Quick Start Guide. @@ -222,11 +222,11 @@ arrays. This can be accomplished through inheritance. For example to model users and products as a bipartite graph we might do the following: -class VertexProperty() +class VertexProperty() case class UserProperty(val name: String) extends VertexProperty case class ProductProperty(val name: String, val price: Double) extends VertexProperty // The graph might then have the type: -var graph: Graph[VertexProperty, String] = null +var graph: Graph[VertexProperty, String] = null Like RDDs, property graphs are immutable, distributed, and fault-tolerant. Changes to the values or structure of the graph are accomplished by producing a new graph with the desired changes. Note @@ -239,10 +239,10 @@ RDDs, each partition of the graph can be recreated on a different machine in the properties for each vertex and edge. As a consequence, the graph class contains members to access the vertices and edges of the graph: -class Graph[VD, ED] { +class Graph[VD, ED] { val vertices: VertexRDD[VD] val edges: EdgeRDD[ED] -} +} The classes VertexRDD[VD] and EdgeRDD[ED] extend and are optimized versions of RDD[(VertexId, VD)] and RDD[Edge[ED]] respectively. 
Both VertexRDD[VD] and EdgeRDD[ED] provide additional @@ -264,7 +264,7 @@ with a string describing the relationships between collaborators: The resulting graph would have the type signature: -val userGraph: Graph[(String, String), String] +val userGraph: Graph[(String, String), String] There are numerous ways to construct a property graph from raw files, RDDs, and even synthetic generators and these are discussed in more detail in the section on @@ -272,7 +272,7 @@ generators and these are discussed in more detail in the section on Graph object. For example the following code constructs a graph from a collection of RDDs: -// Assume the SparkContext has already been constructed +// Assume the SparkContext has already been constructed val sc: SparkContext // Create an RDD for the vertices val users: RDD[(VertexId, (String, String))] = @@ -285,7 +285,7 @@ code constructs a graph from a collection of RDDs: // Define a default user in case there are relationship with missing user val defaultUser = ("John Doe", "Missing") // Build the initial Graph -val graph = Graph(users, relationships, defaultUser) +val graph = Graph(users, relationships, defaultUser) In the above example we make use of the Edge case class. Edges have a srcId and a dstId corresponding to the source and destination vertex identifiers. In addition, the Edge @@ -294,11 +294,11 @@
[08/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/quick-start.html -- diff --git a/site/docs/2.1.0/quick-start.html b/site/docs/2.1.0/quick-start.html index 76e67e1..9d5fad7 100644 --- a/site/docs/2.1.0/quick-start.html +++ b/site/docs/2.1.0/quick-start.html @@ -129,14 +129,14 @@ - Interactive Analysis with the Spark Shell - Basics - More on RDD Operations - Caching + Interactive Analysis with the Spark Shell + Basics + More on RDD Operations + Caching - Self-Contained Applications - Where to Go from Here + Self-Contained Applications + Where to Go from Here This tutorial provides a quick introduction to using Spark. We will first introduce the API through Spark’s @@ -164,26 +164,26 @@ or Python. Start it by running the following in the Spark directory: Spark’s primary abstraction is a distributed collection of items called a Resilient Distributed Dataset (RDD). RDDs can be created from Hadoop InputFormats (such as HDFS files) or by transforming other RDDs. Let’s make a new RDD from the text of the README file in the Spark source directory: -scala> val textFile = sc.textFile("README.md") -textFile: org.apache.spark.rdd.RDD[String] = README.md MapPartitionsRDD[1] at textFile at:25 +scala> val textFile = sc.textFile("README.md") +textFile: org.apache.spark.rdd.RDD[String] = README.md MapPartitionsRDD[1] at textFile at :25 RDDs have actions, which return values, and transformations, which return pointers to new RDDs. Let’s start with a few actions: -scala> textFile.count() // Number of items in this RDD +scala> textFile.count() // Number of items in this RDD res0: Long = 126 // May be different from yours as README.md will change over time, similar to other outputs scala> textFile.first() // First item in this RDD -res1: String = # Apache Spark +res1: String = # Apache Spark Now let’s use a transformation. We will use the filter transformation to return a new RDD with a subset of the items in the file. -scala> val linesWithSpark = textFile.filter(line => line.contains("Spark")) -linesWithSpark: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at :27 +scala> val linesWithSpark = textFile.filter(line => line.contains("Spark")) +linesWithSpark: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at :27 We can chain together transformations and actions: -scala> textFile.filter(line => line.contains("Spark")).count() // How many lines contain "Spark"? -res3: Long = 15 +scala> textFile.filter(line => line.contains("Spark")).count() // How many lines contain "Spark"? +res3: Long = 15 @@ -193,24 +193,24 @@ or Python. Start it by running the following in the Spark directory: Spark’s primary abstraction is a distributed collection of items called a Resilient Distributed Dataset (RDD). RDDs can be created from Hadoop InputFormats (such as HDFS files) or by transforming other RDDs. Let’s make a new RDD from the text of the README file in the Spark source directory: ->>> textFile = sc.textFile("README.md") +>>> textFile = sc.textFile("README.md") RDDs have actions, which return values, and transformations, which return pointers to new RDDs. Let’s start with a few actions: ->>> textFile.count() # Number of items in this RDD +>>> textFile.count() # Number of items in this RDD 126 ->>> textFile.first() # First item in this RDD -u'# Apache Spark' +>>> textFile.first() # First item in this RDD +u'# Apache Spark' Now let’s use a transformation. We will use the filter transformation to return a new RDD with a subset of the items in the file. 
->>> linesWithSpark = textFile.filter(lambda line: "Spark" in line) +>>> linesWithSpark = textFile.filter(lambda line: "Spark" in line) We can chain together transformations and actions: ->>> textFile.filter(lambda line: "Spark" in line).count() # How many lines contain "Spark"? -15 +>>> textFile.filter(lambda line: "Spark" in line).count() # How many lines contain "Spark"? +15 @@ -221,38 +221,38 @@ or Python. Start it by running the following in the Spark directory: -scala> textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b) -res4: Long = 15 +scala> textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b) +res4: Long = 15 This first maps a line to an integer value, creating a new RDD. reduce is called on that RDD to find the largest line count. The arguments to map and reduce are Scala function literals (closures), and can use any language feature or Scala/Java library. For example, we can easily call functions declared elsewhere. We’ll use
[23/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/hadoop-provided.html -- diff --git a/site/docs/2.1.0/hadoop-provided.html b/site/docs/2.1.0/hadoop-provided.html index ff7afb7..9d77cf0 100644 --- a/site/docs/2.1.0/hadoop-provided.html +++ b/site/docs/2.1.0/hadoop-provided.html @@ -133,16 +133,16 @@ Apache Hadoop For Apache distributions, you can use Hadoop’s ‘classpath’ command. For instance: -### in conf/spark-env.sh ### +### in conf/spark-env.sh ### -# If 'hadoop' binary is on your PATH -export SPARK_DIST_CLASSPATH=$(hadoop classpath) +# If 'hadoop' binary is on your PATH +export SPARK_DIST_CLASSPATH=$(hadoop classpath) -# With explicit path to 'hadoop' binary -export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath) +# With explicit path to 'hadoop' binary +export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath) -# Passing a Hadoop configuration directory -export SPARK_DIST_CLASSPATH=$(hadoop --config /path/to/configs classpath) +# Passing a Hadoop configuration directory +export SPARK_DIST_CLASSPATH=$(hadoop --config /path/to/configs classpath) http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/img/structured-streaming-watermark.png -- diff --git a/site/docs/2.1.0/img/structured-streaming-watermark.png b/site/docs/2.1.0/img/structured-streaming-watermark.png new file mode 100644 index 000..f21fbda Binary files /dev/null and b/site/docs/2.1.0/img/structured-streaming-watermark.png differ http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/img/structured-streaming.pptx -- diff --git a/site/docs/2.1.0/img/structured-streaming.pptx b/site/docs/2.1.0/img/structured-streaming.pptx index 6aad2ed..f5bdfc0 100644 Binary files a/site/docs/2.1.0/img/structured-streaming.pptx and b/site/docs/2.1.0/img/structured-streaming.pptx differ http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/job-scheduling.html -- diff --git a/site/docs/2.1.0/job-scheduling.html b/site/docs/2.1.0/job-scheduling.html index 53161c2..9651607 100644 --- a/site/docs/2.1.0/job-scheduling.html +++ b/site/docs/2.1.0/job-scheduling.html @@ -127,24 +127,24 @@ - Overview - Scheduling Across Applications - Dynamic Resource Allocation - Configuration and Setup - Resource Allocation Policy - Request Policy - Remove Policy + Overview + Scheduling Across Applications + Dynamic Resource Allocation + Configuration and Setup + Resource Allocation Policy + Request Policy + Remove Policy - Graceful Decommission of Executors + Graceful Decommission of Executors - Scheduling Within an Application - Fair Scheduler Pools - Default Behavior of Pools - Configuring Pool Properties + Scheduling Within an Application + Fair Scheduler Pools + Default Behavior of Pools + Configuring Pool Properties @@ -321,9 +321,9 @@ mode is best for multi-user settings. To enable the fair scheduler, simply set the spark.scheduler.mode property to FAIR when configuring a SparkContext: -val conf = new SparkConf().setMaster(...).setAppName(...) +val conf = new SparkConf().setMaster(...).setAppName(...) conf.set("spark.scheduler.mode", "FAIR") -val sc = new SparkContext(conf) +val sc = new SparkContext(conf) Fair Scheduler Pools @@ -337,15 +337,15 @@ many concurrent jobs they have instead of giving jobs equal shares. Thi adding the spark.scheduler.pool “local property” to the SparkContext in the thread that’s submitting them. 
This is done as follows: -// Assuming sc is your SparkContext variable -sc.setLocalProperty("spark.scheduler.pool", "pool1") +// Assuming sc is your SparkContext variable +sc.setLocalProperty("spark.scheduler.pool", "pool1") After setting this local property, all jobs submitted within this thread (by calls in this thread to RDD.save, count, collect, etc) will use this pool name. The setting is per-thread to make it easy to have a thread run multiple jobs on behalf of the same user. If you’d like to clear the pool that a thread is associated with, simply call: -sc.setLocalProperty("spark.scheduler.pool", null) +sc.setLocalProperty("spark.scheduler.pool", null) Default Behavior of Pools @@ -379,12 +379,12 @@ of the cluster. By default, each pool’s minShare is 0. and setting a spark.scheduler.allocation.file property in your SparkConf. -conf.set("spark.sc
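The last hunk above is cut off mid-call; the property being set there is spark.scheduler.allocation.file, and the usual shape of that call is roughly the following sketch (the file path is a placeholder).

import org.apache.spark.SparkConf

val conf = new SparkConf()
// Point the fair scheduler at an XML allocation file that declares the pools
// (schedulingMode, weight, minShare per pool); the path below is a placeholder.
conf.set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml")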
[12/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-feature-extraction.html -- diff --git a/site/docs/2.1.0/mllib-feature-extraction.html b/site/docs/2.1.0/mllib-feature-extraction.html index 4726b37..f8cd98e 100644 --- a/site/docs/2.1.0/mllib-feature-extraction.html +++ b/site/docs/2.1.0/mllib-feature-extraction.html @@ -307,32 +307,32 @@ - TF-IDF - Word2Vec - Model - Example + TF-IDF + Word2Vec + Model + Example - StandardScaler - Model Fitting - Example + StandardScaler + Model Fitting + Example - Normalizer - Example + Normalizer + Example - ChiSqSelector - Model Fitting - Example + ChiSqSelector + Model Fitting + Example - ElementwiseProduct - Example + ElementwiseProduct + Example - PCA - Example + PCA + Example @@ -390,7 +390,7 @@ Each record could be an iterable of strings or other types. Refer to the HashingTF Scala docs for details on the API. -import org.apache.spark.mllib.feature.{HashingTF, IDF} +import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -424,24 +424,24 @@ Each record could be an iterable of strings or other types. Refer to the HashingTF Python docs for details on the API. -from pyspark.mllib.feature import HashingTF, IDF +from pyspark.mllib.feature import HashingTF, IDF -# Load documents (one per line). -documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" ")) +# Load documents (one per line). +documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" ")) hashingTF = HashingTF() tf = hashingTF.transform(documents) -# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: -# First to compute the IDF vector and second to scale the term frequencies by IDF. +# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: +# First to compute the IDF vector and second to scale the term frequencies by IDF. tf.cache() idf = IDF().fit(tf) tfidf = idf.transform(tf) -# spark.mllib's IDF implementation provides an option for ignoring terms -# which occur in less than a minimum number of documents. -# In such cases, the IDF for these terms is set to 0. -# This feature can be used by passing the minDocFreq value to the IDF constructor. +# spark.mllib's IDF implementation provides an option for ignoring terms +# which occur in less than a minimum number of documents. +# In such cases, the IDF for these terms is set to 0. +# This feature can be used by passing the minDocFreq value to the IDF constructor. idfIgnore = IDF(minDocFreq=2).fit(tf) tfidfIgnore = idfIgnore.transform(tf) @@ -467,7 +467,7 @@ skip-gram model is to maximize the average log-likelihood \[ \frac{1}{T} \sum_{t = 1}^{T}\sum_{j=-k}^{j=k} \log p(w_{t+j} | w_t) \] -where $k$ is the size of the training window. +where $k$ is the size of the training window. In the skip-gram model, every word $w$ is associated with two vectors $u_w$ and $v_w$ which are vector representations of $w$ as word and context respectively. The probability of correctly @@ -475,7 +475,7 @@ predicting word $w_i$ given word $w_j$ is determined by the softmax model, which \[ p(w_i | w_j ) = \frac{\exp(u_{w_i}^{\top}v_{w_j})}{\sum_{l=1}^{V} \exp(u_l^{\top}v_{w_j})} \] -where $V$ is the vocabulary size. +where $V$ is the vocabulary size. The skip-gram model with softmax is expensive because the cost of computing $\log p(w_i | w_j)$ is proportional to $V$, which can be easily in order of millions. 
To speed up training of Word2Vec, @@ -488,13 +488,13 @@ $O(\log(V))$ construct a Word2Vec instance and then fit a Word2VecModel with the input data. Finally, we display the top 40 synonyms of the specified word. To run the example, first download the http://mattmahoney.net/dc/text8.zip";>text8 data and extract it to your preferred directory. -Here we assume the extracted file is text8 and in same directory as you run the spark shell. +Here we assume the extracted file is text8 and in same directory as you run the spark shell. Refer to the Word2Vec Scala docs for details on the API. -import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} +import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) @@ -505,7 +505,7 @@ Here we assume the extracted file is text8 and in same directory as val synonyms = model.findSynonyms("1", 5) for((synonym, cosineSimilarity) <- synon
[17/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-clustering.html -- diff --git a/site/docs/2.1.0/mllib-clustering.html b/site/docs/2.1.0/mllib-clustering.html index 9667606..1b50dab 100644 --- a/site/docs/2.1.0/mllib-clustering.html +++ b/site/docs/2.1.0/mllib-clustering.html @@ -366,12 +366,12 @@ models are trained for each cluster). The spark.mllib package supports the following models: - K-means - Gaussian mixture - Power iteration clustering (PIC) - Latent Dirichlet allocation (LDA) - Bisecting k-means - Streaming k-means + K-means + Gaussian mixture + Power iteration clustering (PIC) + Latent Dirichlet allocation (LDA) + Bisecting k-means + Streaming k-means K-means @@ -408,7 +408,7 @@ optimal k is usually one where there is an “elbow” in the W Refer to the KMeans Scala docs and KMeansModel Scala docs for details on the API. -import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} +import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // Load and parse the data @@ -440,7 +440,7 @@ that is equivalent to the provided example in Scala is given below: Refer to the KMeans Java docs and KMeansModel Java docs for details on the API. -import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.clustering.KMeans; import org.apache.spark.mllib.clustering.KMeansModel; @@ -470,7 +470,7 @@ that is equivalent to the provided example in Scala is given below: KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); System.out.println("Cluster centers:"); -for (Vector center: clusters.clusterCenters()) { +for (Vector center: clusters.clusterCenters()) { System.out.println(" " + center); } double cost = clusters.computeCost(parsedData.rdd()); @@ -498,29 +498,29 @@ fact the optimal k is usually one where there is an “elbow” Refer to the KMeans Python docs and KMeansModel Python docs for more details on the API. 
-from numpy import array +from numpy import array from math import sqrt from pyspark.mllib.clustering import KMeans, KMeansModel -# Load and parse the data -data = sc.textFile("data/mllib/kmeans_data.txt") -parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) +# Load and parse the data +data = sc.textFile("data/mllib/kmeans_data.txt") +parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) -# Build the model (cluster the data) -clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random") +# Build the model (cluster the data) +clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random") -# Evaluate clustering by computing Within Set Sum of Squared Errors +# Evaluate clustering by computing Within Set Sum of Squared Errors def error(point): center = clusters.centers[clusters.predict(point)] return sqrt(sum([x**2 for x in (point - center)])) WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) -print("Within Set Sum of Squared Error = " + str(WSSSE)) +print("Within Set Sum of Squared Error = " + str(WSSSE)) -# Save and load model -clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel") -sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel") +# Save and load model +clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel") +sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel") Find full example code at "examples/src/main/python/mllib/k_means_example.py" in the Spark repo. @@ -554,7 +554,7 @@ to the algorithm. We then output the parameters of the mixture model. Refer to the GaussianMixture Scala docs and GaussianMixtureModel Scala docs for details on the API. -import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} +import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // Load and parse the data @@ -587,7 +587,7 @@ that is equivalent to the provided example in Scala is given below: Refer to the GaussianMixture Java docs and GaussianMixtureModel Java docs for details on the API. -import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.clustering.GaussianMixture; import org.apache.spark.mllib.clustering.GaussianMixtureModel; @@ -612,7 +612,7 @@ that is equivalent to the provided example in Scala is given below: parsedData.cache(); // Cluster the data into two classes using Gaussian
[13/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-evaluation-metrics.html -- diff --git a/site/docs/2.1.0/mllib-evaluation-metrics.html b/site/docs/2.1.0/mllib-evaluation-metrics.html index 4bc636d..0d5bb3b 100644 --- a/site/docs/2.1.0/mllib-evaluation-metrics.html +++ b/site/docs/2.1.0/mllib-evaluation-metrics.html @@ -307,20 +307,20 @@ - Classification model evaluation - Binary classification - Threshold tuning + Classification model evaluation + Binary classification + Threshold tuning - Multiclass classification - Label based metrics + Multiclass classification + Label based metrics - Multilabel classification - Ranking systems + Multilabel classification + Ranking systems - Regression model evaluation + Regression model evaluation spark.mllib comes with a number of machine learning algorithms that can be used to learn from and make predictions @@ -421,7 +421,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation Refer to the LogisticRegressionWithLBFGS Scala docs and BinaryClassificationMetrics Scala docs for details on the API. -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils @@ -453,13 +453,13 @@ data, and evaluate the performance of the algorithm by several binary evaluation // Precision by threshold val precision = metrics.precisionByThreshold precision.foreach { case (t, p) => - println(s"Threshold: $t, Precision: $p") + println(s"Threshold: $t, Precision: $p") } // Recall by threshold val recall = metrics.recallByThreshold recall.foreach { case (t, r) => - println(s"Threshold: $t, Recall: $r") + println(s"Threshold: $t, Recall: $r") } // Precision-Recall Curve @@ -468,13 +468,13 @@ data, and evaluate the performance of the algorithm by several binary evaluation // F-measure val f1Score = metrics.fMeasureByThreshold f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, Beta = 1") + println(s"Threshold: $t, F-score: $f, Beta = 1") } val beta = 0.5 val fScore = metrics.fMeasureByThreshold(beta) f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, Beta = 0.5") + println(s"Threshold: $t, F-score: $f, Beta = 0.5") } // AUPRC @@ -498,7 +498,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation Refer to the LogisticRegressionModel Java docs and LogisticRegressionWithLBFGS Java docs for details on the API. -import scala.Tuple2; +import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.Function; @@ -518,7 +518,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation JavaRDDtest = splits[1]; // Run training algorithm to build the model. -final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() +final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(2) .run(training.rdd()); @@ -538,7 +538,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation // Get evaluation metrics. 
BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(predictionAndLabels.rdd()); + new BinaryClassificationMetrics(predictionAndLabels.rdd()); // Precision by threshold JavaRDD > precision = metrics.precisionByThreshold().toJavaRDD(); @@ -564,7 +564,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation new Function , Double>() { @Override public Double call(Tuple2
[14/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-decision-tree.html -- diff --git a/site/docs/2.1.0/mllib-decision-tree.html b/site/docs/2.1.0/mllib-decision-tree.html index 1a3d865..991610e 100644 --- a/site/docs/2.1.0/mllib-decision-tree.html +++ b/site/docs/2.1.0/mllib-decision-tree.html @@ -307,23 +307,23 @@ - Basic algorithm - Node impurity and information gain - Split candidates - Stopping rule + Basic algorithm + Node impurity and information gain + Split candidates + Stopping rule - Usage tips - Problem specification parameters - Stopping criteria - Tunable parameters - Caching and checkpointing + Usage tips + Problem specification parameters + Stopping criteria + Tunable parameters + Caching and checkpointing - Scaling - Examples - Classification - Regression + Scaling + Examples + Classification + Regression @@ -548,7 +548,7 @@ maximum tree depth of 5. The test error is calculated to measure the algorithm a Refer to the DecisionTree Scala docs and DecisionTreeModel Scala docs for details on the API. -import org.apache.spark.mllib.tree.DecisionTree +import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.mllib.util.MLUtils @@ -588,7 +588,7 @@ maximum tree depth of 5. The test error is calculated to measure the algorithm a Refer to the DecisionTree Java docs and DecisionTreeModel Java docs for details on the API. -import java.util.HashMap; +import java.util.HashMap; import java.util.Map; import scala.Tuple2; @@ -604,8 +604,8 @@ maximum tree depth of 5. The test error is calculated to measure the algorithm a import org.apache.spark.mllib.tree.model.DecisionTreeModel; import org.apache.spark.mllib.util.MLUtils; -SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample"); -JavaSparkContext jsc = new JavaSparkContext(sparkConf); +SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample"); +JavaSparkContext jsc = new JavaSparkContext(sparkConf); // Load and parse the data file. String datapath = "data/mllib/sample_libsvm_data.txt"; @@ -657,30 +657,30 @@ maximum tree depth of 5. The test error is calculated to measure the algorithm a Refer to the DecisionTree Python docs and DecisionTreeModel Python docs for more details on the API. -from pyspark.mllib.tree import DecisionTree, DecisionTreeModel +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils -# Load and parse the data file into an RDD of LabeledPoint. -data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') -# Split the data into training and test sets (30% held out for testing) +# Load and parse the data file into an RDD of LabeledPoint. +data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') +# Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) -# Train a DecisionTree model. -# Empty categoricalFeaturesInfo indicates all features are continuous. +# Train a DecisionTree model. +# Empty categoricalFeaturesInfo indicates all features are continuous. 
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, - impurity='gini', maxDepth=5, maxBins=32) + impurity='gini', maxDepth=5, maxBins=32) -# Evaluate model on test instances and compute test error +# Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) -print('Test Error = ' + str(testErr)) -print('Learned classification tree model:') +print('Test Error = ' + str(testErr)) +print('Learned classification tree model:') print(model.toDebugString()) -# Save and load model -model.save(sc, "target/tmp/myDecisionTreeClassificationModel") -sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel") +# Save and load model +model.save(sc, "target/tmp/myDecisionTreeClassificationModel") +sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel") Find full example code at "examples/src/main/python/mllib/decision_tree_classification_example.py" in the Spark repo. @@ -701,7 +701,7 @@ depth of 5. The Mean Squared Error (MSE) is computed at the end to evaluate Refer to the DecisionTree Scala docs
[20/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-features.html -- diff --git a/site/docs/2.1.0/ml-features.html b/site/docs/2.1.0/ml-features.html index 64463de..a2f102b 100644 --- a/site/docs/2.1.0/ml-features.html +++ b/site/docs/2.1.0/ml-features.html @@ -318,52 +318,52 @@ Table of Contents - Feature Extractors - TF-IDF - Word2Vec - CountVectorizer + Feature Extractors + TF-IDF + Word2Vec + CountVectorizer - Feature Transformers - Tokenizer - StopWordsRemover - $n$-gram - Binarizer - PCA - PolynomialExpansion - Discrete Cosine Transform (DCT) - StringIndexer - IndexToString - OneHotEncoder - VectorIndexer - Interaction - Normalizer - StandardScaler - MinMaxScaler - MaxAbsScaler - Bucketizer - ElementwiseProduct - SQLTransformer - VectorAssembler - QuantileDiscretizer + Feature Transformers + Tokenizer + StopWordsRemover + $n$-gram + Binarizer + PCA + PolynomialExpansion + Discrete Cosine Transform (DCT) + StringIndexer + IndexToString + OneHotEncoder + VectorIndexer + Interaction + Normalizer + StandardScaler + MinMaxScaler + MaxAbsScaler + Bucketizer + ElementwiseProduct + SQLTransformer + VectorAssembler + QuantileDiscretizer - Feature Selectors - VectorSlicer - RFormula - ChiSqSelector + Feature Selectors + VectorSlicer + RFormula + ChiSqSelector - Locality Sensitive Hashing - LSH Operations - Feature Transformation - Approximate Similarity Join - Approximate Nearest Neighbor Search + Locality Sensitive Hashing + LSH Operations + Feature Transformation + Approximate Similarity Join + Approximate Nearest Neighbor Search - LSH Algorithms - Bucketed Random Projection for Euclidean Distance - MinHash for Jaccard Distance + LSH Algorithms + Bucketed Random Projection for Euclidean Distance + MinHash for Jaccard Distance @@ -395,7 +395,7 @@ TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). There are several variants on the definition of term frequency and document frequency. In MLlib, we separate TF and IDF to make them flexible. -TF: Both HashingTF and CountVectorizer can be used to generate the term frequency vectors. +TF: Both HashingTF and CountVectorizer can be used to generate the term frequency vectors. HashingTF is a Transformer which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a “set of terms” might be a bag of words. @@ -437,7 +437,7 @@ when using text as features. Our feature vectors could then be passed to a lear Refer to the HashingTF Scala docs and the IDF Scala docs for more details on the API. -import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} +import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} val sentenceData = spark.createDataFrame(Seq( (0.0, "Hi I heard about Spark"), @@ -468,7 +468,7 @@ the IDF Scala doc Refer to the HashingTF Java docs and the IDF Java docs for more details on the API. 
-import java.util.Arrays; +import java.util.Arrays; import java.util.List; import org.apache.spark.ml.feature.HashingTF; @@ -489,17 +489,17 @@ the IDF Scala doc RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") ); -StructType schema = new StructType(new StructField[]{ - new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) +StructType schema = new StructType(new StructField[]{ + new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), + new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); DatasetsentenceData = spark.createDataFrame(data, schema); -Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); +Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); Dataset
wordsData = tokenizer.transform(sentenceData); int numFeatures = 20; -HashingTF hashingTF = new HashingTF() +HashingTF hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("rawFeatures") .setNumFeatures(numFeatures); @@ -507,7 +507,7 @@ the IDF Scala doc Dataset
featurizedData = hashingTF.transform(wordsData); // alternatively, CountVectorizer can also be used to get term frequency vectors -IDF idf = new IDF().setInputCol("rawFeatur
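Since the Java listing above is cut off mid-way through the IDF step, here is a compact Scala sketch of the same TF-IDF flow for orientation (a sketch only; it assumes a SparkSession named spark is already available):

import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

val sentenceData = spark.createDataFrame(Seq(
  (0.0, "Hi I heard about Spark"),
  (1.0, "Logistic regression models are neat")
)).toDF("label", "sentence")

// Tokenize, hash terms into a fixed-length raw-frequency vector, then rescale with IDF.
val wordsData = new Tokenizer().setInputCol("sentence").setOutputCol("words").transform(sentenceData)
val featurized = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
  .setNumFeatures(20).transform(wordsData)
val idfModel = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(featurized)
idfModel.transform(featurized).select("label", "features").show()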
[05/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/storage-openstack-swift.html -- diff --git a/site/docs/2.1.0/storage-openstack-swift.html b/site/docs/2.1.0/storage-openstack-swift.html index bbb3446..a20c67f 100644 --- a/site/docs/2.1.0/storage-openstack-swift.html +++ b/site/docs/2.1.0/storage-openstack-swift.html @@ -144,7 +144,7 @@ Current Swift driver requires Swift to use Keystone authentication method. The Spark application should include hadoop-openstack dependency. For example, for Maven support, add the following to the pom.xml file: -+ Configuration Parameters Create core-site.xml and place it inside Spark’s conf directory. There are two main categories of parameters that should to be configured: declaration of the -Swift driver and the parameters that are required by Keystone. +Swift driver and the parameters that are required by Keystone. -Configuration of Hadoop to use Swift File system achieved via +Configuration of Hadoop to use Swift File system achieved via Property NameValue @@ -221,7 +221,7 @@ contains a list of Keystone mandatory parameters. PROVIDER can be a For example, assume PROVIDER=SparkTest and Keystone contains user tester with password testing defined for tenant test. Then core-site.xml should include: -... +... - org.apache.hadoop @@ -152,15 +152,15 @@ For example, for Maven support, add the following to the pom.xml fi2.3.0 + Notice that fs.swift.service.PROVIDER.tenant, http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/streaming-custom-receivers.html -- diff --git a/site/docs/2.1.0/streaming-custom-receivers.html b/site/docs/2.1.0/streaming-custom-receivers.html index d31647d..846c797 100644 --- a/site/docs/2.1.0/streaming-custom-receivers.html +++ b/site/docs/2.1.0/streaming-custom-receivers.html @@ -171,7 +171,7 @@ has any error connecting or receiving, the receiver is restarted to make another -class CustomReceiver(host: String, port: Int) +class CustomReceiver(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging { def onStart() { @@ -216,12 +216,12 @@ has any error connecting or receiving, the receiver is restarted to make another restart("Error receiving data", t) } } -} +} -public class JavaCustomReceiver extends Receiver+ - fs.swift.impl org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem @@ -257,7 +257,7 @@ defined for tenant test. 
Then core-site.xml should incfs.swift.service.SparkTest.password testing { +public class JavaCustomReceiver extends Receiver { String host = null; int port = -1; @@ -234,7 +234,7 @@ has any error connecting or receiving, the receiver is restarted to make another public void onStart() { // Start the thread that receives data over a connection -new Thread() { +new Thread() { @Override public void run() { receive(); } @@ -253,10 +253,10 @@ has any error connecting or receiving, the receiver is restarted to make another try { // connect to the server - socket = new Socket(host, port); + socket = new Socket(host, port); - BufferedReader reader = new BufferedReader( -new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)); + BufferedReader reader = new BufferedReader( +new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)); // Until stopped or connection broken continue reading while (!isStopped() && (userInput = reader.readLine()) != null) { @@ -276,7 +276,7 @@ has any error connecting or receiving, the receiver is restarted to make another restart("Error receiving data", t); } } -} +} @@ -290,20 +290,20 @@ an input DStream using data received by the instance of custom receiver, as show -// Assuming ssc is the StreamingContext +// Assuming ssc is the StreamingContext val customReceiverStream = ssc.receiverStream(new CustomReceiver(host, port)) val words = lines.flatMap(_.split(" ")) -... +... The full source code is in the example https://github.com/apache/spark/blob/v2.1.0/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala";>CustomReceiver.scala. -// Assuming ssc is the JavaStreamingContext -JavaDStream customReceiverStream = ssc.receiverStream(new JavaCustomReceiver(host, port));
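Going back to the Swift configuration earlier in this hunk: the same Keystone properties that the page puts in core-site.xml can also be set programmatically before reading a swift:// path. The sketch below is hypothetical; the provider name SparkTest, the container name, and the username property are illustrative assumptions, with only the tenant and password keys taken from the text above:

// Assuming an existing SparkContext `sc`; provider, container and object names are made up.
sc.hadoopConfiguration.set("fs.swift.impl",
  "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
sc.hadoopConfiguration.set("fs.swift.service.SparkTest.tenant", "test")
sc.hadoopConfiguration.set("fs.swift.service.SparkTest.username", "tester")   // assumed property name
sc.hadoopConfiguration.set("fs.swift.service.SparkTest.password", "testing")

val data = sc.textFile("swift://my-container.SparkTest/data.txt")
println(data.count())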
[21/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-clustering.html -- diff --git a/site/docs/2.1.0/ml-clustering.html b/site/docs/2.1.0/ml-clustering.html index e225281..df38605 100644 --- a/site/docs/2.1.0/ml-clustering.html +++ b/site/docs/2.1.0/ml-clustering.html @@ -313,21 +313,21 @@ about these algorithms. Table of Contents - K-means - Input Columns - Output Columns - Example + K-means + Input Columns + Output Columns + Example - Latent Dirichlet allocation (LDA) - Bisecting k-means - Example + Latent Dirichlet allocation (LDA) + Bisecting k-means + Example - Gaussian Mixture Model (GMM) - Input Columns - Output Columns - Example + Gaussian Mixture Model (GMM) + Input Columns + Output Columns + Example @@ -391,7 +391,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea Refer to the Scala API docs for more details. -import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.ml.clustering.KMeans // Loads data. val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") @@ -402,7 +402,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea // Evaluate clustering by computing Within Set Sum of Squared Errors. val WSSSE = model.computeCost(dataset) -println(s"Within Set Sum of Squared Errors = $WSSSE") +println(s"Within Set Sum of Squared Errors = $WSSSE") // Shows the result. println("Cluster Centers: ") @@ -414,7 +414,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea Refer to the Java API docs for more details. -import org.apache.spark.ml.clustering.KMeansModel; +import org.apache.spark.ml.clustering.KMeansModel; import org.apache.spark.ml.clustering.KMeans; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; @@ -424,7 +424,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea Datasetdataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); // Trains a k-means model. -KMeans kmeans = new KMeans().setK(2).setSeed(1L); +KMeans kmeans = new KMeans().setK(2).setSeed(1L); KMeansModel model = kmeans.fit(dataset); // Evaluate clustering by computing Within Set Sum of Squared Errors. @@ -434,7 +434,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea // Shows the result. Vector[] centers = model.clusterCenters(); System.out.println("Cluster Centers: "); -for (Vector center: centers) { +for (Vector center: centers) { System.out.println(center); } @@ -444,22 +444,22 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea Refer to the Python API docs for more details. -from pyspark.ml.clustering import KMeans +from pyspark.ml.clustering import KMeans -# Loads data. -dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") +# Loads data. +dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") -# Trains a k-means model. +# Trains a k-means model. kmeans = KMeans().setK(2).setSeed(1) model = kmeans.fit(dataset) -# Evaluate clustering by computing Within Set Sum of Squared Errors. +# Evaluate clustering by computing Within Set Sum of Squared Errors. wssse = model.computeCost(dataset) -print("Within Set Sum of Squared Errors = " + str(wssse)) +print("Within Set Sum of Squared Errors = " + str(wssse)) -# Shows the result. +# Shows the result. 
centers = model.clusterCenters() -print("Cluster Centers: ") +print("Cluster Centers: ") for center in centers: print(center) @@ -470,7 +470,7 @@ called http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf";>kmea Refer to the R API docs for more details. -# Fit a k-means model with spark.kmeans +# Fit a k-means model with spark.kmeans irisDF <- suppressWarnings(createDataFrame(iris)) kmeansDF <- irisDF kmeansTestDF <- irisDF @@ -504,7 +504,7 @@ and generates a LDAModel as the base model. Expert users may cast a Refer to the Scala API docs for more details. -import org.apache.spark.ml.clustering.LDA +import org.apache.spark.ml.clustering.LDA // Loads data. val dataset = spark.read.format("libsvm") @@ -516,8 +516,8 @@ and generates a LDAModel as the base model. Expert users may cast a val ll = model.logLikelihood(dataset) val lp = model.logPerplexity(dataset) -println(s"The lower bound on the log likelihood of the entire corpus: $ll") -println(s"The upper bound bound on perplexity: $lp") +println(s"The lower bound on the log likelihood of the entire corpus: $ll") +println(s"The upper bound bound on perplexity: $lp") // Describe topics. val topics = model.describeTopics(3) @@ -53
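The Gaussian Mixture Model listed in the table of contents above has no example visible in this hunk; a minimal Scala sketch in the same style as the k-means example might look like the following (assuming the same sample_kmeans_data.txt input and a SparkSession named spark):

import org.apache.spark.ml.clustering.GaussianMixture

val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

// Fit a GMM with two components and print the learned mixture.
val model = new GaussianMixture().setK(2).fit(dataset)
for (i <- model.weights.indices) {
  println(s"Gaussian $i: weight=${model.weights(i)}\n" +
    s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}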
[02/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/structured-streaming-programming-guide.html -- diff --git a/site/docs/2.1.0/structured-streaming-programming-guide.html b/site/docs/2.1.0/structured-streaming-programming-guide.html index e54c101..3a1ac5f 100644 --- a/site/docs/2.1.0/structured-streaming-programming-guide.html +++ b/site/docs/2.1.0/structured-streaming-programming-guide.html @@ -127,45 +127,50 @@ - Overview - Quick Example - Programming Model - Basic Concepts - Handling Event-time and Late Data - Fault Tolerance Semantics + Overview + Quick Example + Programming Model + Basic Concepts + Handling Event-time and Late Data + Fault Tolerance Semantics - API using Datasets and DataFrames - Creating streaming DataFrames and streaming Datasets - Data Sources - Schema inference and partition of streaming DataFrames/Datasets + API using Datasets and DataFrames + Creating streaming DataFrames and streaming Datasets + Data Sources + Schema inference and partition of streaming DataFrames/Datasets - Operations on streaming DataFrames/Datasets - Basic Operations - Selection, Projection, Aggregation - Window Operations on Event Time - Join Operations - Unsupported Operations + Operations on streaming DataFrames/Datasets + Basic Operations - Selection, Projection, Aggregation + Window Operations on Event Time + Handling Late Data and Watermarking + Join Operations + Unsupported Operations - Starting Streaming Queries - Output Modes - Output Sinks - Using Foreach + Starting Streaming Queries + Output Modes + Output Sinks + Using Foreach - Managing Streaming Queries - Monitoring Streaming Queries - Recovering from Failures with Checkpointing + Managing Streaming Queries + Monitoring Streaming Queries + Interactive APIs + Asynchronous API + + + Recovering from Failures with Checkpointing - Where to go from here + Where to go from here Overview Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data.The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java or Python to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming. -Spark 2.0 is the ALPHA RELEASE of Structured Streaming and the APIs are still experimental. In this guide, we are going to walk you through the programming model and the APIs. First, let’s start with a simple example - a streaming word count. +Structured Streaming is still ALPHA in Spark 2.1 and the APIs are still experimental. In this guide, we are going to walk you through the programming model and the APIs. First, let’s start with a simple example - a streaming word count. Quick Example Letâs say you want to maintain a running word count of text data received from a data server listening on a TCP socket. Letâs see how you can express this using Structured Streaming. 
You can see the full code in @@ -175,7 +180,7 @@ And if you http://spark.apache.org/downloads.html";>download Spark, -import org.apache.spark.sql.functions._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.SparkSession val spark = SparkSession @@ -183,12 +188,12 @@ And if you http://spark.apache.org/downloads.html";>download Spark, .appName("StructuredNetworkWordCount") .getOrCreate() -import spark.implicits._ +import spark.implicits._ -import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.streaming.StreamingQuery; @@ -198,19 +203,19 @@ And if you http://spark.apache.org/downloads.html";>download Spark, SparkSession spark = SparkSession .builder() .appName("JavaStructuredNetworkWordCount") - .getOrCr
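The hunk above cuts off before the rest of the quick example; as a reminder of where it is heading, a condensed Scala sketch of the complete streaming word count (socket source, word split, running aggregation, console sink) is roughly the following. The host and port are arbitrary placeholders:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("StructuredNetworkWordCount").getOrCreate()
import spark.implicits._

// Read lines from a socket source, split into words, and keep a running count.
val lines = spark.readStream.format("socket")
  .option("host", "localhost").option("port", "9999").load()
val words = lines.as[String].flatMap(_.split(" "))
val wordCounts = words.groupBy("value").count()

// Print the complete updated counts to the console after every trigger.
val query = wordCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination()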
[10/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-pmml-model-export.html -- diff --git a/site/docs/2.1.0/mllib-pmml-model-export.html b/site/docs/2.1.0/mllib-pmml-model-export.html index 30815e0..3f2fd91 100644 --- a/site/docs/2.1.0/mllib-pmml-model-export.html +++ b/site/docs/2.1.0/mllib-pmml-model-export.html @@ -307,8 +307,8 @@ - spark.mllib supported models - Examples + spark.mllib supported models + Examples spark.mllib supported models @@ -353,32 +353,31 @@ Refer to the KMeans Scala docs and Vectors Scala docs for details on the API. -Here a complete example of building a KMeansModel and print it out in PMML format: -import org.apache.spark.mllib.clustering.KMeans -import org.apache.spark.mllib.linalg.Vectors +Here a complete example of building a KMeansModel and print it out in PMML format: +import org.apache.spark.mllib.clustering.KMeans +import org.apache.spark.mllib.linalg.Vectors -// Load and parse the data +// Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") -val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() +val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() -// Cluster the data into two classes using KMeans +// Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 -val clusters = KMeans.train(parsedData, numClusters, numIterations) +val clusters = KMeans.train(parsedData, numClusters, numIterations) -// Export to PMML to a String in PMML format -println("PMML Model:\n" + clusters.toPMML) +// Export to PMML to a String in PMML format +println("PMML Model:\n" + clusters.toPMML) -// Export the model to a local file in PMML format -clusters.toPMML("/tmp/kmeans.xml") +// Export the model to a local file in PMML format +clusters.toPMML("/tmp/kmeans.xml") -// Export the model to a directory on a distributed file system in PMML format -clusters.toPMML(sc, "/tmp/kmeans") +// Export the model to a directory on a distributed file system in PMML format +clusters.toPMML(sc, "/tmp/kmeans") -// Export the model to the OutputStream in PMML format +// Export the model to the OutputStream in PMML format clusters.toPMML(System.out) - -Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala" in the Spark repo. +Find full example code at “examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala” in the Spark repo.For unsupported models, either you will not find a .toPMML method or an IllegalArgumentException will be thrown. http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-statistics.html -- diff --git a/site/docs/2.1.0/mllib-statistics.html b/site/docs/2.1.0/mllib-statistics.html index 4485ecf..f04924c 100644 --- a/site/docs/2.1.0/mllib-statistics.html +++ b/site/docs/2.1.0/mllib-statistics.html @@ -358,15 +358,15 @@ - Summary statistics - Correlations - Stratified sampling - Hypothesis testing - Streaming Significance Testing + Summary statistics + Correlations + Stratified sampling + Hypothesis testing + Streaming Significance Testing - Random data generation - Kernel density estimation + Random data generation + Kernel density estimation \[ @@ -401,7 +401,7 @@ total count. Refer to the MultivariateStatisticalSummary Scala docs for details on the API. 
-import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} val observations = sc.parallelize( @@ -430,7 +430,7 @@ total count. Refer to the MultivariateStatisticalSummary Java docs for details on the API. -import java.util.Arrays; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.linalg.Vector; @@ -463,19 +463,19 @@ total count. Refer to the MultivariateStatisticalSummary Python docs for more details on the API. -import numpy as np +import numpy as np from pyspark.mllib.stat import Statistics mat = sc.parallelize( [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])] -) # an RDD of Vectors +) # an RDD of Vectors -# Compute column summary statistics. +# Compute column summary statistics. summary = Statistics.colStats(mat) -print(summary.mean()) # a dense vector containing the mean value for each column -print(summary.variance()) # column-wise variance -print(summary.numNonzero
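The statistics page's table of contents above also lists Correlations, which is not included in this hunk; a minimal Scala sketch of that spark.mllib API could look like the following (the input series are invented for illustration, and an existing SparkContext `sc` is assumed):

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

// Two series of the same length and partitioning; the values are made up.
val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))

// "pearson" is the default method; "spearman" is also supported.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
println(s"Correlation is: $correlation")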
[09/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/programming-guide.html -- diff --git a/site/docs/2.1.0/programming-guide.html b/site/docs/2.1.0/programming-guide.html index 12458af..0e06e86 100644 --- a/site/docs/2.1.0/programming-guide.html +++ b/site/docs/2.1.0/programming-guide.html @@ -129,50 +129,50 @@ - Overview - Linking with Spark - Initializing Spark - Using the Shell + Overview + Linking with Spark + Initializing Spark + Using the Shell - Resilient Distributed Datasets (RDDs) - Parallelized Collections - External Datasets - RDD Operations - Basics - Passing Functions to Spark - Understanding closures - Example - Local vs. cluster modes - Printing elements of an RDD + Resilient Distributed Datasets (RDDs) + Parallelized Collections + External Datasets + RDD Operations + Basics + Passing Functions to Spark + Understanding closures + Example + Local vs. cluster modes + Printing elements of an RDD - Working with Key-Value Pairs - Transformations - Actions - Shuffle operations - Background - Performance Impact + Working with Key-Value Pairs + Transformations + Actions + Shuffle operations + Background + Performance Impact - RDD Persistence - Which Storage Level to Choose? - Removing Data + RDD Persistence + Which Storage Level to Choose? + Removing Data - Shared Variables - Broadcast Variables - Accumulators + Shared Variables + Broadcast Variables + Accumulators - Deploying to a Cluster - Launching Spark jobs from Java / Scala - Unit Testing - Where to Go from Here + Deploying to a Cluster + Launching Spark jobs from Java / Scala + Unit Testing + Where to Go from Here Overview @@ -212,8 +212,8 @@ version =Finally, you need to import some Spark classes into your program. Add the following lines: -import org.apache.spark.SparkContext -import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkConf (Before Spark 1.3.0, you need to explicitly import org.apache.spark.SparkContext._ to enable essential implicit conversions.) @@ -245,9 +245,9 @@ version = Finally, you need to import some Spark classes into your program. Add the following lines: -import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.api.java.JavaRDD -import org.apache.spark.SparkConf +import org.apache.spark.SparkConf @@ -269,13 +269,13 @@ for common HDFS versions. Finally, you need to import some Spark classes into your program. Add the following line: -from pyspark import SparkContext, SparkConf +from pyspark import SparkContext, SparkConf PySpark requires the same minor version of Python in both driver and workers. It uses the default python version in PATH, you can specify which version of Python you want to use by PYSPARK_PYTHON, for example: -$ PYSPARK_PYTHON=python3.4 bin/pyspark -$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py +$ PYSPARK_PYTHON=python3.4 bin/pyspark +$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py @@ -293,8 +293,8 @@ that contains information about your application. Only one SparkContext may be active per JVM. You must stop() the active SparkContext before creating a new one. -val conf = new SparkConf().setAppName(appName).setMaster(master) -new SparkContext(conf) +val conf = new SparkConf().setAppName(appName).setMaster(master) +new SparkContext(conf) @@ -304,8 +304,8 @@ that contains information about your application. how to access a cluster. 
To create a SparkContext you first need to build a SparkConf object that contains information about your application. -SparkConf conf = new SparkConf().setAppName(appName).setMaster(master); -JavaSparkContext sc = new JavaSparkContext(conf); +SparkConf conf = new SparkConf().setAppName(appName).setMaster(master); +JavaSparkContext sc = new JavaSparkContext(conf); @@ -315,8 +315,8 @@ that contains information about your application. how to access a cluster. To create a SparkContext you first need to build a SparkConf object that
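Once a SparkContext exists as in the Scala and Java snippets above, the simplest way to exercise it is a parallelized collection; a minimal Scala sketch:

// Distribute a local collection across the cluster and run a simple action on it.
val data = Array(1, 2, 3, 4, 5)
val distData = sc.parallelize(data)
val sum = distData.reduce((a, b) => a + b)
println(s"sum = $sum")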
[22/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-classification-regression.html -- diff --git a/site/docs/2.1.0/ml-classification-regression.html b/site/docs/2.1.0/ml-classification-regression.html index 1e0665b..0b264bb 100644 --- a/site/docs/2.1.0/ml-classification-regression.html +++ b/site/docs/2.1.0/ml-classification-regression.html @@ -329,58 +329,58 @@ discussing specific classes of algorithms, such as linear methods, trees, and en Table of Contents - Classification - Logistic regression - Binomial logistic regression - Multinomial logistic regression + Classification + Logistic regression + Binomial logistic regression + Multinomial logistic regression - Decision tree classifier - Random forest classifier - Gradient-boosted tree classifier - Multilayer perceptron classifier - One-vs-Rest classifier (a.k.a. One-vs-All) - Naive Bayes + Decision tree classifier + Random forest classifier + Gradient-boosted tree classifier + Multilayer perceptron classifier + One-vs-Rest classifier (a.k.a. One-vs-All) + Naive Bayes - Regression - Linear regression - Generalized linear regression - Available families + Regression + Linear regression + Generalized linear regression + Available families - Decision tree regression - Random forest regression - Gradient-boosted tree regression - Survival regression - Isotonic regression - Examples + Decision tree regression + Random forest regression + Gradient-boosted tree regression + Survival regression + Isotonic regression + Examples - Linear methods - Decision trees - Inputs and Outputs - Input Columns - Output Columns + Linear methods + Decision trees + Inputs and Outputs + Input Columns + Output Columns - Tree Ensembles - Random Forests - Inputs and Outputs - Input Columns - Output Columns (Predictions) + Tree Ensembles + Random Forests + Inputs and Outputs + Input Columns + Output Columns (Predictions) - Gradient-Boosted Trees (GBTs) - Inputs and Outputs - Input Columns - Output Columns (Predictions) + Gradient-Boosted Trees (GBTs) + Inputs and Outputs + Input Columns + Output Columns (Predictions) @@ -407,7 +407,7 @@ parameter to select between these two algorithms, or leave it unset and Spark wi Binomial logistic regression -For more background and more details about the implementation of binomial logistic regression, refer to the documentation of logistic regression in spark.mllib. +For more background and more details about the implementation of binomial logistic regression, refer to the documentation of logistic regression in spark.mllib. Example @@ -421,7 +421,7 @@ $\alpha$ and regParam corresponds to $\lambda$. More details on parameters can be found in the Scala API documentation. -import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.classification.LogisticRegression // Load training data val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -435,7 +435,7 @@ $\alpha$ and regParam corresponds to $\lambda$. val lrModel = lr.fit(training) // Print the coefficients and intercept for logistic regression -println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") +println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // We can also use the multinomial family for binary classification val mlr = new LogisticRegression() @@ -447,8 +447,8 @@ $\alpha$ and regParam corresponds to $\lambda$. 
val mlrModel = mlr.fit(training) // Print the coefficients and intercepts for logistic regression with multinomial family -println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}") -println(s"Multinomial intercepts: ${mlrModel.interceptVector}") +println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}") +println(s"Multinomial intercepts: ${mlrModel.interceptVector}") Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala" in the Spark repo. @@ -457,7 +457,7 @@ $\alpha$ and regParam corresponds
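Continuing from the fitted lrModel in the example above, a small Scala sketch of scoring and evaluating it (illustrative only; it reuses the same training DataFrame and the default prediction column names):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Score the training data with the fitted model and measure area under the ROC curve.
val predictions = lrModel.transform(training)
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
println(s"Area under ROC = ${evaluator.evaluate(predictions)}")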
[07/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/sparkr.html -- diff --git a/site/docs/2.1.0/sparkr.html b/site/docs/2.1.0/sparkr.html index 0a1a347..e861a01 100644 --- a/site/docs/2.1.0/sparkr.html +++ b/site/docs/2.1.0/sparkr.html @@ -127,53 +127,53 @@ - Overview - SparkDataFrame - Starting Up: SparkSession - Starting Up from RStudio - Creating SparkDataFrames - From local data frames - From Data Sources - From Hive tables + Overview + SparkDataFrame + Starting Up: SparkSession + Starting Up from RStudio + Creating SparkDataFrames + From local data frames + From Data Sources + From Hive tables - SparkDataFrame Operations - Selecting rows, columns - Grouping, Aggregation - Operating on Columns - Applying User-Defined Function - Run a given function on a large dataset using dapply or dapplyCollect - dapply - dapplyCollect + SparkDataFrame Operations + Selecting rows, columns + Grouping, Aggregation + Operating on Columns + Applying User-Defined Function + Run a given function on a large dataset using dapply or dapplyCollect + dapply + dapplyCollect - Run a given function on a large dataset grouping by input column(s) and using gapply or gapplyCollect - gapply - gapplyCollect + Run a given function on a large dataset grouping by input column(s) and using gapply or gapplyCollect + gapply + gapplyCollect - Data type mapping between R and Spark - Run local R functions distributed using spark.lapply - spark.lapply + Data type mapping between R and Spark + Run local R functions distributed using spark.lapply + spark.lapply - Running SQL Queries from SparkR + Running SQL Queries from SparkR - Machine Learning - Algorithms - Model persistence + Machine Learning + Algorithms + Model persistence - R Function Name Conflicts - Migration Guide - Upgrading From SparkR 1.5.x to 1.6.x - Upgrading From SparkR 1.6.x to 2.0 - Upgrading to SparkR 2.1.0 + R Function Name Conflicts + Migration Guide + Upgrading From SparkR 1.5.x to 1.6.x + Upgrading From SparkR 1.6.x to 2.0 + Upgrading to SparkR 2.1.0 @@ -202,7 +202,7 @@ You can create a SparkSession using sparkR.session and -sparkR.session() +sparkR.session() @@ -223,11 +223,11 @@ them, pass them as you would other configuration properties in the sparkCo -if (nchar(Sys.getenv("SPARK_HOME")) < 1) { +if (nchar(Sys.getenv("SPARK_HOME")) < 1) { Sys.setenv(SPARK_HOME = "/home/spark") } library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))) -sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g")) +sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g")) @@ -282,14 +282,14 @@ sparkR.session(master = - df <- as.DataFrame(faithful) + df <- as.DataFrame(faithful) # Displays the first part of the SparkDataFrame head(df) ## eruptions waiting ##1 3.600 79 ##2 1.800 54 -##3 3.333 74 +##3 3.333 74 @@ -303,7 +303,7 @@ specifying --packages with spark-submit or spark - sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") + sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") @@ -311,7 +311,7 @@ specifying --packages with spark-submit or spark - people <- read.df("./examples/src/main/resources/people.json", "json") + people <- read.df("./examples/src/main/resources/people.json", "json") head(people) ## agename ##1 NA Michael @@ -325,7 +325,7 @@ printSchema(people) # |-- name: string (nullable = true) # Similarly, multiple files can be read with read.json -people <- read.json(c("./examples/src/main/resources/people.json", 
"./examples/src/main/resources/people2.json")) +people <- read.json(c("./examples/src/main/resources/people.json", "./examples/src/main/resources/people2.json")) @@ -333,7 +333,7 @@ people <- read.json( - df <- read.df(csvPath, "csv", header = "true", inferSchema = "true",
[18/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-tuning.html -- diff --git a/site/docs/2.1.0/ml-tuning.html b/site/docs/2.1.0/ml-tuning.html index 0c36a98..2246cc2 100644 --- a/site/docs/2.1.0/ml-tuning.html +++ b/site/docs/2.1.0/ml-tuning.html @@ -329,13 +329,13 @@ Built-in Cross-Validation and other tooling allow users to optimize hyperparamet Table of contents - Model selection (a.k.a. hyperparameter tuning) - Cross-Validation - Example: model selection via cross-validation + Model selection (a.k.a. hyperparameter tuning) + Cross-Validation + Example: model selection via cross-validation - Train-Validation Split - Example: model selection via train validation split + Train-Validation Split + Example: model selection via train validation split @@ -396,7 +396,7 @@ However, it is also a well-established method for choosing parameters which is m Refer to the [`CrossValidator` Scala docs](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) for details on the API. -import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} @@ -467,7 +467,7 @@ Refer to the [`CrossValidator` Scala docs](api/scala/index.html#org.apache.spark .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => -println(s"($id, $text) --> prob=$prob, prediction=$prediction") +println(s"($id, $text) --> prob=$prob, prediction=$prediction") } Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala" in the Spark repo. @@ -476,7 +476,7 @@ Refer to the [`CrossValidator` Scala docs](api/scala/index.html#org.apache.spark Refer to the [`CrossValidator` Java docs](api/java/org/apache/spark/ml/tuning/CrossValidator.html) for details on the API. -import java.util.Arrays; +import java.util.Arrays; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; @@ -493,38 +493,38 @@ Refer to the [`CrossValidator` Java docs](api/java/org/apache/spark/ml/tuning/Cr // Prepare training documents, which are labeled. 
Datasettraining = spark.createDataFrame(Arrays.asList( - new JavaLabeledDocument(0L, "a b c d e spark", 1.0), - new JavaLabeledDocument(1L, "b d", 0.0), - new JavaLabeledDocument(2L,"spark f g h", 1.0), - new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0), - new JavaLabeledDocument(4L, "b spark who", 1.0), - new JavaLabeledDocument(5L, "g d a y", 0.0), - new JavaLabeledDocument(6L, "spark fly", 1.0), - new JavaLabeledDocument(7L, "was mapreduce", 0.0), - new JavaLabeledDocument(8L, "e spark program", 1.0), - new JavaLabeledDocument(9L, "a e c l", 0.0), - new JavaLabeledDocument(10L, "spark compile", 1.0), - new JavaLabeledDocument(11L, "hadoop software", 0.0) + new JavaLabeledDocument(0L, "a b c d e spark", 1.0), + new JavaLabeledDocument(1L, "b d", 0.0), + new JavaLabeledDocument(2L,"spark f g h", 1.0), + new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0), + new JavaLabeledDocument(4L, "b spark who", 1.0), + new JavaLabeledDocument(5L, "g d a y", 0.0), + new JavaLabeledDocument(6L, "spark fly", 1.0), + new JavaLabeledDocument(7L, "was mapreduce", 0.0), + new JavaLabeledDocument(8L, "e spark program", 1.0), + new JavaLabeledDocument(9L, "a e c l", 0.0), + new JavaLabeledDocument(10L, "spark compile", 1.0), + new JavaLabeledDocument(11L, "hadoop software", 0.0) ), JavaLabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. -Tokenizer tokenizer = new Tokenizer() +Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); -HashingTF hashingTF = new HashingTF() +HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); -LogisticRegression lr = new LogisticRegression() +LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); -Pipeline pipeline = new Pipeline() +Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. -ParamMap[] paramGrid = new ParamGridBuilder() +ParamMap[] paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000}) .addGrid(lr.regParam(), new double[] {0.1, 0.0
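The Java listing above is truncated mid-way through the grid definition; the corresponding Scala sketch of wiring the grid into a CrossValidator is roughly the following (it assumes the hashingTF, lr, pipeline and training values from the example):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Grid over the hashing dimensionality and the regularization strength: 3 x 2 = 6 settings.
val paramGrid = new ParamGridBuilder()
  .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(2)  // use 3 or more in practice

val cvModel = cv.fit(training)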
[04/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/streaming-programming-guide.html -- diff --git a/site/docs/2.1.0/streaming-programming-guide.html b/site/docs/2.1.0/streaming-programming-guide.html index 9a87d23..b1ce1e1 100644 --- a/site/docs/2.1.0/streaming-programming-guide.html +++ b/site/docs/2.1.0/streaming-programming-guide.html @@ -129,32 +129,32 @@ - Overview - A Quick Example - Basic Concepts - Linking - Initializing StreamingContext - Discretized Streams (DStreams) - Input DStreams and Receivers - Transformations on DStreams - Output Operations on DStreams - DataFrame and SQL Operations - MLlib Operations - Caching / Persistence - Checkpointing - Accumulators, Broadcast Variables, and Checkpoints - Deploying Applications - Monitoring Applications + Overview + A Quick Example + Basic Concepts + Linking + Initializing StreamingContext + Discretized Streams (DStreams) + Input DStreams and Receivers + Transformations on DStreams + Output Operations on DStreams + DataFrame and SQL Operations + MLlib Operations + Caching / Persistence + Checkpointing + Accumulators, Broadcast Variables, and Checkpoints + Deploying Applications + Monitoring Applications - Performance Tuning - Reducing the Batch Processing Times - Setting the Right Batch Interval - Memory Tuning + Performance Tuning + Reducing the Batch Processing Times + Setting the Right Batch Interval + Memory Tuning - Fault-tolerance Semantics - Where to Go from Here + Fault-tolerance Semantics + Where to Go from Here Overview @@ -209,7 +209,7 @@ conversions from StreamingContext into our environment in order to add useful me other classes we need (like DStream). StreamingContext is the main entry point for all streaming functionality. We create a local StreamingContext with two execution threads, and a batch interval of 1 second. -import org.apache.spark._ +import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3 @@ -217,33 +217,33 @@ main entry point for all streaming functionality. We create a local StreamingCon // The master requires 2 cores to prevent from a starvation scenario. val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount") -val ssc = new StreamingContext(conf, Seconds(1)) +val ssc = new StreamingContext(conf, Seconds(1)) Using this context, we can create a DStream that represents streaming data from a TCP source, specified as hostname (e.g. localhost) and port (e.g. ). -// Create a DStream that will connect to hostname:port, like localhost: -val lines = ssc.socketTextStream("localhost", ) +// Create a DStream that will connect to hostname:port, like localhost: +val lines = ssc.socketTextStream("localhost", ) This lines DStream represents the stream of data that will be received from the data server. Each record in this DStream is a line of text. Next, we want to split the lines by space characters into words. -// Split each line into words -val words = lines.flatMap(_.split(" ")) +// Split each line into words +val words = lines.flatMap(_.split(" ")) flatMap is a one-to-many DStream operation that creates a new DStream by generating multiple new records from each record in the source DStream. In this case, each line will be split into multiple words and the stream of words is represented as the words DStream. Next, we want to count these words. 
-import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3 +import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3 // Count each word in each batch val pairs = words.map(word => (word, 1)) val wordCounts = pairs.reduceByKey(_ + _) // Print the first ten elements of each RDD generated in this DStream to the console -wordCounts.print() +wordCounts.print() The words DStream is further mapped (one-to-one transformation) to a DStream of (word, 1) pairs, which is then reduced to get the frequency of words in each batch of data. @@ -253,8 +253,8 @@ Finally, wordCounts.print() will print a few of the counts generate will perform when it is started, and no real processing has started yet. To start the processing after all the transformations have been setup, we finally call -ssc.start() // Start the computation -ssc.awaitTermination() // Wait for the computation to terminate +ssc.start() // Start the computation +ssc.awaitTermination() // Wait for the computation to terminate
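A natural extension of the word count above, sketched in Scala, is a sliding-window count over the same pairs DStream; the window and slide durations here are arbitrary, and the rest of the program is assumed to be set up as in the guide:

import org.apache.spark.streaming.Seconds

// Count words over the last 30 seconds of data, recomputed every 10 seconds.
val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10))
windowedWordCounts.print()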
[03/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/structured-streaming-kafka-integration.html -- diff --git a/site/docs/2.1.0/structured-streaming-kafka-integration.html b/site/docs/2.1.0/structured-streaming-kafka-integration.html index 5ca9259..7d2254f 100644 --- a/site/docs/2.1.0/structured-streaming-kafka-integration.html +++ b/site/docs/2.1.0/structured-streaming-kafka-integration.html @@ -144,7 +144,7 @@ application. See the Deploying subsection below. -// Subscribe to 1 topic +// Subscribe to 1 topic val ds1 = spark .readStream .format("kafka") @@ -172,12 +172,12 @@ application. See the Deploying subsection below. .option("subscribePattern", "topic.*") .load() ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] + .as[(String, String)] -// Subscribe to 1 topic +// Subscribe to 1 topic Datasetds1 = spark .readStream() .format("kafka") @@ -202,43 +202,43 @@ application. See the Deploying subsection below. .option("kafka.bootstrap.servers", "host1:port1,host2:port2") .option("subscribePattern", "topic.*") .load() -ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") +ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") -# Subscribe to 1 topic +# Subscribe to 1 topic ds1 = spark .readStream() - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribe", "topic1") + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1") .load() -ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") +ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") -# Subscribe to multiple topics +# Subscribe to multiple topics ds2 = spark .readStream - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribe", "topic1,topic2") + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1,topic2") .load() -ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") +ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") -# Subscribe to a pattern +# Subscribe to a pattern ds3 = spark .readStream() - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribePattern", "topic.*") + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribePattern", "topic.*") .load() -ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") +ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") -Each row in the source has the following schema: - +Each row in the source has the following schema: +
ColumnType key @@ -268,7 +268,7 @@ application. See the Deploying subsection below. timestampType int - +
The following options must be set for the Kafka source.
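(The option table itself is not part of this hunk.) Putting the pieces above together, a minimal end-to-end Scala sketch for Spark 2.1 that reads from Kafka and writes the decoded key/value pairs to the console might look like the following; the broker address and topic name are placeholders:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("KafkaSourceExample").getOrCreate()

// Subscribe to one topic; "host1:port1" and "topic1" are placeholders.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1")
  .option("subscribe", "topic1")
  .load()

// key and value arrive as binary columns; cast them for display.
val query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .writeStream
  .format("console")
  .start()
query.awaitTermination()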
[16/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-collaborative-filtering.html -- diff --git a/site/docs/2.1.0/mllib-collaborative-filtering.html b/site/docs/2.1.0/mllib-collaborative-filtering.html index e453032..b3f9e08 100644 --- a/site/docs/2.1.0/mllib-collaborative-filtering.html +++ b/site/docs/2.1.0/mllib-collaborative-filtering.html @@ -322,13 +322,13 @@ - Collaborative filtering - Explicit vs. implicit feedback - Scaling of the regularization parameter + Collaborative filtering + Explicit vs. implicit feedback + Scaling of the regularization parameter - Examples - Tutorial + Examples + Tutorial Collaborative filtering @@ -393,7 +393,7 @@ recommendation model by measuring the Mean Squared Error of rating prediction.Refer to the ALS Scala docs for more details on the API. -import org.apache.spark.mllib.recommendation.ALS +import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating @@ -434,9 +434,9 @@ recommendation model by measuring the Mean Squared Error of rating prediction.If the rating matrix is derived from another source of information (i.e. it is inferred from other signals), you can use the trainImplicit method to get better results. -val alpha = 0.01 +val alpha = 0.01 val lambda = 0.01 -val model = ALS.trainImplicit(ratings, rank, numIterations, lambda, alpha) +val model = ALS.trainImplicit(ratings, rank, numIterations, lambda, alpha) @@ -449,7 +449,7 @@ that is equivalent to the provided example in Scala is given below: Refer to the ALS Java docs for more details on the API. -import scala.Tuple2; +import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.Function; @@ -458,8 +458,8 @@ that is equivalent to the provided example in Scala is given below: import org.apache.spark.mllib.recommendation.Rating; import org.apache.spark.SparkConf; -SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example"); -JavaSparkContext jsc = new JavaSparkContext(conf); +SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example"); +JavaSparkContext jsc = new JavaSparkContext(conf); // Load and parse the data String path = "data/mllib/als/test.data"; @@ -468,7 +468,7 @@ that is equivalent to the provided example in Scala is given below: new Function() { public Rating call(String s) { String[] sarray = s.split(","); - return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]), + return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]), Double.parseDouble(sarray[2])); } } @@ -528,36 +528,36 @@ recommendation by measuring the Mean Squared Error of rating prediction. Refer to the ALS Python docs for more details on the API. 
-from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating +from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating -# Load and parse the data -data = sc.textFile("data/mllib/als/test.data") -ratings = data.map(lambda l: l.split(','))\ +# Load and parse the data +data = sc.textFile("data/mllib/als/test.data") +ratings = data.map(lambda l: l.split(','))\ .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) -# Build the recommendation model using Alternating Least Squares +# Build the recommendation model using Alternating Least Squares rank = 10 numIterations = 10 model = ALS.train(ratings, rank, numIterations) -# Evaluate the model on training data +# Evaluate the model on training data testdata = ratings.map(lambda p: (p[0], p[1])) predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() -print("Mean Squared Error = " + str(MSE)) +print("Mean Squared Error = " + str(MSE)) -# Save and load model -model.save(sc, "target/tmp/myCollaborativeFilter") -sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter") +# Save and load model +model.save(sc, "target/tmp/myCollaborativeFilter") +sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter") Find full example code at "examples/src/main/python/mllib/recommendation_example.py" in the Spark repo. If the rating matrix is derived from other source of information (i.e. it is inferred from other signals), you can use the trainImplicit method to get better results. -# Build the recommendation model using Alternating Least Squares based on implicit ratings -model = ALS.
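Once a MatrixFactorizationModel has been trained as above, generating top-N recommendations is a one-liner; a small Scala sketch (the user id and N are arbitrary):

// Top 5 product recommendations for user 1, and the same for every user at once.
val topForUser = model.recommendProducts(1, 5)
topForUser.foreach(r => println(s"user=${r.user} product=${r.product} rating=${r.rating}"))

val topForAllUsers = model.recommendProductsForUsers(5)
println(topForAllUsers.count())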
[15/25] spark-website git commit: Update 2.1.0 docs to include https://github.com/apache/spark/pull/16294
http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/mllib-data-types.html -- diff --git a/site/docs/2.1.0/mllib-data-types.html b/site/docs/2.1.0/mllib-data-types.html index 546d921..f7b5358 100644 --- a/site/docs/2.1.0/mllib-data-types.html +++ b/site/docs/2.1.0/mllib-data-types.html @@ -307,14 +307,14 @@ - Local vector - Labeled point - Local matrix - Distributed matrix - RowMatrix - IndexedRowMatrix - CoordinateMatrix - BlockMatrix + Local vector + Labeled point + Local matrix + Distributed matrix + RowMatrix + IndexedRowMatrix + CoordinateMatrix + BlockMatrix @@ -347,14 +347,14 @@ using the factory methods implemented in Refer to the Vector Scala docs and Vectors Scala docs for details on the API. -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.{Vector, Vectors} // Create a dense vector (1.0, 0.0, 3.0). val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries. val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)) // Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries. -val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) +val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) Note: Scala imports scala.collection.immutable.Vector by default, so you have to import @@ -373,13 +373,13 @@ using the factory methods implemented in Refer to the Vector Java docs and Vectors Java docs for details on the API. -import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; // Create a dense vector (1.0, 0.0, 3.0). Vector dv = Vectors.dense(1.0, 0.0, 3.0); // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries. -Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}); +Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}); @@ -405,18 +405,18 @@ in Ve Refer to the Vectors Python docs for more details on the API. -import numpy as np +import numpy as np import scipy.sparse as sps from pyspark.mllib.linalg import Vectors -# Use a NumPy array as a dense vector. +# Use a NumPy array as a dense vector. dv1 = np.array([1.0, 0.0, 3.0]) -# Use a Python list as a dense vector. +# Use a Python list as a dense vector. dv2 = [1.0, 0.0, 3.0] -# Create a SparseVector. +# Create a SparseVector. sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0]) -# Use a single-column SciPy csc_matrix as a sparse vector. -sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1)) +# Use a single-column SciPy csc_matrix as a sparse vector. +sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1)) @@ -438,14 +438,14 @@ For multiclass classification, labels should be class indices starting from zero Refer to the LabeledPoint Scala docs for details on the API. -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint // Create a labeled point with a positive label and a dense feature vector. val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) // Create a labeled point with a negative label and a sparse feature vector. 
-val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) +val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) @@ -456,14 +456,14 @@ For multiclass classification, labels should be class indices starting from zero Refer to the LabeledPoint Java docs for details on the API. -import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; // Create a labeled point with a positive label and a dense feature vector. -LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); +LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); // Create a labeled point with a negative label and a sparse feature vector. -LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); +LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); @@ -474,14 +474,14 @@ For multiclass classification, labels should be class indices starting from zero Refer to the LabeledPoint Python docs for more details on the API. -from pyspark.mllib.linalg import SparseVector +from pyspark.mllib.linalg im
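The hunk stops before the local matrix and distributed matrix sections listed in the table of contents; for completeness, a small Scala sketch of those two types (values are arbitrary, and an existing SparkContext `sc` is assumed):

import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// A 3 x 2 dense local matrix, stored in column-major order.
val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

// A distributed RowMatrix built from an RDD of local vectors.
val rows = sc.parallelize(Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 2.0)))
val mat = new RowMatrix(rows)
println(s"${mat.numRows()} x ${mat.numCols()}")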
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.1.0 [created] cd0a08361
spark git commit: Revert "[SPARK-18990][SQL] make DatasetBenchmark fairer for Dataset"
Repository: spark Updated Branches: refs/heads/master a05cc425a -> 2404d8e54 Revert "[SPARK-18990][SQL] make DatasetBenchmark fairer for Dataset" This reverts commit a05cc425a0a7d18570b99883993a04ad175aa071. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2404d8e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2404d8e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2404d8e5 Branch: refs/heads/master Commit: 2404d8e54b6b2cfc78d892e7ebb31578457518a3 Parents: a05cc42 Author: Yin Huai Authored: Tue Dec 27 10:03:52 2016 -0800 Committer: Yin Huai Committed: Tue Dec 27 10:03:52 2016 -0800 -- .../org/apache/spark/sql/DatasetBenchmark.scala | 75 +--- 1 file changed, 33 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2404d8e5/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index cd925e6..66d94d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ @@ -33,13 +34,11 @@ object DatasetBenchmark { def backToBackMap(spark: SparkSession, numRows: Long, numChains: Int): Benchmark = { import spark.implicits._ -val rdd = spark.sparkContext.range(0, numRows) -val ds = spark.range(0, numRows) -val df = ds.toDF("l") -val func = (l: Long) => l + 1 - +val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) val benchmark = new Benchmark("back-to-back map", numRows) +val func = (d: Data) => Data(d.l + 1, d.s) +val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD") { iter => var res = rdd var i = 0 @@ -54,14 +53,14 @@ object DatasetBenchmark { var res = df var i = 0 while (i < numChains) { -res = res.select($"l" + 1 as "l") +res = res.select($"l" + 1 as "l", $"s") i += 1 } res.queryExecution.toRdd.foreach(_ => Unit) } benchmark.addCase("Dataset") { iter => - var res = ds.as[Long] + var res = df.as[Data] var i = 0 while (i < numChains) { res = res.map(func) @@ -76,14 +75,14 @@ object DatasetBenchmark { def backToBackFilter(spark: SparkSession, numRows: Long, numChains: Int): Benchmark = { import spark.implicits._ -val rdd = spark.sparkContext.range(0, numRows) -val ds = spark.range(0, numRows) -val df = ds.toDF("l") -val func = (l: Long, i: Int) => l % (100L + i) == 0L -val funcs = 0.until(numChains).map { i => (l: Long) => func(l, i) } - +val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) val benchmark = new Benchmark("back-to-back filter", numRows) +val func = (d: Data, i: Int) => d.l % (100L + i) == 0L +val funcs = 0.until(numChains).map { i => + (d: Data) => func(d, i) +} +val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD") { iter => var res = rdd var i = 0 @@ -105,7 +104,7 @@ object DatasetBenchmark { } benchmark.addCase("Dataset") { iter => - var res = ds.as[Long] + var res = df.as[Data] var i = 0 while (i < numChains) { res = res.filter(funcs(i)) @@ -134,29 +133,24 @@ object DatasetBenchmark { def aggregate(spark: 
SparkSession, numRows: Long): Benchmark = { import spark.implicits._ -val rdd = spark.sparkContext.range(0, numRows) -val ds = spark.range(0, numRows) -val df = ds.toDF("l") - +val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) val benchmark = new Benchmark("aggregate", numRows) +val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD sum") { iter => - rdd.map(l => (l % 10, l)).reduceByKey(_ + _).foreach(_ => Unit) + rdd.aggregate(0L)(_ + _.l, _ + _) } benchmark.addCase("DataFrame sum") { iter => - df.groupBy($"l" % 10).agg(sum($"l")).queryExecution.toRdd.foreach(_ => Unit) + df.select(sum($"l")).queryExecution.toRdd.foreach(_ => Unit) } benchmark.addCase("Dataset sum using Aggregator") { iter => - val result = ds.as[
spark git commit: [SPARK-18951] Upgrade com.thoughtworks.paranamer/paranamer to 2.6
Repository: spark Updated Branches: refs/heads/master b7650f11c -> 1a6438897 [SPARK-18951] Upgrade com.thoughtworks.paranamer/paranamer to 2.6 ## What changes were proposed in this pull request? I recently hit a bug of com.thoughtworks.paranamer/paranamer, which causes jackson to fail to handle a byte array defined in a case class. Then I found https://github.com/FasterXML/jackson-module-scala/issues/48, which suggests that it is caused by a bug in paranamer. Let's upgrade paranamer. Since we are using jackson 2.6.5 and jackson-module-paranamer 2.6.5 uses com.thoughtworks.paranamer/paranamer 2.6, I suggest that we upgrade paranamer to 2.6. Author: Yin Huai Closes #16359 from yhuai/SPARK-18951. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a643889 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a643889 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a643889 Branch: refs/heads/master Commit: 1a64388973711b4e567f25fa33d752066a018b49 Parents: b7650f1 Author: Yin Huai Authored: Wed Dec 21 09:26:13 2016 -0800 Committer: Yin Huai Committed: Wed Dec 21 09:26:13 2016 -0800 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- pom.xml| 7 ++- 6 files changed, 11 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index afbdae0..9cbab3d8 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -128,7 +128,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.6.jar parquet-column-1.8.1.jar parquet-common-1.8.1.jar parquet-encoding-1.8.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index adf3863..63ce6c6 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -135,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.6.jar parquet-column-1.8.1.jar parquet-common-1.8.1.jar parquet-encoding-1.8.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 88e6b3f..122d5c2 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -135,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.6.jar parquet-column-1.8.1.jar parquet-common-1.8.1.jar parquet-encoding-1.8.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 15c5d9f..776aabd 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -143,7 +143,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.6.jar parquet-column-1.8.1.jar parquet-common-1.8.1.jar parquet-encoding-1.8.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 77fb537..524e824 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -144,7 +144,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.6.jar parquet-column-1.8.1.jar parquet-common-1.8.1.jar parquet-encoding-1.8.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1a643889/pom.xml -- diff --git a/pom.xml b/pom.xml index 4f12085..72e5442 100644 --- a/pom.xml +++ b/pom.xml @@ -179,7 +179,7 @@ 4.5.3 1.1 2.52.0 -2.8 +2.6 1.8 1.0.0 @@ -1863,6 +1863,11 @@ + +com.thoughtworks.paranamer +paranamer +${paranamer
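For context on the failure mode described in this commit: the paranamer bug surfaces when jackson-module-scala tries to resolve the constructor parameter names of a case class that carries a byte array. The following is a minimal repro sketch, not code from this commit; the class and field names are made up, and it assumes jackson-databind and jackson-module-scala 2.6.x on the classpath alongside the paranamer version under test.

```scala
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

// Hypothetical payload: a case class with a byte-array field, the shape that
// exposed the paranamer 2.3 problem referenced above.
case class Payload(name: String, bytes: Array[Byte])

object ParanamerRepro {
  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper()
    mapper.registerModule(DefaultScalaModule)

    val json = mapper.writeValueAsString(Payload("a", Array[Byte](1, 2, 3)))
    // With paranamer 2.3 this round-trip could fail while resolving the
    // case-class constructor parameters; with paranamer 2.6 it succeeds.
    val restored = mapper.readValue(json, classOf[Payload])
    println(restored.bytes.toSeq)
  }
}
```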
spark git commit: [SPARK-18928][BRANCH-2.0] Check TaskContext.isInterrupted() in FileScanRDD, JDBCRDD & UnsafeSorter
Repository: spark Updated Branches: refs/heads/branch-2.0 678d91c1d -> 2aae220b5 [SPARK-18928][BRANCH-2.0] Check TaskContext.isInterrupted() in FileScanRDD, JDBCRDD & UnsafeSorter This is a branch-2.0 backport of #16340; the original description follows: ## What changes were proposed in this pull request? In order to respond to task cancellation, Spark tasks must periodically check `TaskContext.isInterrupted()`, but this check is missing on a few critical read paths used in Spark SQL, including `FileScanRDD`, `JDBCRDD`, and UnsafeSorter-based sorts. This can cause interrupted / cancelled tasks to continue running and become zombies (as also described in #16189). This patch aims to fix this problem by adding `TaskContext.isInterrupted()` checks to these paths. Note that I could have used `InterruptibleIterator` to simply wrap a bunch of iterators but in some cases this would have an adverse performance penalty or might not be effective due to certain special uses of Iterators in Spark SQL. Instead, I inlined `InterruptibleIterator`-style logic into existing iterator subclasses. ## How was this patch tested? Tested manually in `spark-shell` with two different reproductions of non-cancellable tasks, one involving scans of huge files and another involving sort-merge joins that spill to disk. Both causes of zombie tasks are fixed by the changes added here. Author: Josh Rosen Closes #16357 from JoshRosen/sql-task-interruption-branch-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2aae220b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2aae220b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2aae220b Branch: refs/heads/branch-2.0 Commit: 2aae220b536065f55b2cf644a2a223aab0d051d0 Parents: 678d91c Author: Josh Rosen Authored: Tue Dec 20 16:05:04 2016 -0800 Committer: Yin Huai Committed: Tue Dec 20 16:05:04 2016 -0800 -- .../collection/unsafe/sort/UnsafeInMemorySorter.java| 11 +++ .../collection/unsafe/sort/UnsafeSorterSpillReader.java | 11 +++ .../spark/sql/execution/datasources/FileScanRDD.scala | 12 ++-- .../spark/sql/execution/datasources/jdbc/JDBCRDD.scala | 9 - 4 files changed, 40 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2aae220b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index b517371..2bd756f 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -21,6 +21,8 @@ import java.util.Comparator; import org.apache.avro.reflect.Nullable; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskKilledException; import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.unsafe.Platform; @@ -226,6 +228,7 @@ public final class UnsafeInMemorySorter { private long keyPrefix; private int recordLength; private long currentPageNumber; +private final TaskContext taskContext = TaskContext.get(); private SortedIterator(int numRecords, int offset) { this.numRecords = numRecords; @@ -256,6 +259,14 @@ public final class UnsafeInMemorySorter { @Override public void loadNext() { + // Kill the task in case it has been marked as 
killed. This logic is from + // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order + // to avoid performance overhead. This check is added here in `loadNext()` instead of in + // `hasNext()` because it's technically possible for the caller to be relying on + // `getNumRecords()` instead of `hasNext()` to know when to stop. + if (taskContext != null && taskContext.isInterrupted()) { +throw new TaskKilledException(); + } // This pointer points to a 4-byte record length, followed by the record's bytes final long recordPointer = array.get(offset + position); currentPageNumber = TaskMemoryManager.decodePageNumber(recordPointer); http://git-wip-us.apache.org/repos/asf/spark/blob/2aae220b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark
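The check added to these read paths is easiest to see in isolation. Below is a minimal sketch of the inlined InterruptibleIterator-style logic using the same `TaskContext.isInterrupted()` / `TaskKilledException` calls that appear in the diff; the wrapper class itself is hypothetical and not part of this patch, which instead inlines the check into existing iterator subclasses.

```scala
import org.apache.spark.{TaskContext, TaskKilledException}

// Hypothetical wrapper showing the kill check that the patch inlines into
// FileScanRDD, JDBCRDD and the UnsafeSorter iterators.
class InterruptionAwareIterator[T](delegate: Iterator[T]) extends Iterator[T] {
  // TaskContext.get() returns null when not running inside a task (e.g. on the driver).
  private val taskContext = TaskContext.get()

  override def hasNext: Boolean = delegate.hasNext

  override def next(): T = {
    // Fail fast once the task has been marked as killed, so a cancelled task
    // stops consuming resources instead of running on as a zombie.
    if (taskContext != null && taskContext.isInterrupted()) {
      throw new TaskKilledException
    }
    delegate.next()
  }
}
```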
spark git commit: [SPARK-18761][BRANCH-2.0] Introduce "task reaper" to oversee task killing in executors
Repository: spark Updated Branches: refs/heads/branch-2.0 1f0c5fa75 -> 678d91c1d [SPARK-18761][BRANCH-2.0] Introduce "task reaper" to oversee task killing in executors Branch-2.0 backport of #16189; original description follows: ## What changes were proposed in this pull request? Spark's current task cancellation / task killing mechanism is "best effort" because some tasks may not be interruptible or may not respond to their "killed" flags being set. If a significant fraction of a cluster's task slots are occupied by tasks that have been marked as killed but remain running then this can lead to a situation where new jobs and tasks are starved of resources that are being used by these zombie tasks. This patch aims to address this problem by adding a "task reaper" mechanism to executors. At a high-level, task killing now launches a new thread which attempts to kill the task and then watches the task and periodically checks whether it has been killed. The TaskReaper will periodically re-attempt to call `TaskRunner.kill()` and will log warnings if the task keeps running. I modified TaskRunner to rename its thread at the start of the task, allowing TaskReaper to take a thread dump and filter it in order to log stacktraces from the exact task thread that we are waiting to finish. If the task has not stopped after a configurable timeout then the TaskReaper will throw an exception to trigger executor JVM death, thereby forcibly freeing any resources consumed by the zombie tasks. This feature is flagged off by default and is controlled by four new configurations under the `spark.task.reaper.*` namespace. See the updated `configuration.md` doc for details. ## How was this patch tested? Tested via a new test case in `JobCancellationSuite`, plus manual testing. Author: Josh Rosen Closes #16358 from JoshRosen/cancellation-branch-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/678d91c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/678d91c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/678d91c1 Branch: refs/heads/branch-2.0 Commit: 678d91c1d2283d9965a39656af9d383bad093ba8 Parents: 1f0c5fa Author: Josh Rosen Authored: Tue Dec 20 15:56:56 2016 -0800 Committer: Yin Huai Committed: Tue Dec 20 15:56:56 2016 -0800 -- .../org/apache/spark/executor/Executor.scala| 169 ++- .../scala/org/apache/spark/util/Utils.scala | 26 ++- .../org/apache/spark/JobCancellationSuite.scala | 77 + docs/configuration.md | 42 + 4 files changed, 300 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/678d91c1/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 9a017f2..93e994b 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -84,6 +84,16 @@ private[spark] class Executor( // Start worker thread pool private val threadPool = ThreadUtils.newDaemonCachedThreadPool("Executor task launch worker") private val executorSource = new ExecutorSource(threadPool, executorId) + // Pool used for threads that supervise task killing / cancellation + private val taskReaperPool = ThreadUtils.newDaemonCachedThreadPool("Task reaper") + // For tasks which are in the process of being killed, this map holds the most recently created + // TaskReaper. 
All accesses to this map should be synchronized on the map itself (this isn't + // a ConcurrentHashMap because we use the synchronization for purposes other than simply guarding + // the integrity of the map's internal state). The purpose of this map is to prevent the creation + // of a separate TaskReaper for every killTask() of a given task. Instead, this map allows us to + // track whether an existing TaskReaper fulfills the role of a TaskReaper that we would otherwise + // create. The map key is a task id. + private val taskReaperForTask: HashMap[Long, TaskReaper] = HashMap[Long, TaskReaper]() if (!isLocal) { env.metricsSystem.registerSource(executorSource) @@ -93,6 +103,9 @@ private[spark] class Executor( // Whether to load classes in user jars before those in Spark jars private val userClassPathFirst = conf.getBoolean("spark.executor.userClassPathFirst", false) + // Whether to monitor killed / interrupted tasks + private val taskReaperEnabled = conf.getBoolean("spark.task.reaper.enabled", false) + // Create our ClassLoader // do this after SparkEnv creation so can access the SecurityManager
spark git commit: [SPARK-18761][CORE] Introduce "task reaper" to oversee task killing in executors
Repository: spark Updated Branches: refs/heads/master 5857b9ac2 -> fa829ce21 [SPARK-18761][CORE] Introduce "task reaper" to oversee task killing in executors ## What changes were proposed in this pull request? Spark's current task cancellation / task killing mechanism is "best effort" because some tasks may not be interruptible or may not respond to their "killed" flags being set. If a significant fraction of a cluster's task slots are occupied by tasks that have been marked as killed but remain running then this can lead to a situation where new jobs and tasks are starved of resources that are being used by these zombie tasks. This patch aims to address this problem by adding a "task reaper" mechanism to executors. At a high-level, task killing now launches a new thread which attempts to kill the task and then watches the task and periodically checks whether it has been killed. The TaskReaper will periodically re-attempt to call `TaskRunner.kill()` and will log warnings if the task keeps running. I modified TaskRunner to rename its thread at the start of the task, allowing TaskReaper to take a thread dump and filter it in order to log stacktraces from the exact task thread that we are waiting to finish. If the task has not stopped after a configurable timeout then the TaskReaper will throw an exception to trigger executor JVM death, thereby forcibly freeing any resources consumed by the zombie tasks. This feature is flagged off by default and is controlled by four new configurations under the `spark.task.reaper.*` namespace. See the updated `configuration.md` doc for details. ## How was this patch tested? Tested via a new test case in `JobCancellationSuite`, plus manual testing. Author: Josh Rosen Closes #16189 from JoshRosen/cancellation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa829ce2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa829ce2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa829ce2 Branch: refs/heads/master Commit: fa829ce21fb84028d90b739a49c4ece70a17ccfd Parents: 5857b9a Author: Josh Rosen Authored: Mon Dec 19 18:43:59 2016 -0800 Committer: Yin Huai Committed: Mon Dec 19 18:43:59 2016 -0800 -- .../org/apache/spark/executor/Executor.scala| 169 ++- .../scala/org/apache/spark/util/Utils.scala | 56 +++--- .../org/apache/spark/JobCancellationSuite.scala | 77 + docs/configuration.md | 42 + 4 files changed, 316 insertions(+), 28 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa829ce2/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 9501dd9..3346f6d 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -84,6 +84,16 @@ private[spark] class Executor( // Start worker thread pool private val threadPool = ThreadUtils.newDaemonCachedThreadPool("Executor task launch worker") private val executorSource = new ExecutorSource(threadPool, executorId) + // Pool used for threads that supervise task killing / cancellation + private val taskReaperPool = ThreadUtils.newDaemonCachedThreadPool("Task reaper") + // For tasks which are in the process of being killed, this map holds the most recently created + // TaskReaper. 
All accesses to this map should be synchronized on the map itself (this isn't + // a ConcurrentHashMap because we use the synchronization for purposes other than simply guarding + // the integrity of the map's internal state). The purpose of this map is to prevent the creation + // of a separate TaskReaper for every killTask() of a given task. Instead, this map allows us to + // track whether an existing TaskReaper fulfills the role of a TaskReaper that we would otherwise + // create. The map key is a task id. + private val taskReaperForTask: HashMap[Long, TaskReaper] = HashMap[Long, TaskReaper]() if (!isLocal) { env.metricsSystem.registerSource(executorSource) @@ -93,6 +103,9 @@ private[spark] class Executor( // Whether to load classes in user jars before those in Spark jars private val userClassPathFirst = conf.getBoolean("spark.executor.userClassPathFirst", false) + // Whether to monitor killed / interrupted tasks + private val taskReaperEnabled = conf.getBoolean("spark.task.reaper.enabled", false) + // Create our ClassLoader // do this after SparkEnv creation so can access the SecurityManager private val urlClassLoader = createClassLoader() @@ -148,9 +161,27 @@ private[spa
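Since the reaper is flagged off by default, opting in is a one-line configuration change. A minimal sketch follows; only `spark.task.reaper.enabled` is taken from the diff above, and the remaining `spark.task.reaper.*` knobs (polling interval, kill timeout, and so on) are described in the updated `configuration.md` rather than spelled out here.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Enable the task reaper so that killed-but-still-running tasks are watched by
// a dedicated TaskReaper thread instead of relying only on the kill flag.
val conf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("task-reaper-demo")
  .set("spark.task.reaper.enabled", "true")
val sc = new SparkContext(conf)
```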
spark git commit: [SPARK-18921][SQL] check database existence with Hive.databaseExists instead of getDatabase
Repository: spark Updated Branches: refs/heads/branch-2.1 fc1b25660 -> c1a26b458 [SPARK-18921][SQL] check database existence with Hive.databaseExists instead of getDatabase ## What changes were proposed in this pull request? It's weird that we use `Hive.getDatabase` to check the existence of a database, while Hive has a `databaseExists` interface. What's worse, `Hive.getDatabase` will produce an error message if the database doesn't exist, which is annoying when we only want to check the database existence. This PR fixes this and use `Hive.databaseExists` to check database existence. ## How was this patch tested? N/A Author: Wenchen Fan Closes #16332 from cloud-fan/minor. (cherry picked from commit 7a75ee1c9224aa5c2e954fe2a71f9ad506f6782b) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1a26b45 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1a26b45 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1a26b45 Branch: refs/heads/branch-2.1 Commit: c1a26b458dd353be3ab1a2b3f9bb80809cf63479 Parents: fc1b256 Author: Wenchen Fan Authored: Mon Dec 19 11:42:59 2016 -0800 Committer: Yin Huai Committed: Mon Dec 19 11:43:55 2016 -0800 -- .../apache/spark/sql/hive/HiveExternalCatalog.scala| 2 +- .../org/apache/spark/sql/hive/client/HiveClient.scala | 8 +++- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 12 .../apache/spark/sql/hive/client/VersionsSuite.scala | 13 +++-- 4 files changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c1a26b45/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index f67ddc9..f321c45 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -167,7 +167,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } override def databaseExists(db: String): Boolean = withClient { -client.getDatabaseOption(db).isDefined +client.databaseExists(db) } override def listDatabases(): Seq[String] = withClient { http://git-wip-us.apache.org/repos/asf/spark/blob/c1a26b45/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 8e7c871..0be5b0b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -58,12 +58,10 @@ private[hive] trait HiveClient { def setCurrentDatabase(databaseName: String): Unit /** Returns the metadata for specified database, throwing an exception if it doesn't exist */ - final def getDatabase(name: String): CatalogDatabase = { -getDatabaseOption(name).getOrElse(throw new NoSuchDatabaseException(name)) - } + def getDatabase(name: String): CatalogDatabase - /** Returns the metadata for a given database, or None if it doesn't exist. */ - def getDatabaseOption(name: String): Option[CatalogDatabase] + /** Return whether a table/view with the specified name exists. */ + def databaseExists(dbName: String): Boolean /** List the names of all the databases that match the specified pattern. 
*/ def listDatabases(pattern: String): Seq[String] http://git-wip-us.apache.org/repos/asf/spark/blob/c1a26b45/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index db73596..e0f7156 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -300,7 +300,7 @@ private[hive] class HiveClientImpl( } override def setCurrentDatabase(databaseName: String): Unit = withHiveState { -if (getDatabaseOption(databaseName).isDefined) { +if (databaseExists(databaseName)) { state.setCurrentDatabase(databaseName) } else { throw new NoSuchDatabaseException(databaseName) @@ -336,14 +336,18 @@ private[hive] class HiveClientImpl(
spark git commit: [SPARK-18921][SQL] check database existence with Hive.databaseExists instead of getDatabase
Repository: spark Updated Branches: refs/heads/master 24482858e -> 7a75ee1c9 [SPARK-18921][SQL] check database existence with Hive.databaseExists instead of getDatabase ## What changes were proposed in this pull request? It's weird that we use `Hive.getDatabase` to check the existence of a database, while Hive has a `databaseExists` interface. What's worse, `Hive.getDatabase` will produce an error message if the database doesn't exist, which is annoying when we only want to check the database existence. This PR fixes this and use `Hive.databaseExists` to check database existence. ## How was this patch tested? N/A Author: Wenchen Fan Closes #16332 from cloud-fan/minor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a75ee1c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a75ee1c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a75ee1c Branch: refs/heads/master Commit: 7a75ee1c9224aa5c2e954fe2a71f9ad506f6782b Parents: 2448285 Author: Wenchen Fan Authored: Mon Dec 19 11:42:59 2016 -0800 Committer: Yin Huai Committed: Mon Dec 19 11:42:59 2016 -0800 -- .../apache/spark/sql/hive/HiveExternalCatalog.scala| 2 +- .../org/apache/spark/sql/hive/client/HiveClient.scala | 8 +++- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 12 .../apache/spark/sql/hive/client/VersionsSuite.scala | 13 +++-- 4 files changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a75ee1c/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 544f277..9c19a0e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -167,7 +167,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } override def databaseExists(db: String): Boolean = withClient { -client.getDatabaseOption(db).isDefined +client.databaseExists(db) } override def listDatabases(): Seq[String] = withClient { http://git-wip-us.apache.org/repos/asf/spark/blob/7a75ee1c/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 837b6c5..8bdcf31 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -58,12 +58,10 @@ private[hive] trait HiveClient { def setCurrentDatabase(databaseName: String): Unit /** Returns the metadata for specified database, throwing an exception if it doesn't exist */ - final def getDatabase(name: String): CatalogDatabase = { -getDatabaseOption(name).getOrElse(throw new NoSuchDatabaseException(name)) - } + def getDatabase(name: String): CatalogDatabase - /** Returns the metadata for a given database, or None if it doesn't exist. */ - def getDatabaseOption(name: String): Option[CatalogDatabase] + /** Return whether a table/view with the specified name exists. */ + def databaseExists(dbName: String): Boolean /** List the names of all the databases that match the specified pattern. 
*/ def listDatabases(pattern: String): Seq[String] http://git-wip-us.apache.org/repos/asf/spark/blob/7a75ee1c/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index b75f6e9..bacae8a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -300,7 +300,7 @@ private[hive] class HiveClientImpl( } override def setCurrentDatabase(databaseName: String): Unit = withHiveState { -if (getDatabaseOption(databaseName).isDefined) { +if (databaseExists(databaseName)) { state.setCurrentDatabase(databaseName) } else { throw new NoSuchDatabaseException(databaseName) @@ -336,14 +336,18 @@ private[hive] class HiveClientImpl( Option(database.properties).map(_.asJava).orNull)) } - override def getDatabaseOption(na
spark git commit: [SPARK-13747][CORE] Fix potential ThreadLocal leaks in RPC when using ForkJoinPool
Repository: spark Updated Branches: refs/heads/master d53f18cae -> fb3081d3b [SPARK-13747][CORE] Fix potential ThreadLocal leaks in RPC when using ForkJoinPool ## What changes were proposed in this pull request? Some places in SQL may call `RpcEndpointRef.askWithRetry` (e.g., ParquetFileFormat.buildReader -> SparkContext.broadcast -> ... -> BlockManagerMaster.updateBlockInfo -> RpcEndpointRef.askWithRetry), which will finally call `Await.result`. It may cause `java.lang.IllegalArgumentException: spark.sql.execution.id is already set` when running in Scala ForkJoinPool. This PR includes the following changes to fix this issue: - Remove `ThreadUtils.awaitResult` - Rename `ThreadUtils. awaitResultInForkJoinSafely` to `ThreadUtils.awaitResult` - Replace `Await.result` in RpcTimeout with `ThreadUtils.awaitResult`. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #16230 from zsxwing/fix-SPARK-13747. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb3081d3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb3081d3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb3081d3 Branch: refs/heads/master Commit: fb3081d3b38a50aa5e023c603e1b191e57f7c876 Parents: d53f18c Author: Shixiong Zhu Authored: Tue Dec 13 09:53:22 2016 -0800 Committer: Yin Huai Committed: Tue Dec 13 09:53:22 2016 -0800 -- .../scala/org/apache/spark/rpc/RpcTimeout.scala | 12 ++ .../org/apache/spark/util/ThreadUtils.scala | 41 .../apache/spark/rdd/AsyncRDDActionsSuite.scala | 3 +- .../OutputCommitCoordinatorSuite.scala | 3 +- scalastyle-config.xml | 1 - .../sql/execution/basicPhysicalOperators.scala | 2 +- .../exchange/BroadcastExchangeExec.scala| 3 +- 7 files changed, 23 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb3081d3/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala -- diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala index 2761d39..efd2648 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala @@ -24,7 +24,7 @@ import scala.concurrent.duration._ import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.util.Utils +import org.apache.spark.util.{ThreadUtils, Utils} /** * An exception thrown if RpcTimeout modifies a [[TimeoutException]]. 
@@ -72,15 +72,9 @@ private[spark] class RpcTimeout(val duration: FiniteDuration, val timeoutProp: S * is still not ready */ def awaitResult[T](future: Future[T]): T = { -val wrapAndRethrow: PartialFunction[Throwable, T] = { - case NonFatal(t) => -throw new SparkException("Exception thrown in awaitResult", t) -} try { - // scalastyle:off awaitresult - Await.result(future, duration) - // scalastyle:on awaitresult -} catch addMessageIfTimeout.orElse(wrapAndRethrow) + ThreadUtils.awaitResult(future, duration) +} catch addMessageIfTimeout } } http://git-wip-us.apache.org/repos/asf/spark/blob/fb3081d3/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 60a6e82..1aa4456 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -19,7 +19,7 @@ package org.apache.spark.util import java.util.concurrent._ -import scala.concurrent.{Await, Awaitable, ExecutionContext, ExecutionContextExecutor} +import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor} import scala.concurrent.duration.Duration import scala.concurrent.forkjoin.{ForkJoinPool => SForkJoinPool, ForkJoinWorkerThread => SForkJoinWorkerThread} import scala.util.control.NonFatal @@ -180,39 +180,30 @@ private[spark] object ThreadUtils { // scalastyle:off awaitresult /** - * Preferred alternative to `Await.result()`. This method wraps and re-throws any exceptions - * thrown by the underlying `Await` call, ensuring that this thread's stack trace appears in - * logs. - */ - @throws(classOf[SparkException]) - def awaitResult[T](awaitable: Awaitable[T], atMost: Duration): T = { -try { - Await.result(awaitable, atMost) - // scalastyle:on awaitresult -} catch { - case NonFatal(t) => -throw new SparkException("Exception thrown in awaitResult
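As a usage sketch of the consolidated helper: `ThreadUtils` is `private[spark]`, so the snippet below is placed in an `org.apache.spark.*` package purely to make it visible; the package and object names are made up and this is illustrative only, not code from the patch.

```scala
package org.apache.spark.example

import scala.concurrent.Future
import scala.concurrent.duration._

import org.apache.spark.util.ThreadUtils

object AwaitResultDemo {
  def main(args: Array[String]): Unit = {
    val f: Future[Int] = Future.successful(42)
    // Unlike a bare Await.result, this wraps failures in SparkException so the
    // waiting thread's stack trace shows up in logs, and per this patch it
    // avoids ForkJoinPool's managed blocking, which could run other code on the
    // same thread and clobber thread-local state such as spark.sql.execution.id.
    println(ThreadUtils.awaitResult(f, 10.seconds))
  }
}
```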
spark git commit: [SPARK-18675][SQL] CTAS for hive serde table should work for all hive versions
Repository: spark Updated Branches: refs/heads/master 096f868b7 -> d53f18cae [SPARK-18675][SQL] CTAS for hive serde table should work for all hive versions ## What changes were proposed in this pull request? Before hive 1.1, when inserting into a table, hive will create the staging directory under a common scratch directory. After the writing is finished, hive will simply empty the table directory and move the staging directory to it. After hive 1.1, hive will create the staging directory under the table directory, and when moving staging directory to table directory, hive will still empty the table directory, but will exclude the staging directory there. In `InsertIntoHiveTable`, we simply copy the code from hive 1.2, which means we will always create the staging directory under the table directory, no matter what the hive version is. This causes problems if the hive version is prior to 1.1, because the staging directory will be removed by hive when hive is trying to empty the table directory. This PR copies the code from hive 0.13, so that we have 2 branches to create staging directory. If hive version is prior to 1.1, we'll go to the old style branch(i.e. create the staging directory under a common scratch directory), else, go to the new style branch(i.e. create the staging directory under the table directory) ## How was this patch tested? new test Author: Wenchen Fan Closes #16104 from cloud-fan/hive-0.13. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d53f18ca Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d53f18ca Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d53f18ca Branch: refs/heads/master Commit: d53f18cae41c6c77a0cff3f1fd266e4c1b9ea79a Parents: 096f868 Author: Wenchen Fan Authored: Tue Dec 13 09:46:58 2016 -0800 Committer: Yin Huai Committed: Tue Dec 13 09:46:58 2016 -0800 -- .../hive/execution/InsertIntoHiveTable.scala| 68 +--- .../spark/sql/hive/client/VersionsSuite.scala | 19 +- 2 files changed, 75 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d53f18ca/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index db2239d..82c7b1a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -22,7 +22,6 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.exec.TaskRunner @@ -86,6 +85,7 @@ case class InsertIntoHiveTable( val hadoopConf = sessionState.newHadoopConf() val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") + val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive") private def executionId: String = { val rand: Random = new Random @@ -93,7 +93,7 @@ case class InsertIntoHiveTable( "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } - private def getStagingDir(inputPath: Path, hadoopConf: Configuration): Path = { + private def getStagingDir(inputPath: Path): Path = { val inputPathUri: URI = inputPath.toUri val inputPathName: String = 
inputPathUri.getPath val fs: FileSystem = inputPath.getFileSystem(hadoopConf) @@ -121,21 +121,69 @@ case class InsertIntoHiveTable( return dir } - private def getExternalScratchDir(extURI: URI, hadoopConf: Configuration): Path = { -getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), hadoopConf) + private def getExternalScratchDir(extURI: URI): Path = { +getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath)) } - def getExternalTmpPath(path: Path, hadoopConf: Configuration): Path = { + def getExternalTmpPath(path: Path): Path = { +import org.apache.spark.sql.hive.client.hive._ + +val hiveVersion = externalCatalog.asInstanceOf[HiveExternalCatalog].client.version +// Before Hive 1.1, when inserting into a table, Hive will create the staging directory under +// a common scratch directory. After the writing is finished, Hive will simply empty the table +// directory and move the staging directory to it. +// After Hive 1.1, Hive will create the staging directory under the table directory, and when +//
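A standalone sketch of the version-dependent rule described above; the helper name, directory names, and the boolean flag are stand-ins rather than the exact logic in `InsertIntoHiveTable`, which derives the branch from the metastore client's version.

```scala
import org.apache.hadoop.fs.Path

// Decide where the staging directory should live, depending on which staging
// behaviour the connected Hive version exhibits.
def stagingLocation(tableDir: Path, scratchDir: String, hive11OrLater: Boolean): Path =
  if (hive11OrLater) {
    // Hive >= 1.1 style: stage under the table directory; Hive excludes the
    // staging directory when it empties the table directory during the final move.
    new Path(tableDir, ".hive-staging")
  } else {
    // Pre-1.1 style: stage under a common scratch directory so the staging data
    // survives Hive emptying the table directory.
    new Path(scratchDir, ".hive-staging")
  }
```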
spark git commit: [SPARK-18631][SQL] Changed ExchangeCoordinator re-partitioning to avoid more data skew
Repository: spark Updated Branches: refs/heads/master d57a594b8 -> f8878a4c6 [SPARK-18631][SQL] Changed ExchangeCoordinator re-partitioning to avoid more data skew ## What changes were proposed in this pull request? Re-partitioning logic in ExchangeCoordinator changed so that adding another pre-shuffle partition to the post-shuffle partition will not be done if doing so would cause the size of the post-shuffle partition to exceed the target partition size. ## How was this patch tested? Existing tests updated to reflect new expectations. Author: Mark Hamstra Closes #16065 from markhamstra/SPARK-17064. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8878a4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8878a4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8878a4c Branch: refs/heads/master Commit: f8878a4c6f7c4ebb16e4aef26ad0869ba12eb9fc Parents: d57a594 Author: Mark Hamstra Authored: Tue Nov 29 15:01:12 2016 -0800 Committer: Yin Huai Committed: Tue Nov 29 15:01:12 2016 -0800 -- .../exchange/ExchangeCoordinator.scala | 32 .../execution/ExchangeCoordinatorSuite.scala| 40 ++-- 2 files changed, 35 insertions(+), 37 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f8878a4c/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala index 57da85f..deb2c24 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala @@ -69,15 +69,18 @@ import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan} * post-shuffle partition. Once we have size statistics of pre-shuffle partitions from stages * corresponding to the registered [[ShuffleExchange]]s, we will do a pass of those statistics and * pack pre-shuffle partitions with continuous indices to a single post-shuffle partition until - * the size of a post-shuffle partition is equal or greater than the target size. + * adding another pre-shuffle partition would cause the size of a post-shuffle partition to be + * greater than the target size. + * * For example, we have two stages with the following pre-shuffle partition size statistics: * stage 1: [100 MB, 20 MB, 100 MB, 10MB, 30 MB] * stage 2: [10 MB, 10 MB, 70 MB, 5 MB, 5 MB] * assuming the target input size is 128 MB, we will have three post-shuffle partitions, * which are: - * - post-shuffle partition 0: pre-shuffle partition 0 and 1 - * - post-shuffle partition 1: pre-shuffle partition 2 - * - post-shuffle partition 2: pre-shuffle partition 3 and 4 + * - post-shuffle partition 0: pre-shuffle partition 0 (size 110 MB) + * - post-shuffle partition 1: pre-shuffle partition 1 (size 30 MB) + * - post-shuffle partition 2: pre-shuffle partition 2 (size 170 MB) + * - post-shuffle partition 3: pre-shuffle partition 3 and 4 (size 50 MB) */ class ExchangeCoordinator( numExchanges: Int, @@ -164,25 +167,20 @@ class ExchangeCoordinator( while (i < numPreShufflePartitions) { // We calculate the total size of ith pre-shuffle partitions from all pre-shuffle stages. // Then, we add the total size to postShuffleInputSize. 
+ var nextShuffleInputSize = 0L var j = 0 while (j < mapOutputStatistics.length) { -postShuffleInputSize += mapOutputStatistics(j).bytesByPartitionId(i) +nextShuffleInputSize += mapOutputStatistics(j).bytesByPartitionId(i) j += 1 } - // If the current postShuffleInputSize is equal or greater than the - // targetPostShuffleInputSize, We need to add a new element in partitionStartIndices. - if (postShuffleInputSize >= targetPostShuffleInputSize) { -if (i < numPreShufflePartitions - 1) { - // Next start index. - partitionStartIndices += i + 1 -} else { - // This is the last element. So, we do not need to append the next start index to - // partitionStartIndices. -} + // If including the nextShuffleInputSize would exceed the target partition size, then start a + // new partition. + if (i > 0 && postShuffleInputSize + nextShuffleInputSize > targetPostShuffleInputSize) { +partitionStartIndices += i // reset postShuffleInputSize. -postShuffleInputSize = 0L - } +postShuffleInputSize = nextShuffleInputSize + } else postShuffleInputSi
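The new packing rule can be restated in a few lines. The sketch below is a standalone re-statement of the logic, not the ExchangeCoordinator code itself, applied to the worked example from the updated comment (the two stages' per-partition sizes sum to 110 MB, 30 MB, 170 MB, 15 MB and 35 MB against a 128 MB target).

```scala
object CoalesceSketch {
  // Start a new post-shuffle partition only when adding the next pre-shuffle
  // partition would push the current post-shuffle partition past the target.
  def partitionStartIndices(sizes: Array[Long], target: Long): Seq[Int] = {
    val starts = scala.collection.mutable.ArrayBuffer(0)
    var current = 0L
    for (i <- sizes.indices) {
      if (i > 0 && current + sizes(i) > target) {
        starts += i
        current = sizes(i)
      } else {
        current += sizes(i)
      }
    }
    starts
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024L
    val combined = Array(110L, 30L, 170L, 15L, 35L).map(_ * mb)
    // Prints ArrayBuffer(0, 1, 2, 3): post-shuffle partitions [0], [1], [2] and
    // [3, 4], matching the example in the updated ExchangeCoordinator comment.
    println(partitionStartIndices(combined, 128L * mb))
  }
}
```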
spark git commit: [SPARK-18602] Set the version of org.codehaus.janino:commons-compiler to 3.0.0 to match the version of org.codehaus.janino:janino
Repository: spark Updated Branches: refs/heads/branch-2.1 32b259fae -> 34ad4d520 [SPARK-18602] Set the version of org.codehaus.janino:commons-compiler to 3.0.0 to match the version of org.codehaus.janino:janino ## What changes were proposed in this pull request? org.codehaus.janino:janino depends on org.codehaus.janino:commons-compiler and we have been upgraded to org.codehaus.janino:janino 3.0.0. However, seems we are still pulling in org.codehaus.janino:commons-compiler 2.7.6 because of calcite. It looks like an accident because we exclude janino from calcite (see here https://github.com/apache/spark/blob/branch-2.1/pom.xml#L1759). So, this PR upgrades org.codehaus.janino:commons-compiler to 3.0.0. ## How was this patch tested? jenkins Author: Yin Huai Closes #16025 from yhuai/janino-commons-compile. (cherry picked from commit eba727757ed5dc23c635e1926795aea62ec0fc66) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34ad4d52 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34ad4d52 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34ad4d52 Branch: refs/heads/branch-2.1 Commit: 34ad4d520ae0e4302972097c5985ab2c5a8d5e04 Parents: 32b259f Author: Yin Huai Authored: Mon Nov 28 10:09:30 2016 -0800 Committer: Yin Huai Committed: Mon Nov 28 10:09:50 2016 -0800 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- pom.xml| 9 + sql/catalyst/pom.xml | 4 7 files changed, 18 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4d52/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index bbdea06..89bfcef 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -24,7 +24,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4d52/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index a2dec41..8df3858 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -27,7 +27,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4d52/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index c1f02b9..71e7fb6 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -27,7 +27,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4d52/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 4f04636..ba31391 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ 
-31,7 +31,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4d52/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index da3af9f..b129e5a 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -31,7 +31,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/34ad4
spark git commit: [SPARK-18602] Set the version of org.codehaus.janino:commons-compiler to 3.0.0 to match the version of org.codehaus.janino:janino
Repository: spark Updated Branches: refs/heads/master 237c3b964 -> eba727757 [SPARK-18602] Set the version of org.codehaus.janino:commons-compiler to 3.0.0 to match the version of org.codehaus.janino:janino ## What changes were proposed in this pull request? org.codehaus.janino:janino depends on org.codehaus.janino:commons-compiler and we have been upgraded to org.codehaus.janino:janino 3.0.0. However, seems we are still pulling in org.codehaus.janino:commons-compiler 2.7.6 because of calcite. It looks like an accident because we exclude janino from calcite (see here https://github.com/apache/spark/blob/branch-2.1/pom.xml#L1759). So, this PR upgrades org.codehaus.janino:commons-compiler to 3.0.0. ## How was this patch tested? jenkins Author: Yin Huai Closes #16025 from yhuai/janino-commons-compile. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eba72775 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eba72775 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eba72775 Branch: refs/heads/master Commit: eba727757ed5dc23c635e1926795aea62ec0fc66 Parents: 237c3b9 Author: Yin Huai Authored: Mon Nov 28 10:09:30 2016 -0800 Committer: Yin Huai Committed: Mon Nov 28 10:09:30 2016 -0800 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- pom.xml| 9 + sql/catalyst/pom.xml | 4 7 files changed, 18 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index bbdea06..89bfcef 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -24,7 +24,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index a2dec41..8df3858 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -27,7 +27,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index c1f02b9..71e7fb6 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -27,7 +27,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 4f04636..ba31391 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -31,7 +31,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar 
commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index da3af9f..b129e5a 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -31,7 +31,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-2.7.6.jar +commons-compiler-3.0.0.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/eba72775/pom.xml -- diff --git a/pom.xm
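The fix above pins the version through Maven's dependencyManagement. For readers who build against these artifacts with sbt, an equivalent pin in `build.sbt` would be a dependency override with the same coordinates; this is illustrative only and not part of the commit.

```scala
// build.sbt (sketch): keep commons-compiler on the same line as janino 3.0.0,
// overriding the 2.7.6 version that calcite would otherwise pull in transitively.
dependencyOverrides += "org.codehaus.janino" % "commons-compiler" % "3.0.0"
```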
spark git commit: [SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database
Repository: spark Updated Branches: refs/heads/branch-2.1 978798880 -> fc466be4f [SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database ## What changes were proposed in this pull request? The current semantic of the warehouse config: 1. it's a static config, which means you can't change it once your spark application is launched. 2. Once a database is created, its location won't change even the warehouse path config is changed. 3. default database is a special case, although its location is fixed, but the locations of tables created in it are not. If a Spark app starts with warehouse path B(while the location of default database is A), then users create a table `tbl` in default database, its location will be `B/tbl` instead of `A/tbl`. If uses change the warehouse path config to C, and create another table `tbl2`, its location will still be `B/tbl2` instead of `C/tbl2`. rule 3 doesn't make sense and I think we made it by mistake, not intentionally. Data source tables don't follow rule 3 and treat default database like normal ones. This PR fixes hive serde tables to make it consistent with data source tables. ## How was this patch tested? HiveSparkSubmitSuite Author: Wenchen Fan Closes #15812 from cloud-fan/default-db. (cherry picked from commit ce13c2672318242748f7520ed4ce6bcfad4fb428) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc466be4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc466be4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc466be4 Branch: refs/heads/branch-2.1 Commit: fc466be4fd8def06880f59d50e5567c22cc53d6a Parents: 9787988 Author: Wenchen Fan Authored: Thu Nov 17 17:31:12 2016 -0800 Committer: Yin Huai Committed: Thu Nov 17 17:31:43 2016 -0800 -- .../spark/sql/hive/HiveExternalCatalog.scala| 237 ++- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 76 +- 2 files changed, 190 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc466be4/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 8433058..cacffcf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -197,136 +197,151 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat if (tableDefinition.tableType == VIEW) { client.createTable(tableDefinition, ignoreIfExists) -} else if (tableDefinition.provider.get == DDLUtils.HIVE_PROVIDER) { - // Here we follow data source tables and put table metadata like provider, schema, etc. in - // table properties, so that we can work around the Hive metastore issue about not case - // preserving and make Hive serde table support mixed-case column names. - val tableWithDataSourceProps = tableDefinition.copy( -properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition)) - client.createTable(tableWithDataSourceProps, ignoreIfExists) } else { - // To work around some hive metastore issues, e.g. not case-preserving, bad decimal type - // support, no column nullability, etc., we should do some extra works before saving table - // metadata into Hive metastore: - // 1. Put table metadata like provider, schema, etc. 
in table properties. - // 2. Check if this table is hive compatible. - //2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket - // spec to empty and save table metadata to Hive. - //2.2 If it's hive compatible, set serde information in table metadata and try to save - // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 - val tableProperties = tableMetaToTableProps(tableDefinition) - // Ideally we should not create a managed table with location, but Hive serde table can // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have // to create the table directory and write out data before we create this table, to avoid // exposing a partial written table. val needDefaultTableLocation = tableDefinition.tableType == MANAGED && tableDefinition.storage.locationUri.isEmpty + val tableLocation = if (needDefaultTableLocation) { Some(defaultTablePath(tableDefinition.identif
spark git commit: [SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database
Repository: spark Updated Branches: refs/heads/master b0aa1aa1a -> ce13c2672 [SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database ## What changes were proposed in this pull request? The current semantic of the warehouse config: 1. it's a static config, which means you can't change it once your spark application is launched. 2. Once a database is created, its location won't change even the warehouse path config is changed. 3. default database is a special case, although its location is fixed, but the locations of tables created in it are not. If a Spark app starts with warehouse path B(while the location of default database is A), then users create a table `tbl` in default database, its location will be `B/tbl` instead of `A/tbl`. If uses change the warehouse path config to C, and create another table `tbl2`, its location will still be `B/tbl2` instead of `C/tbl2`. rule 3 doesn't make sense and I think we made it by mistake, not intentionally. Data source tables don't follow rule 3 and treat default database like normal ones. This PR fixes hive serde tables to make it consistent with data source tables. ## How was this patch tested? HiveSparkSubmitSuite Author: Wenchen Fan Closes #15812 from cloud-fan/default-db. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce13c267 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce13c267 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce13c267 Branch: refs/heads/master Commit: ce13c2672318242748f7520ed4ce6bcfad4fb428 Parents: b0aa1aa Author: Wenchen Fan Authored: Thu Nov 17 17:31:12 2016 -0800 Committer: Yin Huai Committed: Thu Nov 17 17:31:12 2016 -0800 -- .../spark/sql/hive/HiveExternalCatalog.scala| 237 ++- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 76 +- 2 files changed, 190 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ce13c267/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 8433058..cacffcf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -197,136 +197,151 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat if (tableDefinition.tableType == VIEW) { client.createTable(tableDefinition, ignoreIfExists) -} else if (tableDefinition.provider.get == DDLUtils.HIVE_PROVIDER) { - // Here we follow data source tables and put table metadata like provider, schema, etc. in - // table properties, so that we can work around the Hive metastore issue about not case - // preserving and make Hive serde table support mixed-case column names. - val tableWithDataSourceProps = tableDefinition.copy( -properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition)) - client.createTable(tableWithDataSourceProps, ignoreIfExists) } else { - // To work around some hive metastore issues, e.g. not case-preserving, bad decimal type - // support, no column nullability, etc., we should do some extra works before saving table - // metadata into Hive metastore: - // 1. Put table metadata like provider, schema, etc. in table properties. - // 2. Check if this table is hive compatible. 
- //2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket - // spec to empty and save table metadata to Hive. - //2.2 If it's hive compatible, set serde information in table metadata and try to save - // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 - val tableProperties = tableMetaToTableProps(tableDefinition) - // Ideally we should not create a managed table with location, but Hive serde table can // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have // to create the table directory and write out data before we create this table, to avoid // exposing a partial written table. val needDefaultTableLocation = tableDefinition.tableType == MANAGED && tableDefinition.storage.locationUri.isEmpty + val tableLocation = if (needDefaultTableLocation) { Some(defaultTablePath(tableDefinition.identifier)) } else { tableDefinition.storage.locationUri } - // Ideally we should
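The rule-3 behaviour described above is easiest to see with a concrete walk-through. The sketch below is illustrative only: the warehouse paths, the table name, and the `spark-shell` invocation are assumptions, not part of this patch; it simply restates the before/after behaviour from the description.

```scala
// Assume the default database was created by an earlier app at /warehouse/A,
// and this session is launched with a different warehouse path:
//   spark-shell --conf spark.sql.warehouse.dir=/warehouse/B

spark.sql("CREATE TABLE tbl (id INT) STORED AS PARQUET")
spark.sql("DESC FORMATTED tbl").show(truncate = false)

// Before this patch (Hive serde tables): Location is /warehouse/B/tbl,
// i.e. it tracks the warehouse config rather than the default database.
// After this patch: Location is /warehouse/A/tbl, consistent with how
// data source tables already resolved the default table path.
```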
spark git commit: [SPARK-18186] Migrate HiveUDAFFunction to TypedImperativeAggregate for partial aggregation support
Repository: spark Updated Branches: refs/heads/master a36a76ac4 -> 2ca8ae9aa [SPARK-18186] Migrate HiveUDAFFunction to TypedImperativeAggregate for partial aggregation support ## What changes were proposed in this pull request? While being evaluated in Spark SQL, Hive UDAFs don't support partial aggregation. This PR migrates `HiveUDAFFunction`s to `TypedImperativeAggregate`, which already provides partial aggregation support for aggregate functions that may use arbitrary Java objects as aggregation states. The following snippet shows the effect of this PR: ```scala import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax sql(s"CREATE FUNCTION hive_max AS '${classOf[GenericUDAFMax].getName}'") spark.range(100).createOrReplaceTempView("t") // A query using both Spark SQL native `max` and Hive `max` sql(s"SELECT max(id), hive_max(id) FROM t").explain() ``` Before this PR: ``` == Physical Plan == SortAggregate(key=[], functions=[max(id#1L), default.hive_max(default.hive_max, HiveFunctionWrapper(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax,org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax7475f57e), id#1L, false, 0, 0)]) +- Exchange SinglePartition +- *Range (0, 100, step=1, splits=Some(1)) ``` After this PR: ``` == Physical Plan == SortAggregate(key=[], functions=[max(id#1L), default.hive_max(default.hive_max, HiveFunctionWrapper(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax,org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax5e18a6a7), id#1L, false, 0, 0)]) +- Exchange SinglePartition +- SortAggregate(key=[], functions=[partial_max(id#1L), partial_default.hive_max(default.hive_max, HiveFunctionWrapper(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax,org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax5e18a6a7), id#1L, false, 0, 0)]) +- *Range (0, 100, step=1, splits=Some(1)) ``` The tricky part of the PR is mostly about updating and passing around aggregation states of `HiveUDAFFunction`s since the aggregation state of a Hive UDAF may appear in three different forms. Let's take a look at the testing `MockUDAF` added in this PR as an example. This UDAF computes the count of non-null values together with the count of nulls of a given column. Its aggregation state may appear as the following forms at different time: 1. A `MockUDAFBuffer`, which is a concrete subclass of `GenericUDAFEvaluator.AggregationBuffer` The form used by Hive UDAF API. This form is required by the following scenarios: - Calling `GenericUDAFEvaluator.iterate()` to update an existing aggregation state with new input values. - Calling `GenericUDAFEvaluator.terminate()` to get the final aggregated value from an existing aggregation state. - Calling `GenericUDAFEvaluator.merge()` to merge other aggregation states into an existing aggregation state. The existing aggregation state to be updated must be in this form. Conversions: - To form 2: `GenericUDAFEvaluator.terminatePartial()` - To form 3: Convert to form 2 first, and then to 3. 2. An `Object[]` array containing two `java.lang.Long` values. The form used to interact with Hive's `ObjectInspector`s. This form is required by the following scenarios: - Calling `GenericUDAFEvaluator.terminatePartial()` to convert an existing aggregation state in form 1 to form 2. - Calling `GenericUDAFEvaluator.merge()` to merge other aggregation states into an existing aggregation state. The input aggregation state must be in this form. Conversions: - To form 1: No direct method. Have to create an empty `AggregationBuffer` and merge it into the empty buffer. 
- To form 3: `unwrapperFor()`/`unwrap()` method of `HiveInspectors` 3. The byte array that holds data of an `UnsafeRow` with two `LongType` fields. The form used by Spark SQL to shuffle partial aggregation results. This form is required because `TypedImperativeAggregate` always asks its subclasses to serialize their aggregation states into a byte array. Conversions: - To form 1: Convert to form 2 first, and then to 1. - To form 2: `wrapperFor()`/`wrap()` method of `HiveInspectors` Here're some micro-benchmark results produced by the most recent master and this PR branch. Master: ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.10.5 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz hive udaf vs spark af: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative w/o groupBy339 / 372 3.1 323.2 1.0X w/ groupBy 503 / 529 2.1 479.7 0.7X ``` This PR: ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.10.5 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz hive udaf vs spark af: Best/Avg Time(ms)Rate
spark git commit: [SPARK-18379][SQL] Make the parallelism of parallelPartitionDiscovery configurable.
Repository: spark Updated Branches: refs/heads/master f14ae4900 -> 745ab8bc5 [SPARK-18379][SQL] Make the parallelism of parallelPartitionDiscovery configurable. ## What changes were proposed in this pull request? The largest parallelism in PartitioningAwareFileIndex#listLeafFilesInParallel() is hard-coded to 1. We may need to make this number configurable. In this PR, I reduce it to 100. ## How was this patch tested? Existing unit tests. Author: genmao.ygm Author: dylon Closes #15829 from uncleGen/SPARK-18379. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/745ab8bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/745ab8bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/745ab8bc Branch: refs/heads/master Commit: 745ab8bc50da89c42b297de9dcb833e5f2074481 Parents: f14ae49 Author: genmao.ygm Authored: Tue Nov 15 10:32:43 2016 -0800 Committer: Yin Huai Committed: Tue Nov 15 10:32:43 2016 -0800 -- .../datasources/PartitioningAwareFileIndex.scala | 4 +++- .../scala/org/apache/spark/sql/internal/SQLConf.scala| 11 +++ 2 files changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/745ab8bc/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index 3740caa..705a1e3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -315,10 +315,12 @@ object PartitioningAwareFileIndex extends Logging { val sparkContext = sparkSession.sparkContext val serializableConfiguration = new SerializableConfiguration(hadoopConf) val serializedPaths = paths.map(_.toString) +val parallelPartitionDiscoveryParallelism = + sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism // Set the number of parallelism to prevent following file listing from generating many tasks // in case of large #defaultParallelism. -val numParallelism = Math.min(paths.size, 1) +val numParallelism = Math.min(paths.size, parallelPartitionDiscoveryParallelism) val statusMap = sparkContext .parallelize(serializedPaths, numParallelism) http://git-wip-us.apache.org/repos/asf/spark/blob/745ab8bc/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 71f3a67..6372936 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -396,6 +396,14 @@ object SQLConf { .intConf .createWithDefault(32) + val PARALLEL_PARTITION_DISCOVERY_PARALLELISM = + SQLConfigBuilder("spark.sql.sources.parallelPartitionDiscovery.parallelism") + .doc("The number of parallelism to list a collection of path recursively, Set the " + +"number to prevent file listing from generating too many tasks.") + .internal() + .intConf + .createWithDefault(1) + // Whether to automatically resolve ambiguity in join conditions for self-joins. // See SPARK-6231. 
val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY = @@ -774,6 +782,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def parallelPartitionDiscoveryThreshold: Int = getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD) + def parallelPartitionDiscoveryParallelism: Int = +getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_PARALLELISM) + def bucketingEnabled: Boolean = getConf(SQLConf.BUCKETING_ENABLED) def dataFrameSelfJoinAutoResolveAmbiguity: Boolean = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
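A minimal usage sketch of the new knob (the config key is taken from the diff above; the `spark` session and the data path are assumptions):

```scala
// Cap the parallelism of the distributed listing job used for partition
// discovery instead of relying on the previously hard-coded bound.
spark.conf.set("spark.sql.sources.parallelPartitionDiscovery.parallelism", "100")

// Reading a large partitioned dataset may now trigger the parallel listing
// with at most 100 tasks (or fewer, if there are fewer paths to list).
val df = spark.read.parquet("/data/events")   // hypothetical partitioned path
df.count()
```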
spark git commit: [SPARK-18368][SQL] Fix regexp replace when serialized
Repository: spark Updated Branches: refs/heads/branch-2.0 c8628e877 -> 6e7310590 [SPARK-18368][SQL] Fix regexp replace when serialized ## What changes were proposed in this pull request? This makes the result value both transient and lazy, so that if the RegExpReplace object is initialized then serialized, `result: StringBuffer` will be correctly initialized. ## How was this patch tested? * Verified that this patch fixed the query that found the bug. * Added a test case that fails without the fix. Author: Ryan Blue Closes #15834 from rdblue/SPARK-18368-fix-regexp-replace. (cherry picked from commit d4028de97687385fa1d1eb6301eb544c0ea4a135) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e731059 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e731059 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e731059 Branch: refs/heads/branch-2.0 Commit: 6e73105904a7bae0f7c9b1bebcb83d5ba8265956 Parents: c8628e8 Author: Ryan Blue Authored: Wed Nov 9 11:00:53 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 11:01:45 2016 -0800 -- .../catalyst/expressions/regexpExpressions.scala| 2 +- .../expressions/RegexpExpressionsSuite.scala| 16 +++- 2 files changed, 16 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e731059/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index d25da3f..f6a55cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -220,7 +220,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private val result: StringBuffer = new StringBuffer + @transient private lazy val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/6e731059/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 5299549..d0d1aaa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.StringType @@ -191,4 +192,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSplit(s1, s2), null, row3) } + test("RegExpReplace serialization") { +val serializer = new JavaSerializer(new SparkConf()).newInstance + 
+val row = create_row("abc", "b", "") + +val s = 's.string.at(0) +val p = 'p.string.at(1) +val r = 'r.string.at(2) + +val expr: RegExpReplace = serializer.deserialize(serializer.serialize(RegExpReplace(s, p, r))) +checkEvaluation(expr, "ac", row) + } + } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
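The fix hinges on the difference between a plain `@transient val` and a `@transient lazy val` under Java serialization. Below is a minimal, self-contained sketch of that difference; it is a standalone illustration with made-up class names, not Spark code.

```scala
import java.io._

class Buffers extends Serializable {
  // Comes back as null after deserialization, like the old `result` field.
  @transient val eager: StringBuffer = new StringBuffer
  // Re-created on first access after deserialization, like the fixed field.
  @transient lazy val reinitialized: StringBuffer = new StringBuffer
}

object TransientLazyDemo {
  // Round-trip an object through Java serialization, as a task closure would be.
  def roundTrip[T](obj: T): T = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(obj)
    out.close()
    new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
      .readObject().asInstanceOf[T]
  }

  def main(args: Array[String]): Unit = {
    val copy = roundTrip(new Buffers)
    println(copy.eager)          // null; appending to it would throw an NPE
    println(copy.reinitialized)  // an empty StringBuffer, safe to use
  }
}
```

The same reasoning applies to the identical cherry-picks of this fix on the other branches below.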
spark git commit: [SPARK-18368][SQL] Fix regexp replace when serialized
Repository: spark Updated Branches: refs/heads/branch-2.1 626f6d6d4 -> 80f58510a [SPARK-18368][SQL] Fix regexp replace when serialized ## What changes were proposed in this pull request? This makes the result value both transient and lazy, so that if the RegExpReplace object is initialized then serialized, `result: StringBuffer` will be correctly initialized. ## How was this patch tested? * Verified that this patch fixed the query that found the bug. * Added a test case that fails without the fix. Author: Ryan Blue Closes #15834 from rdblue/SPARK-18368-fix-regexp-replace. (cherry picked from commit d4028de97687385fa1d1eb6301eb544c0ea4a135) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80f58510 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80f58510 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80f58510 Branch: refs/heads/branch-2.1 Commit: 80f58510a7a3e039eecf875f02a115c0fd166f55 Parents: 626f6d6 Author: Ryan Blue Authored: Wed Nov 9 11:00:53 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 11:01:24 2016 -0800 -- .../catalyst/expressions/regexpExpressions.scala| 2 +- .../expressions/RegexpExpressionsSuite.scala| 16 +++- 2 files changed, 16 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80f58510/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5648ad6..4896a62 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -230,7 +230,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private val result: StringBuffer = new StringBuffer + @transient private lazy val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/80f58510/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 5299549..d0d1aaa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.StringType @@ -191,4 +192,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSplit(s1, s2), null, row3) } + test("RegExpReplace serialization") { +val serializer = new JavaSerializer(new SparkConf()).newInstance + 
+val row = create_row("abc", "b", "") + +val s = 's.string.at(0) +val p = 'p.string.at(1) +val r = 'r.string.at(2) + +val expr: RegExpReplace = serializer.deserialize(serializer.serialize(RegExpReplace(s, p, r))) +checkEvaluation(expr, "ac", row) + } + } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18368][SQL] Fix regexp replace when serialized
Repository: spark Updated Branches: refs/heads/master 47636618a -> d4028de97 [SPARK-18368][SQL] Fix regexp replace when serialized ## What changes were proposed in this pull request? This makes the result value both transient and lazy, so that if the RegExpReplace object is initialized then serialized, `result: StringBuffer` will be correctly initialized. ## How was this patch tested? * Verified that this patch fixed the query that found the bug. * Added a test case that fails without the fix. Author: Ryan Blue Closes #15834 from rdblue/SPARK-18368-fix-regexp-replace. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4028de9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4028de9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4028de9 Branch: refs/heads/master Commit: d4028de97687385fa1d1eb6301eb544c0ea4a135 Parents: 4763661 Author: Ryan Blue Authored: Wed Nov 9 11:00:53 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 11:00:53 2016 -0800 -- .../catalyst/expressions/regexpExpressions.scala| 2 +- .../expressions/RegexpExpressionsSuite.scala| 16 +++- 2 files changed, 16 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4028de9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5648ad6..4896a62 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -230,7 +230,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private val result: StringBuffer = new StringBuffer + @transient private lazy val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/d4028de9/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 5299549..d0d1aaa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.StringType @@ -191,4 +192,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSplit(s1, s2), null, row3) } + test("RegExpReplace serialization") { +val serializer = new JavaSerializer(new SparkConf()).newInstance + +val row = create_row("abc", "b", "") + +val s = 's.string.at(0) +val p = 'p.string.at(1) +val r = 
'r.string.at(2) + +val expr: RegExpReplace = serializer.deserialize(serializer.serialize(RegExpReplace(s, p, r))) +checkEvaluation(expr, "ac", row) + } + } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-18368] Fix regexp_replace with task serialization."
Repository: spark Updated Branches: refs/heads/branch-2.0 bdddc661b -> c8628e877 Revert "[SPARK-18368] Fix regexp_replace with task serialization." This reverts commit b9192bb3ffc319ebee7dbd15c24656795e454749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c8628e87 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c8628e87 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c8628e87 Branch: refs/heads/branch-2.0 Commit: c8628e877fb050e3994a24cf70854f49f2188555 Parents: bdddc66 Author: Yin Huai Authored: Wed Nov 9 10:47:29 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 10:49:32 2016 -0800 -- .../sql/catalyst/expressions/regexpExpressions.scala | 2 +- .../catalyst/expressions/ExpressionEvalHelper.scala | 15 ++- 2 files changed, 7 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c8628e87/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index f6a55cf..d25da3f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -220,7 +220,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private lazy val result: StringBuffer = new StringBuffer + @transient private val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/c8628e87/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 186079f..668543a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -21,8 +21,7 @@ import org.scalacheck.Gen import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.prop.GeneratorDrivenPropertyChecks -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer @@ -43,15 +42,13 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { protected def checkEvaluation( expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { -val serializer = new JavaSerializer(new SparkConf()).newInstance -val expr: Expression = serializer.deserialize(serializer.serialize(expression)) val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) -checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) -checkEvaluationWithGeneratedMutableProjection(expr, 
catalystValue, inputRow) -if (GenerateUnsafeProjection.canSupport(expr.dataType)) { - checkEvalutionWithUnsafeProjection(expr, catalystValue, inputRow) +checkEvaluationWithoutCodegen(expression, catalystValue, inputRow) +checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow) +if (GenerateUnsafeProjection.canSupport(expression.dataType)) { + checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow) } -checkEvaluationWithOptimization(expr, catalystValue, inputRow) +checkEvaluationWithOptimization(expression, catalystValue, inputRow) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-18368] Fix regexp_replace with task serialization."
Repository: spark Updated Branches: refs/heads/branch-2.1 5bd31dc9d -> 626f6d6d4 Revert "[SPARK-18368] Fix regexp_replace with task serialization." This reverts commit b9192bb3ffc319ebee7dbd15c24656795e454749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/626f6d6d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/626f6d6d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/626f6d6d Branch: refs/heads/branch-2.1 Commit: 626f6d6d4f297fd67cfec017a790d79ddad41d70 Parents: 5bd31dc Author: Yin Huai Authored: Wed Nov 9 10:47:29 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 10:48:59 2016 -0800 -- .../sql/catalyst/expressions/regexpExpressions.scala | 2 +- .../catalyst/expressions/ExpressionEvalHelper.scala | 15 ++- 2 files changed, 7 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/626f6d6d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4896a62..5648ad6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -230,7 +230,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private lazy val result: StringBuffer = new StringBuffer + @transient private val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/626f6d6d/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index f836504..9ceb709 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -22,8 +22,7 @@ import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.exceptions.TestFailedException import org.scalatest.prop.GeneratorDrivenPropertyChecks -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer @@ -44,15 +43,13 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { protected def checkEvaluation( expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { -val serializer = new JavaSerializer(new SparkConf()).newInstance -val expr: Expression = serializer.deserialize(serializer.serialize(expression)) val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) -checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) 
-checkEvaluationWithGeneratedMutableProjection(expr, catalystValue, inputRow) -if (GenerateUnsafeProjection.canSupport(expr.dataType)) { - checkEvalutionWithUnsafeProjection(expr, catalystValue, inputRow) +checkEvaluationWithoutCodegen(expression, catalystValue, inputRow) +checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow) +if (GenerateUnsafeProjection.canSupport(expression.dataType)) { + checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow) } -checkEvaluationWithOptimization(expr, catalystValue, inputRow) +checkEvaluationWithOptimization(expression, catalystValue, inputRow) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-18368] Fix regexp_replace with task serialization."
Repository: spark Updated Branches: refs/heads/master 06a13ecca -> 47636618a Revert "[SPARK-18368] Fix regexp_replace with task serialization." This reverts commit b9192bb3ffc319ebee7dbd15c24656795e454749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47636618 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47636618 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47636618 Branch: refs/heads/master Commit: 47636618a5c4c7f426e15b93d862a11088cf1fa6 Parents: 06a13ec Author: Yin Huai Authored: Wed Nov 9 10:47:29 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 10:47:29 2016 -0800 -- .../sql/catalyst/expressions/regexpExpressions.scala | 2 +- .../catalyst/expressions/ExpressionEvalHelper.scala | 15 ++- 2 files changed, 7 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/47636618/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4896a62..5648ad6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -230,7 +230,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio @transient private var lastReplacement: String = _ @transient private var lastReplacementInUTF8: UTF8String = _ // result buffer write by Matcher - @transient private lazy val result: StringBuffer = new StringBuffer + @transient private val result: StringBuffer = new StringBuffer override def nullSafeEval(s: Any, p: Any, r: Any): Any = { if (!p.equals(lastRegex)) { http://git-wip-us.apache.org/repos/asf/spark/blob/47636618/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index f836504..9ceb709 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -22,8 +22,7 @@ import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.exceptions.TestFailedException import org.scalatest.prop.GeneratorDrivenPropertyChecks -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer @@ -44,15 +43,13 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { protected def checkEvaluation( expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { -val serializer = new JavaSerializer(new SparkConf()).newInstance -val expr: Expression = serializer.deserialize(serializer.serialize(expression)) val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) -checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) 
-checkEvaluationWithGeneratedMutableProjection(expr, catalystValue, inputRow) -if (GenerateUnsafeProjection.canSupport(expr.dataType)) { - checkEvalutionWithUnsafeProjection(expr, catalystValue, inputRow) +checkEvaluationWithoutCodegen(expression, catalystValue, inputRow) +checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow) +if (GenerateUnsafeProjection.canSupport(expression.dataType)) { + checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow) } -checkEvaluationWithOptimization(expr, catalystValue, inputRow) +checkEvaluationWithOptimization(expression, catalystValue, inputRow) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18338][SQL][TEST-MAVEN] Fix test case initialization order under Maven builds
Repository: spark Updated Branches: refs/heads/master 02c5325b8 -> 205e6d586 [SPARK-18338][SQL][TEST-MAVEN] Fix test case initialization order under Maven builds ## What changes were proposed in this pull request? Test case initialization order under Maven and SBT are different. Maven always creates instances of all test cases and then run them all together. This fails `ObjectHashAggregateSuite` because the randomized test cases there register a temporary Hive function right before creating a test case, and can be cleared while initializing other successive test cases. In SBT, this is fine since the created test case is executed immediately after creating the temporary function. To fix this issue, we should put initialization/destruction code into `beforeAll()` and `afterAll()`. ## How was this patch tested? Existing tests. Author: Cheng Lian Closes #15802 from liancheng/fix-flaky-object-hash-agg-suite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/205e6d58 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/205e6d58 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/205e6d58 Branch: refs/heads/master Commit: 205e6d5867b180a85bad58035c917ca13552a0a5 Parents: 02c5325 Author: Cheng Lian Authored: Wed Nov 9 09:49:02 2016 -0800 Committer: Yin Huai Committed: Wed Nov 9 09:49:02 2016 -0800 -- .../execution/ObjectHashAggregateSuite.scala| 23 +--- 1 file changed, 10 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/205e6d58/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala index 527626b..93fc5e8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala @@ -25,11 +25,10 @@ import org.scalatest.Matchers._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedFunction -import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, ExpressionInfo, Literal} +import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.HiveSessionCatalog import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils @@ -43,6 +42,14 @@ class ObjectHashAggregateSuite import testImplicits._ + protected override def beforeAll(): Unit = { +sql(s"CREATE TEMPORARY FUNCTION hive_max AS '${classOf[GenericUDAFMax].getName}'") + } + + protected override def afterAll(): Unit = { +sql(s"DROP TEMPORARY FUNCTION IF EXISTS hive_max") + } + test("typed_count without grouping keys") { val df = Seq((1: Integer, 2), (null, 2), (3: Integer, 4)).toDF("a", "b") @@ -199,10 +206,7 @@ class ObjectHashAggregateSuite val typed = percentile_approx($"c0", 0.5) // A Hive UDAF without partial aggregation support -val withoutPartial = { - registerHiveFunction("hive_max", classOf[GenericUDAFMax]) - 
function("hive_max", $"c1") -} +val withoutPartial = function("hive_max", $"c1") // A Spark SQL native aggregate function with partial aggregation support that can be executed // by the Tungsten `HashAggregateExec` @@ -420,13 +424,6 @@ class ObjectHashAggregateSuite } } - private def registerHiveFunction(functionName: String, clazz: Class[_]): Unit = { -val sessionCatalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog] -val builder = sessionCatalog.makeFunctionBuilder(functionName, clazz.getName) -val info = new ExpressionInfo(clazz.getName, functionName) -sessionCatalog.createTempFunction(functionName, info, builder, ignoreIfExists = false) - } - private def function(name: String, args: Column*): Column = { Column(UnresolvedFunction(FunctionIdentifier(name), args.map(_.expr), isDistinct = false)) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18256] Improve the performance of event log replay in HistoryServer
Repository: spark Updated Branches: refs/heads/master 4cee2ce25 -> 0e3312ee7 [SPARK-18256] Improve the performance of event log replay in HistoryServer ## What changes were proposed in this pull request? This patch significantly improves the performance of event log replay in the HistoryServer via two simple changes: - **Don't use `extractOpt`**: it turns out that `json4s`'s `extractOpt` method uses exceptions for control flow, causing huge performance bottlenecks due to the overhead of initializing exceptions. To avoid this overhead, we can simply use our own `Utils.jsonOption` method. This patch replaces all uses of `extractOpt` with `Utils.jsonOption` and adds a style checker rule to ban the use of the slow `extractOpt` method. - **Don't call `Utils.getFormattedClassName` for every event**: the old code called `Utils.getFormattedClassName` dozens of times per replayed event in order to match up class names in events with SparkListener event names. By simply storing the results of these calls in constants rather than recomputing them, we're able to eliminate a huge performance hotspot by removing thousands of expensive `Class.getSimpleName` calls. ## How was this patch tested? Tested by profiling the replay of a long event log using YourKit. For an event log containing 1000+ jobs, each of which had thousands of tasks, the changes in this patch cut the replay time in half: ![image](https://cloud.githubusercontent.com/assets/50748/19980953/31154622-a1bd-11e6-9be4-21fbb9b3f9a7.png) Prior to this patch's changes, the two slowest methods in log replay were internal exceptions thrown by `Json4S` and calls to `Class.getSimpleName()`: ![image](https://cloud.githubusercontent.com/assets/50748/19981052/87416cce-a1bd-11e6-9f25-06a7cd391822.png) After this patch, these hotspots are completely eliminated. Author: Josh Rosen Closes #15756 from JoshRosen/speed-up-jsonprotocol. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0e3312ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0e3312ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0e3312ee Branch: refs/heads/master Commit: 0e3312ee72c44f4c9acafbd80d0c8a14f3aff875 Parents: 4cee2ce Author: Josh Rosen Authored: Fri Nov 4 19:32:26 2016 -0700 Committer: Yin Huai Committed: Fri Nov 4 19:32:26 2016 -0700 -- .../org/apache/spark/util/JsonProtocol.scala| 106 +++ scalastyle-config.xml | 6 ++ 2 files changed, 70 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0e3312ee/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index c11eb3f..6593aab 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -107,20 +107,20 @@ private[spark] object JsonProtocol { def stageSubmittedToJson(stageSubmitted: SparkListenerStageSubmitted): JValue = { val stageInfo = stageInfoToJson(stageSubmitted.stageInfo) val properties = propertiesToJson(stageSubmitted.properties) -("Event" -> Utils.getFormattedClassName(stageSubmitted)) ~ +("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageSubmitted) ~ ("Stage Info" -> stageInfo) ~ ("Properties" -> properties) } def stageCompletedToJson(stageCompleted: SparkListenerStageCompleted): JValue = { val stageInfo = stageInfoToJson(stageCompleted.stageInfo) -("Event" -> Utils.getFormattedClassName(stageCompleted)) ~ +("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageCompleted) ~ ("Stage Info" -> stageInfo) } def taskStartToJson(taskStart: SparkListenerTaskStart): JValue = { val taskInfo = taskStart.taskInfo -("Event" -> Utils.getFormattedClassName(taskStart)) ~ +("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskStart) ~ ("Stage ID" -> taskStart.stageId) ~ ("Stage Attempt ID" -> taskStart.stageAttemptId) ~ ("Task Info" -> taskInfoToJson(taskInfo)) @@ -128,7 +128,7 @@ private[spark] object JsonProtocol { def taskGettingResultToJson(taskGettingResult: SparkListenerTaskGettingResult): JValue = { val taskInfo = taskGettingResult.taskInfo -("Event" -> Utils.getFormattedClassName(taskGettingResult)) ~ +("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskGettingResult) ~ ("Task Info" -> taskInfoToJson(taskInfo)) } @@ -137,7 +137,7 @@ private[spark] object JsonProtocol { val taskInfo = taskEnd.taskInfo val taskMetrics = taskEnd.taskMetrics val taskMetricsJson =
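For the `extractOpt` point above, the exception-free alternative is essentially a pattern match on the json4s AST. The sketch below is illustrative: `jsonOption` mirrors the behaviour described for `Utils.jsonOption` but is not a copy of Spark's code, and the sample event JSON is made up.

```scala
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

// JNothing means the field is absent; anything else is a present value.
def jsonOption(json: JValue): Option[JValue] = json match {
  case JNothing => None
  case value => Some(value)
}

val event = parse("""{"Event": "SparkListenerStageSubmitted", "Stage ID": 3}""")

// No MappingException is ever constructed on a miss, unlike extractOpt.
val eventName: Option[String] =
  jsonOption(event \ "Event").collect { case JString(s) => s }
val missing: Option[JValue] = jsonOption(event \ "No Such Field") // None
```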
spark git commit: [SPARK-18167] Re-enable the non-flaky parts of SQLQuerySuite
Repository: spark Updated Branches: refs/heads/branch-2.1 e51978c3d -> 0a303a694 [SPARK-18167] Re-enable the non-flaky parts of SQLQuerySuite ## What changes were proposed in this pull request? It seems the proximate cause of the test failures is that `cast(str as decimal)` in derby will raise an exception instead of returning NULL. This is a problem since Hive sometimes inserts `__HIVE_DEFAULT_PARTITION__` entries into the partition table as documented here: https://github.com/apache/hive/blob/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java#L1034 Basically, when these special default partitions are present, partition pruning pushdown using the SQL-direct mode will fail due this cast exception. As commented on in `MetaStoreDirectSql.java` above, this is normally fine since Hive falls back to JDO pruning, however when the pruning predicate contains an unsupported operator such as `>`, that will fail as well. The only remaining question is why this behavior is nondeterministic. We know that when the test flakes, retries do not help, therefore the cause must be environmental. The current best hypothesis is that some config is different between different jenkins runs, which is why this PR prints out the Spark SQL and Hive confs for the test. The hope is that by comparing the config state for failure vs success we can isolate the root cause of the flakiness. **Update:** we could not isolate the issue. It does not seem to be due to configuration differences. As such, I'm going to enable the non-flaky parts of the test since we are fairly confident these issues only occur with Derby (which is not used in production). ## How was this patch tested? N/A Author: Eric Liang Closes #15725 from ericl/print-confs-out. (cherry picked from commit 4cee2ce251110218e68c0f8f30363ec2f2498bea) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a303a69 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a303a69 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a303a69 Branch: refs/heads/branch-2.1 Commit: 0a303a6948a3224070fc16516e0cc0a84df6df7f Parents: e51978c Author: Eric Liang Authored: Fri Nov 4 15:54:28 2016 -0700 Committer: Yin Huai Committed: Fri Nov 4 15:54:52 2016 -0700 -- .../sql/hive/execution/SQLQuerySuite.scala | 31 +++- 1 file changed, 10 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a303a69/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ad70835..cc09aef 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1569,27 +1569,16 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } - ignore("SPARK-10562: partition by column with mixed case name") { -def runOnce() { - withTable("tbl10562") { -val df = Seq(2012 -> "a").toDF("Year", "val") -df.write.partitionBy("Year").saveAsTable("tbl10562") -checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 
2015"), Nil) -checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) - } -} -try { - runOnce() -} catch { - case t: Throwable => -// Retry to gather more test data. TODO(ekl) revert this once we deflake this test. -runOnce() -runOnce() -runOnce() -throw t + test("SPARK-10562: partition by column with mixed case name") { +withTable("tbl10562") { + val df = Seq(2012 -> "a").toDF("Year", "val") + df.write.partitionBy("Year").saveAsTable("tbl10562") + checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) + checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) + checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) +// TODO(ekl) this is causing test flakes [SPARK-18167], but we think the issue is derby specific +// checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) + checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) } } - To unsubsc
spark git commit: [SPARK-18167] Re-enable the non-flaky parts of SQLQuerySuite
Repository: spark Updated Branches: refs/heads/master 550cd56e8 -> 4cee2ce25 [SPARK-18167] Re-enable the non-flaky parts of SQLQuerySuite ## What changes were proposed in this pull request? It seems the proximate cause of the test failures is that `cast(str as decimal)` in derby will raise an exception instead of returning NULL. This is a problem since Hive sometimes inserts `__HIVE_DEFAULT_PARTITION__` entries into the partition table as documented here: https://github.com/apache/hive/blob/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java#L1034 Basically, when these special default partitions are present, partition pruning pushdown using the SQL-direct mode will fail due this cast exception. As commented on in `MetaStoreDirectSql.java` above, this is normally fine since Hive falls back to JDO pruning, however when the pruning predicate contains an unsupported operator such as `>`, that will fail as well. The only remaining question is why this behavior is nondeterministic. We know that when the test flakes, retries do not help, therefore the cause must be environmental. The current best hypothesis is that some config is different between different jenkins runs, which is why this PR prints out the Spark SQL and Hive confs for the test. The hope is that by comparing the config state for failure vs success we can isolate the root cause of the flakiness. **Update:** we could not isolate the issue. It does not seem to be due to configuration differences. As such, I'm going to enable the non-flaky parts of the test since we are fairly confident these issues only occur with Derby (which is not used in production). ## How was this patch tested? N/A Author: Eric Liang Closes #15725 from ericl/print-confs-out. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4cee2ce2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4cee2ce2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4cee2ce2 Branch: refs/heads/master Commit: 4cee2ce251110218e68c0f8f30363ec2f2498bea Parents: 550cd56 Author: Eric Liang Authored: Fri Nov 4 15:54:28 2016 -0700 Committer: Yin Huai Committed: Fri Nov 4 15:54:28 2016 -0700 -- .../sql/hive/execution/SQLQuerySuite.scala | 31 +++- 1 file changed, 10 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4cee2ce2/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ad70835..cc09aef 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1569,27 +1569,16 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } - ignore("SPARK-10562: partition by column with mixed case name") { -def runOnce() { - withTable("tbl10562") { -val df = Seq(2012 -> "a").toDF("Year", "val") -df.write.partitionBy("Year").saveAsTable("tbl10562") -checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) -checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) -checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) - } -} -try { - 
runOnce() -} catch { - case t: Throwable => -// Retry to gather more test data. TODO(ekl) revert this once we deflake this test. -runOnce() -runOnce() -runOnce() -throw t + test("SPARK-10562: partition by column with mixed case name") { +withTable("tbl10562") { + val df = Seq(2012 -> "a").toDF("Year", "val") + df.write.partitionBy("Year").saveAsTable("tbl10562") + checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) + checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) + checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) +// TODO(ekl) this is causing test flakes [SPARK-18167], but we think the issue is derby specific +// checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) + checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.
spark git commit: [SPARK-17949][SQL] A JVM object based aggregate operator
Repository: spark Updated Branches: refs/heads/master 66a99f4a4 -> 27daf6bcd [SPARK-17949][SQL] A JVM object based aggregate operator ## What changes were proposed in this pull request? This PR adds a new hash-based aggregate operator named `ObjectHashAggregateExec` that supports `TypedImperativeAggregate`, which may use arbitrary Java objects as aggregation states. Please refer to the [design doc](https://issues.apache.org/jira/secure/attachment/12834260/%5BDesign%20Doc%5D%20Support%20for%20Arbitrary%20Aggregation%20States.pdf) attached in [SPARK-17949](https://issues.apache.org/jira/browse/SPARK-17949) for more details about it. The major benefit of this operator is better performance when evaluating `TypedImperativeAggregate` functions, especially when there are relatively few distinct groups. Functions like Hive UDAFs, `collect_list`, and `collect_set` may also benefit from this after being migrated to `TypedImperativeAggregate`. The following feature flag is introduced to enable or disable the new aggregate operator: - Name: `spark.sql.execution.useObjectHashAggregateExec` - Default value: `true` We can also configure the fallback threshold using the following SQL operation: - Name: `spark.sql.objectHashAggregate.sortBased.fallbackThreshold` - Default value: 128 Fallback to sort-based aggregation when more than 128 distinct groups are accumulated in the aggregation hash map. This number is intentionally made small to avoid GC problems since aggregation buffers of this operator may contain arbitrary Java objects. This may be improved by implementing size tracking for this operator, but that can be done in a separate PR. Code generation and size tracking are planned to be implemented in follow-up PRs. ## Benchmark results ### `ObjectHashAggregateExec` vs `SortAggregateExec` The first benchmark compares `ObjectHashAggregateExec` and `SortAggregateExec` by evaluating `typed_count`, a testing `TypedImperativeAggregate` version of the SQL `count` function. ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.10.5 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz object agg v.s. sort agg:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative sort agg w/ group by31251 / 31908 3.4 298.0 1.0X object agg w/ group by w/o fallback 6903 / 7141 15.2 65.8 4.5X object agg w/ group by w/ fallback 20945 / 21613 5.0 199.7 1.5X sort agg w/o group by 4734 / 5463 22.1 45.2 6.6X object agg w/o group by w/o fallback 4310 / 4529 24.3 41.1 7.3X ``` The next benchmark compares `ObjectHashAggregateExec` and `SortAggregateExec` by evaluating the Spark native version of `percentile_approx`. Note that `percentile_approx` is so heavy an aggregate function that the bottleneck of the benchmark is evaluating the aggregate function itself rather than the aggregate operator since I couldn't run a large scale benchmark on my laptop. That's why the results are so close and looks counter-intuitive (aggregation with grouping is even faster than that aggregation without grouping). ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.10.5 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz object agg v.s. 
sort agg:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative sort agg w/ group by 3418 / 3530 0.6 1630.0 1.0X object agg w/ group by w/o fallback 3210 / 3314 0.7 1530.7 1.1X object agg w/ group by w/ fallback3419 / 3511 0.6 1630.1 1.0X sort agg w/o group by 4336 / 4499 0.5 2067.3 0.8X object agg w/o group by w/o fallback 4271 / 4372 0.5 2036.7 0.8X ``` ### Hive UDAF vs Spark AF This benchmark compares the following two kinds of aggregate functions: - "hive udaf": Hive implementation of `percentile_approx`, without partial aggregation supports, evaluated using `SortAggregateExec`. - "spark af": Spark native implementation of `percentile_approx`, with partial aggregation support, evaluated using `ObjectHashAggregateExec` The performance differences are mostly due to faster implementation and partial aggregation support in the Spark native version of `percentile_approx`. This benchmark basically shows the performance differences between the worst case, where an aggregate function without partial aggregation support is evaluated using `SortAggregateExec`, and the best case, where a `TypedImperativeAggregate` with partial aggregation support is evaluated using `ObjectHashAggre
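A minimal usage sketch of the knobs introduced above (the config keys are quoted from the description; the `spark` session and the toy query are assumptions):

```scala
import org.apache.spark.sql.functions.expr

// The new operator is on by default; lowering the fallback threshold makes the
// switch to sort-based aggregation easy to observe while experimenting.
spark.conf.set("spark.sql.execution.useObjectHashAggregateExec", "true")
spark.conf.set("spark.sql.objectHashAggregate.sortBased.fallbackThreshold", "128")

// percentile_approx is a TypedImperativeAggregate, so with the flag enabled its
// partial aggregation can run through ObjectHashAggregateExec.
spark.range(1000)
  .selectExpr("id % 10 AS g", "id")
  .groupBy("g")
  .agg(expr("percentile_approx(id, 0.5)"))
  .explain()
```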
spark git commit: [SPARK-17470][SQL] unify path for data source table and locationUri for hive serde table
Repository: spark Updated Branches: refs/heads/branch-2.1 2aff2ea81 -> 5ea2f9e5e [SPARK-17470][SQL] unify path for data source table and locationUri for hive serde table ## What changes were proposed in this pull request? Due to a limitation of the hive metastore (a table location must be a directory path, not a file path), we always store the `path` for a data source table in its storage properties instead of the `locationUri` field. However, we should not expose this difference at the `CatalogTable` level, but just treat it as a hack in `HiveExternalCatalog`, just as we store the table schema of data source tables in table properties. This PR unifies `path` and `locationUri` outside of `HiveExternalCatalog`: both data source tables and hive serde tables should use the `locationUri` field. This PR also unifies the way we handle the default table location for managed tables. Previously, the default table location of a hive serde managed table was set by the external catalog, but that of a data source table was set by the command. After this PR, we follow the hive way and the default table location is always set by the external catalog. For managed non-file-based tables, we will assign a default table location and create an empty directory for it; the table location will be removed when the table is dropped. This is reasonable, as the metastore doesn't care whether a table is file-based or not, and an empty table directory does no harm. For external non-file-based tables, ideally we can omit the table location, but due to a hive metastore issue, we will assign a random location to it and remove it right after the table is created. See SPARK-15269 for more details. This is fine as it's well isolated in `HiveExternalCatalog`. To keep the existing behaviour of the `path` option, in this PR we always add the `locationUri` to storage properties under the key `path` before passing storage properties to `DataSource` as data source options. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15024 from cloud-fan/path.
(cherry picked from commit 3a1bc6f4780f8384c1211b1335e7394a4a28377e) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ea2f9e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ea2f9e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ea2f9e5 Branch: refs/heads/branch-2.1 Commit: 5ea2f9e5e449c02f77635918bfcc7ba7193c97a2 Parents: 2aff2ea Author: Wenchen Fan Authored: Wed Nov 2 18:05:14 2016 -0700 Committer: Yin Huai Committed: Wed Nov 2 18:05:29 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 40 ++- .../org/apache/spark/sql/DataFrameWriter.scala | 5 +- .../spark/sql/execution/SparkSqlParser.scala| 17 +- .../command/createDataSourceTables.scala| 37 +-- .../spark/sql/execution/command/ddl.scala | 23 +- .../spark/sql/execution/command/tables.scala| 50 ++-- .../sql/execution/datasources/DataSource.scala | 241 ++- .../datasources/DataSourceStrategy.scala| 3 +- .../apache/spark/sql/internal/CatalogImpl.scala | 4 +- .../spark/sql/execution/command/DDLSuite.scala | 1 - .../spark/sql/sources/PathOptionSuite.scala | 136 +++ .../spark/sql/hive/HiveExternalCatalog.scala| 227 +++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 16 +- .../sql/hive/HiveMetastoreCatalogSuite.scala| 3 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 28 ++- .../spark/sql/hive/MultiDatabaseSuite.scala | 2 +- .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 19 files changed, 520 insertions(+), 335 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5ea2f9e5/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d7fe6b3..ee48baa 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2659,7 +2659,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume # It makes sure that we can omit path argument in write.df API and then it calls # DataFrameWriter.save() without path. expect_error(write.df(df, source = "csv"), - "Error in save : illegal argument - 'path' is not specified") + "Error in save : illegal argument - Expected exactly one path to be specified") expect_error(write.json(df, jsonPath), "Error in json : analysis error - path file:.*already exists") expect_error(write.text(df, jsonPath), @@ -2667,7 +2667,7 @@
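To make the user-facing effect of this unification concrete, here is a small sketch (not taken from the patch) showing that the `path` option keeps working for data source tables while the location is now carried through the catalog's `locationUri`; it assumes an active SparkSession named `spark`, and the table name and path are hypothetical.
```scala
// Illustrative sketch only (assumes a SparkSession `spark` and a writable /tmp directory).
val df = spark.range(10).selectExpr("id", "id * 2 AS doubled")

// The `path` option is still the way to pin a data source table to a location;
// internally it now flows through CatalogTable.storage.locationUri rather than a
// special entry in the storage properties.
df.write
  .format("parquet")
  .option("path", "/tmp/spark17470_demo")
  .saveAsTable("spark17470_demo")

// The table's location should surface when describing the table.
spark.sql("DESCRIBE FORMATTED spark17470_demo").show(100, truncate = false)

// A plain save() with no path still fails with
// "Expected exactly one path to be specified", as exercised by the R test above.
```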
spark git commit: [SPARK-17470][SQL] unify path for data source table and locationUri for hive serde table
Repository: spark Updated Branches: refs/heads/master fd90541c3 -> 3a1bc6f47 [SPARK-17470][SQL] unify path for data source table and locationUri for hive serde table ## What changes were proposed in this pull request? Due to a limitation of the hive metastore (a table location must be a directory path, not a file path), we always store the `path` for a data source table in its storage properties instead of the `locationUri` field. However, we should not expose this difference at the `CatalogTable` level, but just treat it as a hack in `HiveExternalCatalog`, just as we store the table schema of data source tables in table properties. This PR unifies `path` and `locationUri` outside of `HiveExternalCatalog`: both data source tables and hive serde tables should use the `locationUri` field. This PR also unifies the way we handle the default table location for managed tables. Previously, the default table location of a hive serde managed table was set by the external catalog, but that of a data source table was set by the command. After this PR, we follow the hive way and the default table location is always set by the external catalog. For managed non-file-based tables, we will assign a default table location and create an empty directory for it; the table location will be removed when the table is dropped. This is reasonable, as the metastore doesn't care whether a table is file-based or not, and an empty table directory does no harm. For external non-file-based tables, ideally we can omit the table location, but due to a hive metastore issue, we will assign a random location to it and remove it right after the table is created. See SPARK-15269 for more details. This is fine as it's well isolated in `HiveExternalCatalog`. To keep the existing behaviour of the `path` option, in this PR we always add the `locationUri` to storage properties under the key `path` before passing storage properties to `DataSource` as data source options. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15024 from cloud-fan/path.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3a1bc6f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3a1bc6f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3a1bc6f4 Branch: refs/heads/master Commit: 3a1bc6f4780f8384c1211b1335e7394a4a28377e Parents: fd90541 Author: Wenchen Fan Authored: Wed Nov 2 18:05:14 2016 -0700 Committer: Yin Huai Committed: Wed Nov 2 18:05:14 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 40 ++- .../org/apache/spark/sql/DataFrameWriter.scala | 5 +- .../spark/sql/execution/SparkSqlParser.scala| 17 +- .../command/createDataSourceTables.scala| 37 +-- .../spark/sql/execution/command/ddl.scala | 23 +- .../spark/sql/execution/command/tables.scala| 50 ++-- .../sql/execution/datasources/DataSource.scala | 241 ++- .../datasources/DataSourceStrategy.scala| 3 +- .../apache/spark/sql/internal/CatalogImpl.scala | 4 +- .../spark/sql/execution/command/DDLSuite.scala | 1 - .../spark/sql/sources/PathOptionSuite.scala | 136 +++ .../spark/sql/hive/HiveExternalCatalog.scala| 227 +++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 16 +- .../sql/hive/HiveMetastoreCatalogSuite.scala| 3 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 28 ++- .../spark/sql/hive/MultiDatabaseSuite.scala | 2 +- .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 19 files changed, 520 insertions(+), 335 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3a1bc6f4/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d7fe6b3..ee48baa 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2659,7 +2659,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume # It makes sure that we can omit path argument in write.df API and then it calls # DataFrameWriter.save() without path. expect_error(write.df(df, source = "csv"), - "Error in save : illegal argument - 'path' is not specified") + "Error in save : illegal argument - Expected exactly one path to be specified") expect_error(write.json(df, jsonPath), "Error in json : analysis error - path file:.*already exists") expect_error(write.text(df, jsonPath), @@ -2667,7 +2667,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume expect_error(write
spark git commit: [SPARK-18167][SQL] Also log all partitions when the SQLQuerySuite test flakes
Repository: spark Updated Branches: refs/heads/master de3f87fa7 -> 6633b97b5 [SPARK-18167][SQL] Also log all partitions when the SQLQuerySuite test flakes ## What changes were proposed in this pull request? One possibility for this test flaking is that we have corrupted the partition schema somehow in the tests, which causes the cast to decimal to fail in the call. This should at least show us the actual partition values. ## How was this patch tested? Run it locally, it prints out something like `ArrayBuffer(test(partcol=0), test(partcol=1), test(partcol=2), test(partcol=3), test(partcol=4))`. Author: Eric Liang Closes #15701 from ericl/print-more-info. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6633b97b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6633b97b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6633b97b Branch: refs/heads/master Commit: 6633b97b579c7f003d60b6bfa2e2a248340d3dc6 Parents: de3f87f Author: Eric Liang Authored: Mon Oct 31 16:26:52 2016 -0700 Committer: Yin Huai Committed: Mon Oct 31 16:26:52 2016 -0700 -- .../main/scala/org/apache/spark/sql/hive/client/HiveShim.scala| 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6633b97b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 4bbbd66..85edaf6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -594,9 +594,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { // SPARK-18167 retry to investigate the flaky test. This should be reverted before // the release is cut. val retry = Try(getPartitionsByFilterMethod.invoke(hive, table, filter)) -val full = Try(getAllPartitionsMethod.invoke(hive, table)) logError("getPartitionsByFilter failed, retry success = " + retry.isSuccess) -logError("getPartitionsByFilter failed, full fetch success = " + full.isSuccess) +logError("all partitions: " + getAllPartitions(hive, table)) throw e } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17972][SQL] Add Dataset.checkpoint() to truncate large query plans
Repository: spark Updated Branches: refs/heads/master 26b07f190 -> 8bfc3b7aa [SPARK-17972][SQL] Add Dataset.checkpoint() to truncate large query plans ## What changes were proposed in this pull request? ### Problem Iterative ML code may easily create query plans that grow exponentially. We found that query planning time also increases exponentially even when all the sub-plan trees are cached. The following snippet illustrates the problem:
```scala
(0 until 6).foldLeft(Seq(1, 2, 3).toDS) { (plan, iteration) =>
  println(s"== Iteration $iteration ==")
  val time0 = System.currentTimeMillis()
  val joined = plan.join(plan, "value").join(plan, "value").join(plan, "value").join(plan, "value")
  joined.cache()
  println(s"Query planning takes ${System.currentTimeMillis() - time0} ms")
  joined.as[Int]
}

// == Iteration 0 ==
// Query planning takes 9 ms
// == Iteration 1 ==
// Query planning takes 26 ms
// == Iteration 2 ==
// Query planning takes 53 ms
// == Iteration 3 ==
// Query planning takes 163 ms
// == Iteration 4 ==
// Query planning takes 700 ms
// == Iteration 5 ==
// Query planning takes 3418 ms
```
This is because when building a new Dataset, the new plan is always built upon `QueryExecution.analyzed`, which doesn't leverage existing cached plans. On the other hand, caching every few iterations is usually not the right direction for this problem, since caching is too memory consuming (imagine computing connected components over a graph with 50 billion nodes). What we really need here is to truncate both the query plan (to minimize query planning time) and the lineage of the underlying RDD (to avoid stack overflow). ### Changes introduced in this PR This PR tries to fix this issue by introducing a `checkpoint()` method into `Dataset[T]`, which does exactly the things described above. The snippet in the micro benchmark section below, which is essentially the same as the one above but invokes `checkpoint()` instead of `cache()`, shows the benchmark results of this PR. One key point is that the checkpointed Dataset should preserve the original partitioning and ordering information of the original Dataset, so that we can avoid unnecessary shuffling (similar to reading from a pre-bucketed table). This is done by adding `outputPartitioning` and `outputOrdering` to `LogicalRDD` and `RDDScanExec`. ### Micro benchmark
```scala
spark.sparkContext.setCheckpointDir("/tmp/cp")

(0 until 100).foldLeft(Seq(1, 2, 3).toDS) { (plan, iteration) =>
  println(s"== Iteration $iteration ==")

  val time0 = System.currentTimeMillis()
  val cp = plan.checkpoint()
  cp.count()
  System.out.println(s"Checkpointing takes ${System.currentTimeMillis() - time0} ms")

  val time1 = System.currentTimeMillis()
  val joined = cp.join(cp, "value").join(cp, "value").join(cp, "value").join(cp, "value")
  val result = joined.as[Int]
  println(s"Query planning takes ${System.currentTimeMillis() - time1} ms")

  result
}

// == Iteration 0 ==
// Checkpointing takes 591 ms
// Query planning takes 13 ms
// == Iteration 1 ==
// Checkpointing takes 1605 ms
// Query planning takes 16 ms
// == Iteration 2 ==
// Checkpointing takes 782 ms
// Query planning takes 8 ms
// == Iteration 3 ==
// Checkpointing takes 729 ms
// Query planning takes 10 ms
// == Iteration 4 ==
// Checkpointing takes 734 ms
// Query planning takes 9 ms
// == Iteration 5 ==
// ...
// == Iteration 50 ==
// Checkpointing takes 571 ms
// Query planning takes 7 ms
// == Iteration 51 ==
// Checkpointing takes 548 ms
// Query planning takes 7 ms
// == Iteration 52 ==
// Checkpointing takes 596 ms
// Query planning takes 8 ms
// == Iteration 53 ==
// Checkpointing takes 568 ms
// Query planning takes 7 ms
// ...
```
You may see that although checkpointing is a more heavyweight operation, each iteration takes roughly the same amount of time for both checkpointing and query planning. ### Open question mengxr mentioned that it would be more convenient if we can make `Dataset.checkpoint()` eager, i.e., always perform an `RDD.count()` after calling `RDD.checkpoint()`. Not quite sure whether this is a universal requirement. Maybe we can add an `eager: Boolean` argument for `Dataset.checkpoint()` to support that. ## How was this patch tested? Unit test added in `DatasetSuite`. Author: Cheng Lian Author: Yin Huai Closes #15651 from liancheng/ds-checkpoint. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8bfc3b7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8bfc3b7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8bfc3b7a Branch: refs/heads/master Commit: 8bfc3b7aac577e36aadc4fe6dee0665d0b2ae919 Parents: 26b07f1 Author: Cheng Lian Authored: Mon Oct 31 13:39:59 2016 -0700 Committer: Yin Huai Committed: Mon Oct 31 13:39:59 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 57 +
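On the open question above, callers can already force eager behaviour themselves by materializing right after the call; the helper below is a hypothetical sketch (not part of this patch) of what an `eager: Boolean` convenience would boil down to, assuming a checkpoint directory has already been set as in the benchmark.
```scala
import org.apache.spark.sql.Dataset

// Hypothetical helper (not part of this PR): force materialization of the checkpoint
// immediately, so later actions start from the truncated plan and RDD lineage.
// Assumes spark.sparkContext.setCheckpointDir(...) has been called, as above.
def checkpointEagerly[T](ds: Dataset[T]): Dataset[T] = {
  val cp = ds.checkpoint() // truncates the logical plan and the underlying RDD lineage
  cp.count()               // runs a job now, so the checkpoint data is written up front
  cp
}

// Usage mirrors the micro benchmark above:
// val cp = checkpointEagerly(plan)
// cp.join(cp, "value")
```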
[1/2] spark git commit: [SPARK-17970][SQL] store partition spec in metastore for data source table
Repository: spark Updated Branches: refs/heads/master 79fd0cc05 -> ccb115430 http://git-wip-us.apache.org/repos/asf/spark/blob/ccb11543/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala new file mode 100644 index 000..5f16960 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.File + +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils + +class PartitionProviderCompatibilitySuite + extends QueryTest with TestHiveSingleton with SQLTestUtils { + + private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = { +spark.range(5).selectExpr("id as fieldOne", "id as partCol").write + .partitionBy("partCol") + .mode("overwrite") + .parquet(dir.getAbsolutePath) + +spark.sql(s""" + |create table $tableName (fieldOne long, partCol int) + |using parquet + |options (path "${dir.getAbsolutePath}") + |partitioned by (partCol)""".stripMargin) + } + + private def verifyIsLegacyTable(tableName: String): Unit = { +val unsupportedCommands = Seq( + s"ALTER TABLE $tableName ADD PARTITION (partCol=1) LOCATION '/foo'", + s"ALTER TABLE $tableName PARTITION (partCol=1) RENAME TO PARTITION (partCol=2)", + s"ALTER TABLE $tableName PARTITION (partCol=1) SET LOCATION '/foo'", + s"ALTER TABLE $tableName DROP PARTITION (partCol=1)", + s"DESCRIBE $tableName PARTITION (partCol=1)", + s"SHOW PARTITIONS $tableName") + +withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + for (cmd <- unsupportedCommands) { +val e = intercept[AnalysisException] { + spark.sql(cmd) +} +assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e) + } +} + } + + test("convert partition provider to hive with repair table") { +withTable("test") { + withTempDir { dir => +withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + setupPartitionedDatasourceTable("test", dir) + assert(spark.sql("select * from test").count() == 5) +} +withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + verifyIsLegacyTable("test") + spark.sql("msck repair table test") + spark.sql("show partitions test").count() // check we are a new table + + // sanity check table performance + 
HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol < 2").count() == 2) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2) +} + } +} + } + + test("when partition management is enabled, new tables have partition provider hive") { +withTable("test") { + withTempDir { dir => +withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + setupPartitionedDatasourceTable("test", dir) + spark.sql("show partitions test").count() // check we are a new table + assert(spark.sql("select * from test").count() == 0) // needs repair + spark.sql("msck repair table test") + assert(spark.sql("select * from test").count() == 5) +} + } +} + } + + test("when partition management is disabled, new tables have no partition provider") { +withTable("test") { + withTempDir { dir => +withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + setupPart
[2/2] spark git commit: [SPARK-17970][SQL] store partition spec in metastore for data source table
[SPARK-17970][SQL] store partition spec in metastore for data source table ## What changes were proposed in this pull request? We should follow hive table and also store partition spec in metastore for data source table. This brings 2 benefits: 1. It's more flexible to manage the table data files, as users can use `ADD PARTITION`, `DROP PARTITION` and `RENAME PARTITION` 2. We don't need to cache all file status for data source table anymore. ## How was this patch tested? existing tests. Author: Eric Liang Author: Michael Allman Author: Eric Liang Author: Wenchen Fan Closes #15515 from cloud-fan/partition. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ccb11543 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ccb11543 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ccb11543 Branch: refs/heads/master Commit: ccb11543048dccd4cc590a8db1df1d9d5847d112 Parents: 79fd0cc Author: Eric Liang Authored: Thu Oct 27 14:22:30 2016 -0700 Committer: Yin Huai Committed: Thu Oct 27 14:22:30 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 12 +- .../sql/catalyst/trees/TreeNodeSuite.scala | 1 + .../org/apache/spark/sql/DataFrameWriter.scala | 13 +- .../command/AnalyzeColumnCommand.scala | 3 +- .../execution/command/AnalyzeTableCommand.scala | 3 +- .../command/createDataSourceTables.scala| 17 +- .../spark/sql/execution/command/ddl.scala | 90 +++--- .../spark/sql/execution/command/tables.scala| 39 +-- .../sql/execution/datasources/DataSource.scala | 20 +- .../datasources/DataSourceStrategy.scala| 15 +- .../sql/execution/datasources/FileCatalog.scala | 4 + .../execution/datasources/FileStatusCache.scala | 2 +- .../PartitioningAwareFileCatalog.scala | 12 +- .../datasources/TableFileCatalog.scala | 4 +- .../org/apache/spark/sql/internal/SQLConf.scala | 16 +- .../apache/spark/sql/SQLQueryTestSuite.scala| 2 +- .../spark/sql/execution/command/DDLSuite.scala | 200 +--- .../spark/sql/hive/HiveExternalCatalog.scala| 129 +--- .../spark/sql/hive/HiveMetastoreCatalog.scala | 9 +- .../spark/sql/hive/client/HiveClientImpl.scala | 5 +- .../spark/sql/hive/HiveMetadataCacheSuite.scala | 2 +- .../sql/hive/HiveTablePerfStatsSuite.scala | 240 --- .../PartitionProviderCompatibilitySuite.scala | 137 + .../hive/PartitionedTablePerfStatsSuite.scala | 304 +++ .../apache/spark/sql/hive/StatisticsSuite.scala | 65 ++-- .../sql/hive/execution/HiveCommandSuite.scala | 5 +- .../sql/hive/execution/SQLQuerySuite.scala | 8 +- 27 files changed, 812 insertions(+), 545 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ccb11543/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index a97ed70..7c3bec8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -89,9 +89,10 @@ case class CatalogTablePartition( parameters: Map[String, String] = Map.empty) { override def toString: String = { +val specString = spec.map { case (k, v) => s"$k=$v" }.mkString(", ") val output = Seq( -s"Partition Values: [${spec.values.mkString(", ")}]", +s"Partition Values: [$specString]", s"$storage", s"Partition Parameters:{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}") @@ -137,6 +138,8 @@ case class BucketSpec( 
* Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the *underlying table but not supported by Spark SQL yet. + * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive + *metastore. */ case class CatalogTable( identifier: TableIdentifier, @@ -154,7 +157,8 @@ case class CatalogTable( viewOriginalText: Option[String] = None, viewText: Option[String] = None, comment: Option[String] = None, -unsupportedFeatures: Seq[String] = Seq.empty) { +unsupportedFeatures: Seq[String] = Seq.empty, +partitionProviderIsHive: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -212,11 +216,11 @@ case class CatalogTable(
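As a rough sketch of what benefit (1) above looks like in practice, the commands below exercise partition DDL on a partitioned data source table once its partition metadata lives in the metastore. This is illustrative rather than taken from the patch: it assumes a SparkSession `spark`, a table shaped like the `test` table set up in the compatibility suite above, and the SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS flag enabled; the partition values and location are hypothetical.
```scala
// Illustrative sketch only (assumes a partitioned data source table test(fieldOne, partCol)
// as in PartitionProviderCompatibilitySuite, with partition management enabled).
spark.sql("MSCK REPAIR TABLE test")          // import partitions discovered on disk
spark.sql("SHOW PARTITIONS test").show()     // partition specs now come from the metastore

// Partition-level DDL that previously only worked for hive serde tables:
spark.sql("ALTER TABLE test ADD PARTITION (partCol=100) LOCATION '/tmp/extra_partition'")
spark.sql("ALTER TABLE test PARTITION (partCol=100) RENAME TO PARTITION (partCol=101)")
spark.sql("ALTER TABLE test DROP PARTITION (partCol=101)")
```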
spark git commit: [SPARK-18132] Fix checkstyle
Repository: spark Updated Branches: refs/heads/branch-2.0 dcf2f090c -> 1a4be51d6 [SPARK-18132] Fix checkstyle This PR fixes checkstyle. Author: Yin Huai Closes #15656 from yhuai/fix-format. (cherry picked from commit d3b4831d009905185ad74096ce3ecfa934bc191d) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a4be51d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a4be51d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a4be51d Branch: refs/heads/branch-2.0 Commit: 1a4be51d64eaafe2fa0e69d0c3c81f7b40051427 Parents: dcf2f09 Author: Yin Huai Authored: Wed Oct 26 22:22:23 2016 -0700 Committer: Yin Huai Committed: Wed Oct 26 22:22:55 2016 -0700 -- .../spark/util/collection/unsafe/sort/UnsafeExternalSorter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a4be51d/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 6e03064..56d54a1 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -142,9 +142,10 @@ public final class UnsafeExternalSorter extends MemoryConsumer { this.recordComparator = recordComparator; this.prefixComparator = prefixComparator; // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units -// this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; +// this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024 this.fileBufferSizeBytes = 32 * 1024; -// The spill metrics are stored in a new ShuffleWriteMetrics, and then discarded (this fixes SPARK-16827). +// The spill metrics are stored in a new ShuffleWriteMetrics, +// and then discarded (this fixes SPARK-16827). // TODO: Instead, separate spill metrics should be stored and reported (tracked in SPARK-3577). this.writeMetrics = new ShuffleWriteMetrics(); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18132] Fix checkstyle
Repository: spark Updated Branches: refs/heads/master dd4f088c1 -> d3b4831d0 [SPARK-18132] Fix checkstyle This PR fixes checkstyle. Author: Yin Huai Closes #15656 from yhuai/fix-format. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d3b4831d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d3b4831d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d3b4831d Branch: refs/heads/master Commit: d3b4831d009905185ad74096ce3ecfa934bc191d Parents: dd4f088 Author: Yin Huai Authored: Wed Oct 26 22:22:23 2016 -0700 Committer: Yin Huai Committed: Wed Oct 26 22:22:23 2016 -0700 -- .../spark/util/collection/unsafe/sort/UnsafeExternalSorter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d3b4831d/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 7835017..dcae4a3 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -143,9 +143,10 @@ public final class UnsafeExternalSorter extends MemoryConsumer { this.recordComparator = recordComparator; this.prefixComparator = prefixComparator; // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units -// this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; +// this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024 this.fileBufferSizeBytes = 32 * 1024; -// The spill metrics are stored in a new ShuffleWriteMetrics, and then discarded (this fixes SPARK-16827). +// The spill metrics are stored in a new ShuffleWriteMetrics, +// and then discarded (this fixes SPARK-16827). // TODO: Instead, separate spill metrics should be stored and reported (tracked in SPARK-3577). this.writeMetrics = new ShuffleWriteMetrics(); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18070][SQL] binary operator should not consider nullability when comparing input types
Repository: spark Updated Branches: refs/heads/branch-2.0 1c1e847bc -> 7c8d9a557 [SPARK-18070][SQL] binary operator should not consider nullability when comparing input types ## What changes were proposed in this pull request? Binary operator requires its inputs to be of same type, but it should not consider nullability, e.g. `EqualTo` should be able to compare an element-nullable array and an element-non-nullable array. ## How was this patch tested? a regression test in `DataFrameSuite` Author: Wenchen Fan Closes #15606 from cloud-fan/type-bug. (cherry picked from commit a21791e3164f4e6546fbe0a90017a4394a05deb1) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7c8d9a55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7c8d9a55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7c8d9a55 Branch: refs/heads/branch-2.0 Commit: 7c8d9a55707ceb1b93ce1f91bcedb10aea8d5c3c Parents: 1c1e847 Author: Wenchen Fan Authored: Tue Oct 25 12:08:17 2016 -0700 Committer: Yin Huai Committed: Tue Oct 25 12:08:28 2016 -0700 -- .../apache/spark/sql/catalyst/expressions/Expression.scala | 2 +- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala| 9 + 2 files changed, 10 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7c8d9a55/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 7abbbe2..0f6a896 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -511,7 +511,7 @@ abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes { override def checkInputDataTypes(): TypeCheckResult = { // First check whether left and right have the same type, then check if the type is acceptable. -if (left.dataType != right.dataType) { +if (!left.dataType.sameType(right.dataType)) { TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " + s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).") } else if (!inputType.acceptsType(left.dataType)) { http://git-wip-us.apache.org/repos/asf/spark/blob/7c8d9a55/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index f8d7ddd..4478a9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1601,4 +1601,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { dates.except(widenTypedRows).collect() dates.intersect(widenTypedRows).collect() } + + test("SPARK-18070 binary operator should not consider nullability when comparing input types") { +val rows = Seq(Row(Seq(1), Seq(1))) +val schema = new StructType() + .add("array1", ArrayType(IntegerType)) + .add("array2", ArrayType(IntegerType, containsNull = false)) +val df = spark.createDataFrame(spark.sparkContext.makeRDD(rows), schema) +assert(df.filter($"array1" === $"array2").count() == 1) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
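The heart of the fix is swapping strict type equality for `sameType`, which ignores nullability when comparing types; the snippet below is a rough illustration of the difference, not part of the patch, and note that `sameType` is package-private to Spark, so this is the kind of check that runs inside the analyzer rather than in user code.
```scala
import org.apache.spark.sql.types.{ArrayType, IntegerType}

// Two array types that differ only in element nullability.
val withNullableElems    = ArrayType(IntegerType, containsNull = true)
val withNonNullableElems = ArrayType(IntegerType, containsNull = false)

// The old check used strict equality, so these counted as "differing types in ..."
// and EqualTo on such columns failed analysis.
println(withNullableElems == withNonNullableElems)   // false

// The new check uses DataType.sameType, which ignores nullability. It is package-private
// to Spark, so it is shown here as a comment only; inside the analyzer the comparison
// of the two array columns now succeeds.
// withNullableElems.sameType(withNonNullableElems)  // true
```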
spark git commit: [SPARK-18070][SQL] binary operator should not consider nullability when comparing input types
Repository: spark Updated Branches: refs/heads/master c5fe3dd4f -> a21791e31 [SPARK-18070][SQL] binary operator should not consider nullability when comparing input types ## What changes were proposed in this pull request? Binary operator requires its inputs to be of same type, but it should not consider nullability, e.g. `EqualTo` should be able to compare an element-nullable array and an element-non-nullable array. ## How was this patch tested? a regression test in `DataFrameSuite` Author: Wenchen Fan Closes #15606 from cloud-fan/type-bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a21791e3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a21791e3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a21791e3 Branch: refs/heads/master Commit: a21791e3164f4e6546fbe0a90017a4394a05deb1 Parents: c5fe3dd Author: Wenchen Fan Authored: Tue Oct 25 12:08:17 2016 -0700 Committer: Yin Huai Committed: Tue Oct 25 12:08:17 2016 -0700 -- .../apache/spark/sql/catalyst/expressions/Expression.scala | 2 +- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala| 9 + 2 files changed, 10 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a21791e3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index fa1a2ad..9edc1ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -511,7 +511,7 @@ abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes { override def checkInputDataTypes(): TypeCheckResult = { // First check whether left and right have the same type, then check if the type is acceptable. -if (left.dataType != right.dataType) { +if (!left.dataType.sameType(right.dataType)) { TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " + s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).") } else if (!inputType.acceptsType(left.dataType)) { http://git-wip-us.apache.org/repos/asf/spark/blob/a21791e3/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 3fb7eee..33b3b78 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1649,4 +1649,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { dates.except(widenTypedRows).collect() dates.intersect(widenTypedRows).collect() } + + test("SPARK-18070 binary operator should not consider nullability when comparing input types") { +val rows = Seq(Row(Seq(1), Seq(1))) +val schema = new StructType() + .add("array1", ArrayType(IntegerType)) + .add("array2", ArrayType(IntegerType, containsNull = false)) +val df = spark.createDataFrame(spark.sparkContext.makeRDD(rows), schema) +assert(df.filter($"array1" === $"array2").count() == 1) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17926][SQL][STREAMING] Added json for statuses
Repository: spark Updated Branches: refs/heads/branch-2.0 78458a7eb -> af2e6e0c9 [SPARK-17926][SQL][STREAMING] Added json for statuses ## What changes were proposed in this pull request? StreamingQueryStatus exposed through StreamingQueryListener often needs to be recorded (similar to SparkListener events). This PR adds `.json` and `.prettyJson` to `StreamingQueryStatus`, `SourceStatus` and `SinkStatus`. ## How was this patch tested? New unit tests Author: Tathagata Das Closes #15476 from tdas/SPARK-17926. (cherry picked from commit 7a531e3054f8d4820216ed379433559f57f571b8) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af2e6e0c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af2e6e0c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af2e6e0c Branch: refs/heads/branch-2.0 Commit: af2e6e0c9c85c40bc505ed1183857a8fb60fbd72 Parents: 78458a7 Author: Tathagata Das Authored: Fri Oct 21 13:07:29 2016 -0700 Committer: Yin Huai Committed: Fri Oct 21 13:07:59 2016 -0700 -- python/pyspark/sql/streaming.py | 11 +- .../apache/spark/sql/streaming/SinkStatus.scala | 18 +++- .../spark/sql/streaming/SourceStatus.scala | 23 +++- .../sql/streaming/StreamingQueryStatus.scala| 55 +++--- .../streaming/StreamingQueryStatusSuite.scala | 105 +++ 5 files changed, 187 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af2e6e0c/python/pyspark/sql/streaming.py -- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 0df63a7..cfe917b 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -205,8 +205,7 @@ class StreamingQueryStatus(object): Pretty string of this query status. >>> print(sqs) -StreamingQueryStatus: -Query name: query +Status of query 'query' Query id: 1 Status timestamp: 123 Input rate: 15.5 rows/sec @@ -220,7 +219,7 @@ class StreamingQueryStatus(object): numRows.input.total: 100 triggerId: 5 Source statuses [1 source]: -Source 1:MySource1 +Source 1 - MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -228,7 +227,7 @@ class StreamingQueryStatus(object): numRows.input.source: 100 latency.getOffset.source: 10 latency.getBatch.source: 20 -Sink status: MySink +Sink status - MySink Committed offsets: [#1, -] """ return self._jsqs.toString() @@ -366,7 +365,7 @@ class SourceStatus(object): Pretty string of this source status. >>> print(sqs.sourceStatuses[0]) -SourceStatus:MySource1 +Status of source MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -457,7 +456,7 @@ class SinkStatus(object): Pretty string of this source status. 
>>> print(sqs.sinkStatus) -SinkStatus:MySink +Status of sink MySink Committed offsets: [#1, -] """ return self._jss.toString() http://git-wip-us.apache.org/repos/asf/spark/blob/af2e6e0c/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala index c991166..ab19602 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql.streaming +import org.json4s._ +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.streaming.StreamingQueryStatus.indent @@ -34,8 +39,19 @@ class SinkStatus private( val description: String, val offsetDesc: String) { + /** The compact JSON representation of this status. */ + def json: String = compact(render(jsonValue)) + + /** The pretty (i.e. indented) JSON representation of this status. */ + def prettyJson: String = pretty(render(jsonValue)) + override def toString: String = -"SinkStatus:" + indent(prettyString) +"Status of sink " + indent(prettyString).trim + +
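Since the motivation above is recording these statuses the way SparkListener events are recorded, a minimal sketch of that usage follows; the recorder class and file sink are assumptions for illustration, and only the new `json`/`prettyJson` accessors come from this patch.
```scala
import java.io.{FileWriter, PrintWriter}

import org.apache.spark.sql.streaming.StreamingQueryStatus

// Hypothetical recorder (illustration only): append each status snapshot as one JSON line,
// using the compact `json` form added by this patch; `prettyJson` gives the indented form
// for human inspection.
class StatusRecorder(path: String) {
  def record(status: StreamingQueryStatus): Unit = {
    val out = new PrintWriter(new FileWriter(path, /* append = */ true))
    try out.println(status.json)
    finally out.close()
  }
}
```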
spark git commit: [SPARK-17926][SQL][STREAMING] Added json for statuses
Repository: spark Updated Branches: refs/heads/master e371040a0 -> 7a531e305 [SPARK-17926][SQL][STREAMING] Added json for statuses ## What changes were proposed in this pull request? StreamingQueryStatus exposed through StreamingQueryListener often needs to be recorded (similar to SparkListener events). This PR adds `.json` and `.prettyJson` to `StreamingQueryStatus`, `SourceStatus` and `SinkStatus`. ## How was this patch tested? New unit tests Author: Tathagata Das Closes #15476 from tdas/SPARK-17926. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a531e30 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a531e30 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a531e30 Branch: refs/heads/master Commit: 7a531e3054f8d4820216ed379433559f57f571b8 Parents: e371040 Author: Tathagata Das Authored: Fri Oct 21 13:07:29 2016 -0700 Committer: Yin Huai Committed: Fri Oct 21 13:07:29 2016 -0700 -- python/pyspark/sql/streaming.py | 11 +- .../apache/spark/sql/streaming/SinkStatus.scala | 18 +++- .../spark/sql/streaming/SourceStatus.scala | 23 +++- .../sql/streaming/StreamingQueryStatus.scala| 55 +++--- .../streaming/StreamingQueryStatusSuite.scala | 105 +++ 5 files changed, 187 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a531e30/python/pyspark/sql/streaming.py -- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index ce47bd1..35fc469 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -205,8 +205,7 @@ class StreamingQueryStatus(object): Pretty string of this query status. >>> print(sqs) -StreamingQueryStatus: -Query name: query +Status of query 'query' Query id: 1 Status timestamp: 123 Input rate: 15.5 rows/sec @@ -220,7 +219,7 @@ class StreamingQueryStatus(object): numRows.input.total: 100 triggerId: 5 Source statuses [1 source]: -Source 1:MySource1 +Source 1 - MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -228,7 +227,7 @@ class StreamingQueryStatus(object): numRows.input.source: 100 latency.getOffset.source: 10 latency.getBatch.source: 20 -Sink status: MySink +Sink status - MySink Committed offsets: [#1, -] """ return self._jsqs.toString() @@ -366,7 +365,7 @@ class SourceStatus(object): Pretty string of this source status. >>> print(sqs.sourceStatuses[0]) -SourceStatus:MySource1 +Status of source MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -457,7 +456,7 @@ class SinkStatus(object): Pretty string of this source status. 
>>> print(sqs.sinkStatus) -SinkStatus:MySink +Status of sink MySink Committed offsets: [#1, -] """ return self._jss.toString() http://git-wip-us.apache.org/repos/asf/spark/blob/7a531e30/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala index c991166..ab19602 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql.streaming +import org.json4s._ +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.streaming.StreamingQueryStatus.indent @@ -34,8 +39,19 @@ class SinkStatus private( val description: String, val offsetDesc: String) { + /** The compact JSON representation of this status. */ + def json: String = compact(render(jsonValue)) + + /** The pretty (i.e. indented) JSON representation of this status. */ + def prettyJson: String = pretty(render(jsonValue)) + override def toString: String = -"SinkStatus:" + indent(prettyString) +"Status of sink " + indent(prettyString).trim + + private[sql] def jsonValue: JValue = { +("description" -> JString(description)) ~ +("offsetDe