spark git commit: [SPARK-8687] [YARN] Fix bug: Executor can't fetch the newly set configuration in yarn-client
Repository: spark Updated Branches: refs/heads/branch-1.4 e33c0f0a4 -> 7cbfef23a [SPARK-8687] [YARN] Fix bug: Executor can't fetch the new set configuration in yarn-client Spark initi the properties CoarseGrainedSchedulerBackend.start ```scala // TODO (prashant) send conf instead of properties driverEndpoint = rpcEnv.setupEndpoint( CoarseGrainedSchedulerBackend.ENDPOINT_NAME, new DriverEndpoint(rpcEnv, properties)) ``` Then the yarn logic will set some configuration but not update in this `properties`. So `Executor` won't gain the `properties`. [Jira](https://issues.apache.org/jira/browse/SPARK-8687) Author: huangzhaowei Closes #7066 from SaintBacchus/SPARK-8687 and squashes the following commits: 1de4f48 [huangzhaowei] Ensure all necessary properties have already been set before startup ExecutorLaucher (cherry picked from commit 1b0c8e61040bf06213f9758f775679dcc41b0cce) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7cbfef23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7cbfef23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7cbfef23 Branch: refs/heads/branch-1.4 Commit: 7cbfef23aa4fd57b9eaee12a120406d1cbb26ef3 Parents: e33c0f0 Author: huangzhaowei Authored: Wed Jul 1 23:14:13 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:14:54 2015 -0700 -- .../spark/scheduler/cluster/YarnClientSchedulerBackend.scala | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7cbfef23/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index e7b0af6..cb6008e 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -41,7 +41,6 @@ private[spark] class YarnClientSchedulerBackend( * This waits until the application is running. */ override def start() { -super.start() val driverHost = conf.get("spark.driver.host") val driverPort = conf.get("spark.driver.port") val hostport = driverHost + ":" + driverPort @@ -56,6 +55,12 @@ private[spark] class YarnClientSchedulerBackend( totalExpectedExecutors = args.numExecutors client = new Client(args, conf) appId = client.submitApplication() + +// SPARK-8687: Ensure all necessary properties have already been set before +// we initialize our driver scheduler backend, which serves these properties +// to the executors +super.start() + waitForApplication() monitorThread = asyncMonitorApplication() monitorThread.start() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8687] [YARN] Fix bug: Executor can't fetch the newly set configuration in yarn-client
Repository: spark Updated Branches: refs/heads/master 3697232b7 -> 1b0c8e610 [SPARK-8687] [YARN] Fix bug: Executor can't fetch the new set configuration in yarn-client Spark initi the properties CoarseGrainedSchedulerBackend.start ```scala // TODO (prashant) send conf instead of properties driverEndpoint = rpcEnv.setupEndpoint( CoarseGrainedSchedulerBackend.ENDPOINT_NAME, new DriverEndpoint(rpcEnv, properties)) ``` Then the yarn logic will set some configuration but not update in this `properties`. So `Executor` won't gain the `properties`. [Jira](https://issues.apache.org/jira/browse/SPARK-8687) Author: huangzhaowei Closes #7066 from SaintBacchus/SPARK-8687 and squashes the following commits: 1de4f48 [huangzhaowei] Ensure all necessary properties have already been set before startup ExecutorLaucher Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b0c8e61 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b0c8e61 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b0c8e61 Branch: refs/heads/master Commit: 1b0c8e61040bf06213f9758f775679dcc41b0cce Parents: 3697232 Author: huangzhaowei Authored: Wed Jul 1 23:14:13 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:14:13 2015 -0700 -- .../spark/scheduler/cluster/YarnClientSchedulerBackend.scala | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b0c8e61/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index dd8c4fd..3a0b944 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -41,7 +41,6 @@ private[spark] class YarnClientSchedulerBackend( * This waits until the application is running. */ override def start() { -super.start() val driverHost = conf.get("spark.driver.host") val driverPort = conf.get("spark.driver.port") val hostport = driverHost + ":" + driverPort @@ -56,6 +55,12 @@ private[spark] class YarnClientSchedulerBackend( totalExpectedExecutors = args.numExecutors client = new Client(args, conf) appId = client.submitApplication() + +// SPARK-8687: Ensure all necessary properties have already been set before +// we initialize our driver scheduler backend, which serves these properties +// to the executors +super.start() + waitForApplication() monitorThread = asyncMonitorApplication() monitorThread.start() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
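For readers unfamiliar with the backend lifecycle, here is a minimal standalone sketch of why the placement of `super.start()` matters. The class and property names are hypothetical (not Spark's API); the point is only that the driver endpoint snapshots the configuration when `start()` runs, so anything set afterwards never reaches the executors.

```scala
import scala.collection.mutable

// Hypothetical stand-ins for SparkConf and the scheduler backend, only to show
// the ordering problem: the backend captures the properties when start() runs.
class Conf {
  private val props = mutable.Map[String, String]()
  def set(key: String, value: String): Unit = { props(key) = value }
  def getAll: Map[String, String] = props.toMap
}

class Backend(conf: Conf) {
  private var snapshot: Map[String, String] = Map.empty
  def start(): Unit = { snapshot = conf.getAll }   // what executors will see
  def propertiesServedToExecutors: Map[String, String] = snapshot
}

object OrderingDemo extends App {
  val conf = new Conf

  // Pre-fix order: the backend starts before the YARN client sets its properties.
  val broken = new Backend(conf)
  broken.start()
  conf.set("spark.yarn.extra.setting", "value")   // set too late to be captured
  println(broken.propertiesServedToExecutors.get("spark.yarn.extra.setting"))  // None

  // Post-fix order: finish every configuration update, then start the backend.
  val fixed = new Backend(conf)
  fixed.start()
  println(fixed.propertiesServedToExecutors.get("spark.yarn.extra.setting"))   // Some(value)
}
```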
spark git commit: [SPARK-8740] [PROJECT INFRA] Support GitHub OAuth tokens in dev/merge_spark_pr.py
Repository: spark Updated Branches: refs/heads/master 15d41cc50 -> 377ff4c9e [SPARK-8740] [PROJECT INFRA] Support GitHub OAuth tokens in dev/merge_spark_pr.py This commit allows `dev/merge_spark_pr.py` to use personal GitHub OAuth tokens in order to make authenticated requests. This is necessary to work around per-IP rate limiting issues. To use a token, just set the `GITHUB_OAUTH_KEY` environment variable. You can create a personal token at https://github.com/settings/tokens; we only require `public_repo` scope. If the script fails due to a rate-limit issue, it now logs a useful message directing the user to the OAuth token instructions. Author: Josh Rosen Closes #7136 from JoshRosen/pr-merge-script-oauth-authentication and squashes the following commits: 4d011bd [Josh Rosen] Fix error message 23d92ff [Josh Rosen] Support GitHub OAuth tokens in dev/merge_spark_pr.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/377ff4c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/377ff4c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/377ff4c9 Branch: refs/heads/master Commit: 377ff4c9e8942882183d94698684824e9dc9f391 Parents: 15d41cc Author: Josh Rosen Authored: Wed Jul 1 23:06:52 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:06:52 2015 -0700 -- dev/merge_spark_pr.py | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/377ff4c9/dev/merge_spark_pr.py -- diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index cf827ce..4a17d48 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -47,6 +47,12 @@ PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "") # ASF JIRA password JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "") +# OAuth key used for issuing requests against the GitHub API. If this is not defined, then requests +# will be unauthenticated. You should only need to configure this if you find yourself regularly +# exceeding your IP's unauthenticated request rate limit. You can create an OAuth key at +# https://github.com/settings/tokens. This script only requires the "public_repo" scope. +GITHUB_OAUTH_KEY = os.environ.get("GITHUB_OAUTH_KEY") + GITHUB_BASE = "https://github.com/apache/spark/pull"; GITHUB_API_BASE = "https://api.github.com/repos/apache/spark"; @@ -58,9 +64,17 @@ BRANCH_PREFIX = "PR_TOOL" def get_json(url): try: -return json.load(urllib2.urlopen(url)) +request = urllib2.Request(url) +if GITHUB_OAUTH_KEY: +request.add_header('Authorization', 'token %s' % GITHUB_OAUTH_KEY) +return json.load(urllib2.urlopen(request)) except urllib2.HTTPError as e: -print "Unable to fetch URL, exiting: %s" % url +if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': +print "Exceeded the GitHub API rate limit; see the instructions in " + \ + "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ + "GitHub requests." +else: +print "Unable to fetch URL, exiting: %s" % url sys.exit(-1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
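The patch itself is Python, but the idea is small enough to sketch in Scala as well: read the same `GITHUB_OAUTH_KEY` environment variable and, if set, send it as an `Authorization: token ...` header so requests count against the authenticated rate limit rather than the per-IP one. The helper below is illustrative, not part of the patch.

```scala
import java.net.{HttpURLConnection, URL}
import scala.io.Source

object GitHubApiDemo extends App {
  // Same environment variable the merge script reads; None means unauthenticated.
  val token = sys.env.get("GITHUB_OAUTH_KEY")

  def getJson(url: String): String = {
    val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
    // Attach the personal token, when configured, as an OAuth token header.
    token.foreach(t => conn.setRequestProperty("Authorization", s"token $t"))
    try {
      Source.fromInputStream(conn.getInputStream, "UTF-8").mkString
    } finally {
      conn.disconnect()
    }
  }

  println(getJson("https://api.github.com/repos/apache/spark"))
}
```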
spark git commit: [SPARK-3071] Increase default driver memory
Repository: spark Updated Branches: refs/heads/master 377ff4c9e -> 3697232b7 [SPARK-3071] Increase default driver memory I've updated default values in comments, documentation, and in the command line builder to be 1g based on comments in the JIRA. I've also updated most usages to point at a single variable defined in the Utils.scala and JavaUtils.java files. This wasn't possible in all cases (R, shell scripts etc.) but usage in most code is now pointing at the same place. Please let me know if I've missed anything. Will the spark-shell use the value within the command line builder during instantiation? Author: Ilya Ganelin Closes #7132 from ilganeli/SPARK-3071 and squashes the following commits: 4074164 [Ilya Ganelin] String fix 271610b [Ilya Ganelin] Merge branch 'SPARK-3071' of github.com:ilganeli/spark into SPARK-3071 273b6e9 [Ilya Ganelin] Test fix fd67721 [Ilya Ganelin] Update JavaUtils.java 26cc177 [Ilya Ganelin] test fix e5db35d [Ilya Ganelin] Fixed test failure 39732a1 [Ilya Ganelin] merge fix a6f7deb [Ilya Ganelin] Created default value for DRIVER MEM in Utils that's now used in almost all locations instead of setting manually in each 09ad698 [Ilya Ganelin] Update SubmitRestProtocolSuite.scala 19b6f25 [Ilya Ganelin] Missed one doc update 2698a3d [Ilya Ganelin] Updated default value for driver memory Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3697232b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3697232b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3697232b Branch: refs/heads/master Commit: 3697232b7d438979cc119b2a364296b0eec4a16a Parents: 377ff4c Author: Ilya Ganelin Authored: Wed Jul 1 23:11:02 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:11:02 2015 -0700 -- R/pkg/R/sparkR.R | 2 +- conf/spark-env.sh.template| 2 +- .../scala/org/apache/spark/deploy/ClientArguments.scala | 2 +- .../org/apache/spark/deploy/SparkSubmitArguments.scala| 5 +++-- .../apache/spark/deploy/rest/mesos/MesosRestServer.scala | 2 +- .../org/apache/spark/deploy/worker/WorkerArguments.scala | 2 +- core/src/main/scala/org/apache/spark/util/Utils.scala | 6 ++ .../spark/deploy/rest/SubmitRestProtocolSuite.scala | 10 +- docs/configuration.md | 4 ++-- .../org/apache/spark/launcher/CommandBuilderUtils.java| 2 +- .../apache/spark/launcher/SparkSubmitCommandBuilder.java | 2 +- .../apache/spark/mllib/tree/model/DecisionTreeModel.scala | 2 +- .../spark/mllib/tree/model/treeEnsembleModels.scala | 2 +- .../java/org/apache/spark/network/util/JavaUtils.java | 6 ++ .../org/apache/spark/deploy/yarn/ClientArguments.scala| 7 --- 15 files changed, 35 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3697232b/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 633b869..86233e0 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -109,7 +109,7 @@ sparkR.init <- function( return(get(".sparkRjsc", envir = .sparkREnv)) } - sparkMem <- Sys.getenv("SPARK_MEM", "512m") + sparkMem <- Sys.getenv("SPARK_MEM", "1024m") jars <- suppressWarnings(normalizePath(as.character(sparkJars))) # Classpath separator is ";" on Windows http://git-wip-us.apache.org/repos/asf/spark/blob/3697232b/conf/spark-env.sh.template -- diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index 43c4288..192d3ae 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -22,7 +22,7 @@ # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) # - 
SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) -# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb) +# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 1G) # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: âdefaultâ) # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. http://git-wip-us.apache.org/repos/asf/spark/blob/3697232b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala index 316e2d5..42d3296 100644 --- a/cor
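A brief sketch of the pattern the change moves toward, with hypothetical names rather than Spark's actual resolution code: keep the fallback value in one shared constant and consult it only when neither the CLI flag nor `spark.driver.memory` is set, instead of repeating "512m"/"1g" literals across launchers, docs, and scripts.

```scala
object DriverMemoryDemo extends App {
  // Single source of truth for the fallback (1g after SPARK-3071).
  val DefaultDriverMemory = "1g"

  // Resolution order: explicit CLI flag, then the conf entry, then the default.
  def resolveDriverMemory(cliArg: Option[String], conf: Map[String, String]): String =
    cliArg.orElse(conf.get("spark.driver.memory")).getOrElse(DefaultDriverMemory)

  println(resolveDriverMemory(None, Map.empty))                                 // 1g
  println(resolveDriverMemory(None, Map("spark.driver.memory" -> "4g")))        // 4g
  println(resolveDriverMemory(Some("8g"), Map("spark.driver.memory" -> "4g")))  // 8g
}
```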
spark git commit: [SPARK-8769] [TRIVIAL] [DOCS] toLocalIterator should mention it results in many jobs
Repository: spark Updated Branches: refs/heads/branch-1.4 5b468cf0c -> e33c0f0a4 [SPARK-8769] [TRIVIAL] [DOCS] toLocalIterator should mention it results in many jobs Author: Holden Karau Closes #7171 from holdenk/SPARK-8769-toLocalIterator-documentation-improvement and squashes the following commits: 97ddd99 [Holden Karau] Add note (cherry picked from commit 15d41cc501f5fa7ac82c4a6741e416bb557f610a) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e33c0f0a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e33c0f0a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e33c0f0a Branch: refs/heads/branch-1.4 Commit: e33c0f0a497194d93b3c034502a9a49dc22c0cdf Parents: 5b468cf Author: Holden Karau Authored: Wed Jul 1 23:05:45 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:05:57 2015 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e33c0f0a/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 10610f4..cac6e3b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -890,6 +890,10 @@ abstract class RDD[T: ClassTag]( * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. + * + * Note: this results in multiple Spark jobs, and if the input RDD is the result + * of a wide transformation (e.g. join with different partitioners), to avoid + * recomputing the input RDD should be cached first. */ def toLocalIterator: Iterator[T] = withScope { def collectPartition(p: Int): Array[T] = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8769] [TRIVIAL] [DOCS] toLocalIterator should mention it results in many jobs
Repository: spark Updated Branches: refs/heads/master d14338eaf -> 15d41cc50 [SPARK-8769] [TRIVIAL] [DOCS] toLocalIterator should mention it results in many jobs Author: Holden Karau Closes #7171 from holdenk/SPARK-8769-toLocalIterator-documentation-improvement and squashes the following commits: 97ddd99 [Holden Karau] Add note Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/15d41cc5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/15d41cc5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/15d41cc5 Branch: refs/heads/master Commit: 15d41cc501f5fa7ac82c4a6741e416bb557f610a Parents: d14338e Author: Holden Karau Authored: Wed Jul 1 23:05:45 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:05:45 2015 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/15d41cc5/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 10610f4..cac6e3b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -890,6 +890,10 @@ abstract class RDD[T: ClassTag]( * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. + * + * Note: this results in multiple Spark jobs, and if the input RDD is the result + * of a wide transformation (e.g. join with different partitioners), to avoid + * recomputing the input RDD should be cached first. */ def toLocalIterator: Iterator[T] = withScope { def collectPartition(p: Int): Array[T] = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
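A small usage sketch of the advice in the new note, assuming a local master purely for illustration: cache an RDD with an expensive lineage before iterating it locally, so the per-partition jobs do not recompute it.

```scala
import org.apache.spark.{SparkConf, SparkContext}

object ToLocalIteratorDemo extends App {
  val sc = new SparkContext(new SparkConf().setAppName("demo").setMaster("local[2]"))

  // Stand-in for an RDD whose lineage is expensive to recompute (e.g. a wide join).
  val expensive = sc.parallelize(1 to 100, 4).map(x => x * x)

  expensive.cache()                            // computed once, then served from memory
  expensive.toLocalIterator.foreach(println)   // one job per partition, but no recomputation
  sc.stop()
}
```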
spark git commit: [SPARK-8771] [TRIVIAL] Add a version to the deprecated annotation for the actorSystem
Repository: spark Updated Branches: refs/heads/master 646366b5d -> d14338eaf [SPARK-8771] [TRIVIAL] Add a version to the deprecated annotation for the actorSystem Author: Holden Karau Closes #7172 from holdenk/SPARK-8771-actor-system-deprecation-tag-uses-deprecated-deprecation-tag and squashes the following commits: 7f1455b [Holden Karau] Add .0s to the versions for the derpecated anotations in SparkEnv.scala ca13c9d [Holden Karau] Add a version to the deprecated annotation for the actorSystem in SparkEnv Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d14338ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d14338ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d14338ea Branch: refs/heads/master Commit: d14338eafc5d633f766bd52ba610fd7c4fe90581 Parents: 646366b Author: Holden Karau Authored: Wed Jul 1 23:04:05 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:04:05 2015 -0700 -- core/src/main/scala/org/apache/spark/SparkEnv.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d14338ea/core/src/main/scala/org/apache/spark/SparkEnv.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 1b133fb..d18fc59 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -76,7 +76,7 @@ class SparkEnv ( val conf: SparkConf) extends Logging { // TODO Remove actorSystem - @deprecated("Actor system is no longer supported as of 1.4") + @deprecated("Actor system is no longer supported as of 1.4.0", "1.4.0") val actorSystem: ActorSystem = rpcEnv.asInstanceOf[AkkaRpcEnv].actorSystem private[spark] var isStopped = false @@ -173,7 +173,7 @@ object SparkEnv extends Logging { /** * Returns the ThreadLocal SparkEnv. */ - @deprecated("Use SparkEnv.get instead", "1.2") + @deprecated("Use SparkEnv.get instead", "1.2.0") def getThreadLocal: SparkEnv = { env } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
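For reference, a minimal sketch of Scala's two-argument `@deprecated` form outside of Spark: the second argument records the version in which the member was deprecated, and recent compilers surface it in the warning text.

```scala
object DeprecationDemo {
  // The second argument is the "since" version; without it, callers cannot tell
  // from the warning when the API was deprecated.
  @deprecated("Use newApi instead", "1.4.0")
  def oldApi(): Int = newApi()

  def newApi(): Int = 42
}
```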
spark git commit: [SPARK-8688] [YARN] Bug fix: disable the FS cache to obtain a fresh HDFS connection.
Repository: spark Updated Branches: refs/heads/master 792fcd802 -> 646366b5d [SPARK-8688] [YARN] Bug fix: disable the cache fs to gain the HDFS connection. If `fs.hdfs.impl.disable.cache` was `false`(default), `FileSystem` will use the cached `DFSClient` which use old token. [AMDelegationTokenRenewer](https://github.com/apache/spark/blob/master/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala#L196) ```scala val credentials = UserGroupInformation.getCurrentUser.getCredentials credentials.writeTokenStorageFile(tempTokenPath, discachedConfiguration) ``` Although the `credentials` had the new Token, but it still use the cached client and old token. So It's better to set the `fs.hdfs.impl.disable.cache` as `true` to avoid token expired. [Jira](https://issues.apache.org/jira/browse/SPARK-8688) Author: huangzhaowei Closes #7069 from SaintBacchus/SPARK-8688 and squashes the following commits: f94cd0b [huangzhaowei] modify function parameter 8fb9eb9 [huangzhaowei] explicit the comment 0cd55c9 [huangzhaowei] Rename function name to be an accurate one cf776a1 [huangzhaowei] [SPARK-8688][YARN]Bug fix: disable the cache fs to gain the HDFS connection. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/646366b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/646366b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/646366b5 Branch: refs/heads/master Commit: 646366b5d2f12e42f8e7287672ba29a8c918a17d Parents: 792fcd8 Author: huangzhaowei Authored: Wed Jul 1 23:01:44 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 23:01:44 2015 -0700 -- .../org/apache/spark/deploy/SparkHadoopUtil.scala | 13 + .../spark/deploy/yarn/AMDelegationTokenRenewer.scala | 10 ++ .../deploy/yarn/ExecutorDelegationTokenUpdater.scala | 5 - 3 files changed, 23 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/646366b5/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 7fa75ac..6d14590 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -334,6 +334,19 @@ class SparkHadoopUtil extends Logging { * Stop the thread that does the delegation token updates. */ private[spark] def stopExecutorDelegationTokenRenewer() {} + + /** + * Return a fresh Hadoop configuration, bypassing the HDFS cache mechanism. + * This is to prevent the DFSClient from using an old cached token to connect to the NameNode. 
+ */ + private[spark] def getConfBypassingFSCache( + hadoopConf: Configuration, + scheme: String): Configuration = { +val newConf = new Configuration(hadoopConf) +val confKey = s"fs.${scheme}.impl.disable.cache" +newConf.setBoolean(confKey, true) +newConf + } } object SparkHadoopUtil { http://git-wip-us.apache.org/repos/asf/spark/blob/646366b5/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala index 77af46c..56e4741 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala @@ -65,6 +65,8 @@ private[yarn] class AMDelegationTokenRenewer( sparkConf.getInt("spark.yarn.credentials.file.retention.days", 5) private val numFilesToKeep = sparkConf.getInt("spark.yarn.credentials.file.retention.count", 5) + private val freshHadoopConf = +hadoopUtil.getConfBypassingFSCache(hadoopConf, new Path(credentialsFile).toUri.getScheme) /** * Schedule a login from the keytab and principal set using the --principal and --keytab @@ -123,7 +125,7 @@ private[yarn] class AMDelegationTokenRenewer( private def cleanupOldFiles(): Unit = { import scala.concurrent.duration._ try { - val remoteFs = FileSystem.get(hadoopConf) + val remoteFs = FileSystem.get(freshHadoopConf) val credentialsPath = new Path(credentialsFile) val thresholdTime = System.currentTimeMillis() - (daysToKeepFiles days).toMillis hadoopUtil.listFilesSorted( @@ -169,13 +171,13 @@ private[yarn] class AMDelegationTokenRenewer( // Get a copy of the credentials override def run(): Void = { val nns = YarnSparkHado
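A condensed, standalone sketch of the approach (the object name, path, and host below are placeholders, and a reachable HDFS is assumed): copy the Hadoop configuration, disable the FileSystem cache for the credentials file's scheme, and fetch a fresh client with it.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FreshFileSystemDemo {
  // Copy the conf and disable the FileSystem cache for the given scheme, so that
  // FileSystem.get returns a fresh client instead of a cached one holding an old token.
  def confBypassingFsCache(hadoopConf: Configuration, scheme: String): Configuration = {
    val newConf = new Configuration(hadoopConf)   // never mutate the shared conf
    newConf.setBoolean(s"fs.$scheme.impl.disable.cache", true)
    newConf
  }

  def main(args: Array[String]): Unit = {
    val credentialsPath = new Path("hdfs://namenode:8020/user/spark/credentials") // placeholder
    val freshConf = confBypassingFsCache(new Configuration(), credentialsPath.toUri.getScheme)
    val fs = FileSystem.get(credentialsPath.toUri, freshConf)  // fresh, uncached client
    println(fs.getUri)
  }
}
```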
spark git commit: [SPARK-8754] [YARN] YarnClientSchedulerBackend doesn't stop gracefully in failure conditions
Repository: spark Updated Branches: refs/heads/master b285ac5ba -> 792fcd802 [SPARK-8754] [YARN] YarnClientSchedulerBackend doesn't stop gracefully in failure conditions In YarnClientSchedulerBackend.stop(), added a check for monitorThread. Author: Devaraj K Closes #7153 from devaraj-kavali/master and squashes the following commits: 66be9ad [Devaraj K] https://issues.apache.org/jira/browse/SPARK-8754 YarnClientSchedulerBackend doesn't stop gracefully in failure conditions Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/792fcd80 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/792fcd80 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/792fcd80 Branch: refs/heads/master Commit: 792fcd802c99a0aef2b67d54f0e6e58710e65956 Parents: b285ac5 Author: Devaraj K Authored: Wed Jul 1 22:59:04 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 22:59:04 2015 -0700 -- .../spark/scheduler/cluster/YarnClientSchedulerBackend.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/792fcd80/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index 1c8d7ec..dd8c4fd 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -148,7 +148,9 @@ private[spark] class YarnClientSchedulerBackend( */ override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") -monitorThread.interrupt() +if (monitorThread != null) { + monitorThread.interrupt() +} super.stop() client.stop() logInfo("Stopped") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8754] [YARN] YarnClientSchedulerBackend doesn't stop gracefully in failure conditions
Repository: spark Updated Branches: refs/heads/branch-1.4 17def3957 -> 5b468cf0c [SPARK-8754] [YARN] YarnClientSchedulerBackend doesn't stop gracefully in failure conditions In YarnClientSchedulerBackend.stop(), added a check for monitorThread. Author: Devaraj K Closes #7153 from devaraj-kavali/master and squashes the following commits: 66be9ad [Devaraj K] https://issues.apache.org/jira/browse/SPARK-8754 YarnClientSchedulerBackend doesn't stop gracefully in failure conditions (cherry picked from commit 792fcd802c99a0aef2b67d54f0e6e58710e65956) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b468cf0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b468cf0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b468cf0 Branch: refs/heads/branch-1.4 Commit: 5b468cf0c21071d212b0cba7a0cede7eeb5d273b Parents: 17def39 Author: Devaraj K Authored: Wed Jul 1 22:59:04 2015 -0700 Committer: Andrew Or Committed: Wed Jul 1 22:59:13 2015 -0700 -- .../spark/scheduler/cluster/YarnClientSchedulerBackend.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b468cf0/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index 99c0532..e7b0af6 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -147,7 +147,9 @@ private[spark] class YarnClientSchedulerBackend( */ override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") -monitorThread.interrupt() +if (monitorThread != null) { + monitorThread.interrupt() +} super.stop() client.stop() logInfo("Stopped") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
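A tiny sketch of the failure mode the guard addresses, with illustrative names rather than the real backend: if `start()` fails before the monitor thread is created, `stop()` is still invoked during teardown and previously dereferenced a null field.

```scala
// Illustrative only; the real code lives in YarnClientSchedulerBackend.
class MonitoredBackend {
  private var monitorThread: Thread = _   // assigned only if start() gets far enough

  def start(): Unit = {
    // ... submit the application; if that throws, monitorThread stays null ...
    monitorThread = new Thread(new Runnable {
      override def run(): Unit = Thread.sleep(Long.MaxValue)
    })
    monitorThread.setDaemon(true)
    monitorThread.start()
  }

  def stop(): Unit = {
    if (monitorThread != null) {   // the added guard: stop() may run after a failed start()
      monitorThread.interrupt()
    }
    // ... the rest of the shutdown must still happen ...
  }
}
```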
spark git commit: [SPARK-8227] [SQL] Add function unhex
Repository: spark Updated Branches: refs/heads/master 4e4f74b5e -> b285ac5ba [SPARK-8227] [SQL] Add function unhex cc chenghao-intel adrian-wang Author: zhichao.li Closes #7113 from zhichao-li/unhex and squashes the following commits: 379356e [zhichao.li] remove exception checking a4ae6dc [zhichao.li] add udf_unhex to whitelist fe5c14a [zhichao.li] add todigit 607d7a3 [zhichao.li] use checkInputTypes bffd37f [zhichao.li] change to use Hex in apache common package cde73f5 [zhichao.li] update to use AutoCastInputTypes 11945c7 [zhichao.li] style c852d46 [zhichao.li] Add function unhex Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b285ac5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b285ac5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b285ac5b Branch: refs/heads/master Commit: b285ac5ba85fe0b32b00726ad7d3a2efb602e885 Parents: 4e4f74b Author: zhichao.li Authored: Wed Jul 1 22:19:51 2015 -0700 Committer: Davies Liu Committed: Wed Jul 1 22:19:51 2015 -0700 -- .../catalyst/analysis/FunctionRegistry.scala| 1 + .../spark/sql/catalyst/expressions/math.scala | 52 .../expressions/MathFunctionsSuite.scala| 6 +++ .../scala/org/apache/spark/sql/functions.scala | 18 +++ .../apache/spark/sql/MathExpressionsSuite.scala | 10 .../hive/execution/HiveCompatibilitySuite.scala | 1 + 6 files changed, 88 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b285ac5b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index d53eaed..6f04298 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -157,6 +157,7 @@ object FunctionRegistry { expression[Substring]("substr"), expression[Substring]("substring"), expression[Upper]("ucase"), +expression[UnHex]("unhex"), expression[Upper]("upper") ) http://git-wip-us.apache.org/repos/asf/spark/blob/b285ac5b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala index b51318d..8633eb0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala @@ -351,6 +351,58 @@ case class Pow(left: Expression, right: Expression) } } +/** + * Performs the inverse operation of HEX. + * Resulting characters are returned as a byte array. 
+ */ +case class UnHex(child: Expression) extends UnaryExpression with Serializable { + + override def dataType: DataType = BinaryType + + override def checkInputDataTypes(): TypeCheckResult = { +if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) { + TypeCheckResult.TypeCheckSuccess +} else { + TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}") +} + } + + override def eval(input: InternalRow): Any = { +val num = child.eval(input) +if (num == null) { + null +} else { + unhex(num.asInstanceOf[UTF8String].getBytes) +} + } + + private val unhexDigits = { +val array = Array.fill[Byte](128)(-1) +(0 to 9).foreach(i => array('0' + i) = i.toByte) +(0 to 5).foreach(i => array('A' + i) = (i + 10).toByte) +(0 to 5).foreach(i => array('a' + i) = (i + 10).toByte) +array + } + + private def unhex(inputBytes: Array[Byte]): Array[Byte] = { +var bytes = inputBytes +if ((bytes.length & 0x01) != 0) { + bytes = '0'.toByte +: bytes +} +val out = new Array[Byte](bytes.length >> 1) +// two characters form the hex value. +var i = 0 +while (i < bytes.length) { +val first = unhexDigits(bytes(i)) +val second = unhexDigits(bytes(i + 1)) +if (first == -1 || second == -1) { return null} +out(i / 2) = (((first << 4) | second) & 0xFF).toByte +i += 2 +} +out + } +} + case class Hypot(left: Expression, right: Expression) extends BinaryMathExpression(math.hypot, "HYPOT") http://git-wip-us.apache.org/repos/asf/spark/blob/b285ac5b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunc
spark git commit: [SPARK-8660] [MLLIB] removed > symbols from comments in LogisticRegressionSuite.scala for ease of copypaste
Repository: spark Updated Branches: refs/heads/master 9fd13d561 -> 4e4f74b5e [SPARK-8660] [MLLIB] removed > symbols from comments in LogisticRegressionSuite.scala for ease of copypaste '>' symbols removed from comments in LogisticRegressionSuite.scala, for ease of copypaste also single-lined the multiline commands (is this desirable, or does it violate style?) Author: Rosstin Closes #7167 from Rosstin/SPARK-8660-2 and squashes the following commits: f4b9bc8 [Rosstin] SPARK-8660 restored character limit on multiline comments in LogisticRegressionSuite.scala fe6b112 [Rosstin] SPARK-8660 > symbols removed from LogisticRegressionSuite.scala for easy of copypaste 39ddd50 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8661 5a05dee [Rosstin] SPARK-8661 for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments to make it easier to copy-paste the R code. bb9a4b1 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8660 242aedd [Rosstin] SPARK-8660, changed comment style from JavaDoc style to normal multiline comment in order to make copypaste into R easier, in file classification/LogisticRegressionSuite.scala 2cd2985 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639 21ac1e5 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639 6c18058 [Rosstin] fixed minor typos in docs/README.md and docs/api.md Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e4f74b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e4f74b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e4f74b5 Branch: refs/heads/master Commit: 4e4f74b5e1267d1ada4a8f57b86aee0d9c17d90a Parents: 9fd13d5 Author: Rosstin Authored: Wed Jul 1 21:42:06 2015 -0700 Committer: Xiangrui Meng Committed: Wed Jul 1 21:42:06 2015 -0700 -- .../LogisticRegressionSuite.scala | 117 ++- 1 file changed, 63 insertions(+), 54 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e4f74b5/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index bc6eeac..ba8fbee 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { /* Using the following R code to load the data and train the model using glmnet package. - > library("glmnet") - > data <- read.csv("path", header=FALSE) - > label = factor(data$V1) - > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0)) - > weights + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0)) + weights + 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) 2.8366423 @@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { /* Using the following R code to load the data and train the model using glmnet package. 
- > library("glmnet") - > data <- read.csv("path", header=FALSE) - > label = factor(data$V1) - > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - > weights = + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE)) - > weights + weights + 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . @@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { /* Using the following R code to load the data and train the model using glmnet package. - > library("glmnet") - > data <- read.csv("path", header=FALSE) - > label = factor(data$V1) - > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) -
spark git commit: [SPARK-8770][SQL] Create BinaryOperator abstract class.
Repository: spark Updated Branches: refs/heads/master 3a342dedc -> 9fd13d561 [SPARK-8770][SQL] Create BinaryOperator abstract class. Our current BinaryExpression abstract class is not for generic binary expressions, i.e. it requires left/right children to have the same type. However, due to its name, contributors build new binary expressions that don't have that assumption (e.g. Sha) and still extend BinaryExpression. This patch creates a new BinaryOperator abstract class, and update the analyzer o only apply type casting rule there. This patch also adds the notion of "prettyName" to expressions, which defines the user-facing name for the expression. Author: Reynold Xin Closes #7174 from rxin/binary-opterator and squashes the following commits: f31900d [Reynold Xin] [SPARK-8770][SQL] Create BinaryOperator abstract class. fceb216 [Reynold Xin] Merge branch 'master' of github.com:apache/spark into binary-opterator d8518cf [Reynold Xin] Updated Python tests. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9fd13d56 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9fd13d56 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9fd13d56 Branch: refs/heads/master Commit: 9fd13d5613b6d16a78d97d4798f085b56107d343 Parents: 3a342de Author: Reynold Xin Authored: Wed Jul 1 21:14:13 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 21:14:13 2015 -0700 -- python/pyspark/sql/dataframe.py | 10 +- python/pyspark/sql/functions.py | 4 +- python/pyspark/sql/group.py | 24 +-- .../catalyst/analysis/HiveTypeCoercion.scala| 17 +- .../expressions/ExpectsInputTypes.scala | 59 +++ .../sql/catalyst/expressions/Expression.scala | 161 +-- .../sql/catalyst/expressions/ScalaUDF.scala | 2 +- .../sql/catalyst/expressions/aggregates.scala | 9 +- .../sql/catalyst/expressions/arithmetic.scala | 14 +- .../expressions/complexTypeCreator.scala| 4 +- .../catalyst/expressions/nullFunctions.scala| 2 - .../sql/catalyst/expressions/predicates.scala | 6 +- .../spark/sql/catalyst/expressions/sets.scala | 2 - .../catalyst/expressions/stringOperations.scala | 26 +-- .../sql/catalyst/trees/TreeNodeSuite.scala | 6 +- 15 files changed, 191 insertions(+), 155 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9fd13d56/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 273a40d..1e9c657 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -802,11 +802,11 @@ class DataFrame(object): Each element should be a column name (string) or an expression (:class:`Column`). >>> df.groupBy().avg().collect() -[Row(AVG(age)=3.5)] +[Row(avg(age)=3.5)] >>> df.groupBy('name').agg({'age': 'mean'}).collect() -[Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)] +[Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] >>> df.groupBy(df.name).avg().collect() -[Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)] +[Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] >>> df.groupBy(['name', df.age]).count().collect() [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)] """ @@ -864,10 +864,10 @@ class DataFrame(object): (shorthand for ``df.groupBy.agg()``). 
>>> df.agg({"age": "max"}).collect() -[Row(MAX(age)=5)] +[Row(max(age)=5)] >>> from pyspark.sql import functions as F >>> df.agg(F.min(df.age)).collect() -[Row(MIN(age)=2)] +[Row(min(age)=2)] """ return self.groupBy().agg(*exprs) http://git-wip-us.apache.org/repos/asf/spark/blob/9fd13d56/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4e2be88..f9a15d4 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -266,7 +266,7 @@ def coalesce(*cols): >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() +-+ -|Coalesce(a,b)| +|coalesce(a,b)| +-+ | null| |1| @@ -275,7 +275,7 @@ def coalesce(*cols): >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() +++---+ -| a| b|Coalesce(a,0.0)| +| a| b|coalesce(a,0.0)| +++---+ |null|null|0.0| | 1|null|1.0
spark git commit: [SPARK-8766] support non-ascii character in column names
Repository: spark Updated Branches: refs/heads/branch-1.4 228aabe24 -> 17def3957 [SPARK-8766] support non-ascii character in column names Use UTF-8 to encode the name of column in Python 2, or it may failed to encode with default encoding ('ascii'). This PR also fix a bug when there is Java exception without error message. Author: Davies Liu Closes #7165 from davies/non_ascii and squashes the following commits: 02cb61a [Davies Liu] fix tests 3b09d31 [Davies Liu] add encoding in header 867754a [Davies Liu] support non-ascii character in column names (cherry picked from commit f958f27e2056f9e380373c2807d8bb5977ecf269) Signed-off-by: Davies Liu Conflicts: python/pyspark/sql/utils.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17def395 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17def395 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17def395 Branch: refs/heads/branch-1.4 Commit: 17def395798dfc3af962d34b9a0260fa8880fe7d Parents: 228aabe Author: Davies Liu Authored: Wed Jul 1 16:43:18 2015 -0700 Committer: Davies Liu Committed: Wed Jul 1 17:18:04 2015 -0700 -- python/pyspark/sql/dataframe.py | 3 +-- python/pyspark/sql/tests.py | 9 + python/pyspark/sql/types.py | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 2d8c595..e9dd00e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -476,13 +476,12 @@ class DataFrame(object): return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @property -@ignore_unicode_prefix @since(1.3) def columns(self): """Returns all column names as a list. >>> df.columns -[u'age', u'name'] +['age', 'name'] """ return [f.name for f in self.schema.fields] http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index f902776..27c2ad1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with @@ -583,6 +584,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(IndexError, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) +def test_column_name_with_non_ascii(self): +df = self.sqlCtx.createDataFrame([(1,)], ["æ°é"]) +self.assertEqual(StructType([StructField("æ°é", LongType(), True)]), df.schema) +self.assertEqual("DataFrame[æ°é: bigint]", str(df)) +self.assertEqual([("æ°é", 'bigint')], df.dtypes) +self.assertEqual(1, df.select("æ°é").first()[0]) +self.assertEqual(1, df.select(df["æ°é"]).first()[0]) + def test_access_nested_types(self): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index b6ec613..e4cb006 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -323,6 +323,8 @@ class StructField(DataType): False """ assert isinstance(dataType, DataType), "dataType should be DataType" +if not isinstance(name, str): +name = name.encode('utf-8') self.name = name self.dataType = dataType self.nullable = nullable - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-8770][SQL] Create BinaryOperator abstract class."
Repository: spark Updated Branches: refs/heads/master 272778999 -> 3a342dedc Revert "[SPARK-8770][SQL] Create BinaryOperator abstract class." This reverts commit 272778999823ed79af92280350c5869a87a21f29. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3a342ded Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3a342ded Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3a342ded Branch: refs/heads/master Commit: 3a342dedc04799948bf6da69843bd1a91202ffe5 Parents: 2727789 Author: Reynold Xin Authored: Wed Jul 1 16:59:39 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 16:59:39 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 17 +- .../expressions/ExpectsInputTypes.scala | 59 --- .../sql/catalyst/expressions/Expression.scala | 161 ++- .../sql/catalyst/expressions/ScalaUDF.scala | 2 +- .../sql/catalyst/expressions/aggregates.scala | 6 + .../sql/catalyst/expressions/arithmetic.scala | 14 +- .../expressions/complexTypeCreator.scala| 4 +- .../catalyst/expressions/nullFunctions.scala| 2 + .../sql/catalyst/expressions/predicates.scala | 6 +- .../spark/sql/catalyst/expressions/sets.scala | 2 + .../catalyst/expressions/stringOperations.scala | 26 ++- .../sql/catalyst/trees/TreeNodeSuite.scala | 6 +- 12 files changed, 135 insertions(+), 170 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3a342ded/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 8420c54..2ab5cb6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -150,7 +150,6 @@ object HiveTypeCoercion { * Converts string "NaN"s that are in binary operators with a NaN-able types (Float / Double) to * the appropriate numeric equivalent. */ - // TODO: remove this rule and make Cast handle Nan. object ConvertNaNs extends Rule[LogicalPlan] { private val StringNaN = Literal("NaN") @@ -160,19 +159,19 @@ object HiveTypeCoercion { case e if !e.childrenResolved => e /* Double Conversions */ -case b @ BinaryOperator(StringNaN, right @ DoubleType()) => +case b @ BinaryExpression(StringNaN, right @ DoubleType()) => b.makeCopy(Array(Literal(Double.NaN), right)) -case b @ BinaryOperator(left @ DoubleType(), StringNaN) => +case b @ BinaryExpression(left @ DoubleType(), StringNaN) => b.makeCopy(Array(left, Literal(Double.NaN))) /* Float Conversions */ -case b @ BinaryOperator(StringNaN, right @ FloatType()) => +case b @ BinaryExpression(StringNaN, right @ FloatType()) => b.makeCopy(Array(Literal(Float.NaN), right)) -case b @ BinaryOperator(left @ FloatType(), StringNaN) => +case b @ BinaryExpression(left @ FloatType(), StringNaN) => b.makeCopy(Array(left, Literal(Float.NaN))) /* Use float NaN by default to avoid unnecessary type widening */ -case b @ BinaryOperator(left @ StringNaN, StringNaN) => +case b @ BinaryExpression(left @ StringNaN, StringNaN) => b.makeCopy(Array(left, Literal(Float.NaN))) } } @@ -246,12 +245,12 @@ object HiveTypeCoercion { Union(newLeft, newRight) - // Also widen types for BinaryOperator. + // Also widen types for BinaryExpressions. case q: LogicalPlan => q transformExpressions { // Skip nodes who's children have not been resolved yet. 
case e if !e.childrenResolved => e -case b @ BinaryOperator(left, right) if left.dataType != right.dataType => +case b @ BinaryExpression(left, right) if left.dataType != right.dataType => findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { widestType => val newLeft = if (left.dataType == widestType) left else Cast(left, widestType) val newRight = if (right.dataType == widestType) right else Cast(right, widestType) @@ -479,7 +478,7 @@ object HiveTypeCoercion { // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles -case b @ BinaryOperator(left, right) if left.dataType != right.dataType => +case b @ BinaryExpression(left, right) if left.d
spark git commit: [SPARK-8770][SQL] Create BinaryOperator abstract class.
Repository: spark Updated Branches: refs/heads/master f958f27e2 -> 272778999 [SPARK-8770][SQL] Create BinaryOperator abstract class. Our current BinaryExpression abstract class is not for generic binary expressions, i.e. it requires left/right children to have the same type. However, due to its name, contributors build new binary expressions that don't have that assumption (e.g. Sha) and still extend BinaryExpression. This patch creates a new BinaryOperator abstract class, and update the analyzer o only apply type casting rule there. This patch also adds the notion of "prettyName" to expressions, which defines the user-facing name for the expression. Author: Reynold Xin Closes #7170 from rxin/binaryoperator and squashes the following commits: 51264a5 [Reynold Xin] [SPARK-8770][SQL] Create BinaryOperator abstract class. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27277899 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27277899 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27277899 Branch: refs/heads/master Commit: 272778999823ed79af92280350c5869a87a21f29 Parents: f958f27 Author: Reynold Xin Authored: Wed Jul 1 16:56:48 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 16:56:48 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 17 +- .../expressions/ExpectsInputTypes.scala | 59 +++ .../sql/catalyst/expressions/Expression.scala | 161 +-- .../sql/catalyst/expressions/ScalaUDF.scala | 2 +- .../sql/catalyst/expressions/aggregates.scala | 6 - .../sql/catalyst/expressions/arithmetic.scala | 14 +- .../expressions/complexTypeCreator.scala| 4 +- .../catalyst/expressions/nullFunctions.scala| 2 - .../sql/catalyst/expressions/predicates.scala | 6 +- .../spark/sql/catalyst/expressions/sets.scala | 2 - .../catalyst/expressions/stringOperations.scala | 26 +-- .../sql/catalyst/trees/TreeNodeSuite.scala | 6 +- 12 files changed, 170 insertions(+), 135 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27277899/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 2ab5cb6..8420c54 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -150,6 +150,7 @@ object HiveTypeCoercion { * Converts string "NaN"s that are in binary operators with a NaN-able types (Float / Double) to * the appropriate numeric equivalent. */ + // TODO: remove this rule and make Cast handle Nan. 
object ConvertNaNs extends Rule[LogicalPlan] { private val StringNaN = Literal("NaN") @@ -159,19 +160,19 @@ object HiveTypeCoercion { case e if !e.childrenResolved => e /* Double Conversions */ -case b @ BinaryExpression(StringNaN, right @ DoubleType()) => +case b @ BinaryOperator(StringNaN, right @ DoubleType()) => b.makeCopy(Array(Literal(Double.NaN), right)) -case b @ BinaryExpression(left @ DoubleType(), StringNaN) => +case b @ BinaryOperator(left @ DoubleType(), StringNaN) => b.makeCopy(Array(left, Literal(Double.NaN))) /* Float Conversions */ -case b @ BinaryExpression(StringNaN, right @ FloatType()) => +case b @ BinaryOperator(StringNaN, right @ FloatType()) => b.makeCopy(Array(Literal(Float.NaN), right)) -case b @ BinaryExpression(left @ FloatType(), StringNaN) => +case b @ BinaryOperator(left @ FloatType(), StringNaN) => b.makeCopy(Array(left, Literal(Float.NaN))) /* Use float NaN by default to avoid unnecessary type widening */ -case b @ BinaryExpression(left @ StringNaN, StringNaN) => +case b @ BinaryOperator(left @ StringNaN, StringNaN) => b.makeCopy(Array(left, Literal(Float.NaN))) } } @@ -245,12 +246,12 @@ object HiveTypeCoercion { Union(newLeft, newRight) - // Also widen types for BinaryExpressions. + // Also widen types for BinaryOperator. case q: LogicalPlan => q transformExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e -case b @ BinaryExpression(left, right) if left.dataType != right.dataType => +case b @ BinaryOperator(left, right) if left.dataType != right.dataType => findTightestComm
spark git commit: [SPARK-8766] support non-ascii character in column names
Repository: spark Updated Branches: refs/heads/master 1ce642890 -> f958f27e2 [SPARK-8766] support non-ascii character in column names Use UTF-8 to encode the name of column in Python 2, or it may failed to encode with default encoding ('ascii'). This PR also fix a bug when there is Java exception without error message. Author: Davies Liu Closes #7165 from davies/non_ascii and squashes the following commits: 02cb61a [Davies Liu] fix tests 3b09d31 [Davies Liu] add encoding in header 867754a [Davies Liu] support non-ascii character in column names Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f958f27e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f958f27e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f958f27e Branch: refs/heads/master Commit: f958f27e2056f9e380373c2807d8bb5977ecf269 Parents: 1ce6428 Author: Davies Liu Authored: Wed Jul 1 16:43:18 2015 -0700 Committer: Davies Liu Committed: Wed Jul 1 16:43:18 2015 -0700 -- python/pyspark/sql/dataframe.py | 3 +-- python/pyspark/sql/tests.py | 9 + python/pyspark/sql/types.py | 2 ++ python/pyspark/sql/utils.py | 6 +++--- 4 files changed, 15 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 4b9efa0..273a40d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -484,13 +484,12 @@ class DataFrame(object): return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @property -@ignore_unicode_prefix @since(1.3) def columns(self): """Returns all column names as a list. >>> df.columns -[u'age', u'name'] +['age', 'name'] """ return [f.name for f in self.schema.fields] http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5af2ce0..78c 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with @@ -628,6 +629,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(IndexError, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) +def test_column_name_with_non_ascii(self): +df = self.sqlCtx.createDataFrame([(1,)], ["数量"]) +self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema) +self.assertEqual("DataFrame[数量: bigint]", str(df)) +self.assertEqual([("数量", 'bigint')], df.dtypes) +self.assertEqual(1, df.select("数量").first()[0]) +self.assertEqual(1, df.select(df["数量"]).first()[0]) + def test_access_nested_types(self): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index ae9344e..160df40 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -324,6 +324,8 @@ class StructField(DataType): False """ assert isinstance(dataType, DataType), "dataType should be DataType" +if not isinstance(name, str): +name = name.encode('utf-8') self.name = name self.dataType = dataType self.nullable = nullable http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/utils.py -- diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 8096802..cc5b2c0 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -29,9 +29,9 @@ def capture_sql_exception(f): try: return f(*a, **kw) except py4j.protocol.Py4JJavaError as e: -cls, msg = e.java_exception.toString().split(': ', 1) -if cls == 'org.apache.spark.sql.AnalysisException': -raise AnalysisException(msg) +s = e.java_exception.toString() +if s.startswith('org.apache.spark.sql.AnalysisException: '): +raise AnalysisException(s.split(': ', 1)[1])
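The whole fix boils down to the two lines added to StructField.__init__ above. A minimal standalone sketch of that normalization (the helper name below is not in the patch; Spark applies the check inline): on Python 2 a unicode column name is stored as its UTF-8 byte string, so later formatting of schemas and error messages never falls back to the default 'ascii' codec.

```python
# -*- coding: utf-8 -*-
# Sketch only: mirrors the two lines the patch adds to StructField.__init__.
# On Python 3 the name is already str and passes through unchanged.

def normalize_column_name(name):
    """Hypothetical helper: store unicode names as UTF-8 bytes on Python 2."""
    if not isinstance(name, str):      # unicode on Python 2, never true on Python 3
        name = name.encode('utf-8')
    return name

print(repr(normalize_column_name(u"数量")))  # UTF-8 bytes on Python 2, unchanged on Python 3
print(repr(normalize_column_name("age")))    # unchanged everywhere
```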
spark git commit: [SPARK-3444] [CORE] Restore INFO level after log4j test.
Repository: spark Updated Branches: refs/heads/master 3083e1764 -> 1ce642890 [SPARK-3444] [CORE] Restore INFO level after log4j test. Otherwise other tests don't log anything useful... Author: Marcelo Vanzin Closes #7140 from vanzin/SPARK-3444 and squashes the following commits: de14836 [Marcelo Vanzin] Better fix. 6cff13a [Marcelo Vanzin] [SPARK-3444] [core] Restore INFO level after log4j test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ce64289 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ce64289 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ce64289 Branch: refs/heads/master Commit: 1ce6428907b4ddcf52dbf0c86196d82ab7392442 Parents: 3083e17 Author: Marcelo Vanzin Authored: Wed Jul 1 20:40:47 2015 +0100 Committer: Sean Owen Committed: Wed Jul 1 20:40:47 2015 +0100 -- .../scala/org/apache/spark/util/UtilsSuite.scala| 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1ce64289/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index baa4c66..251a797 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -486,11 +486,17 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { // Test for using the util function to change our log levels. test("log4j log level change") { -Utils.setLogLevel(org.apache.log4j.Level.ALL) -assert(log.isInfoEnabled()) -Utils.setLogLevel(org.apache.log4j.Level.ERROR) -assert(!log.isInfoEnabled()) -assert(log.isErrorEnabled()) +val current = org.apache.log4j.Logger.getRootLogger().getLevel() +try { + Utils.setLogLevel(org.apache.log4j.Level.ALL) + assert(log.isInfoEnabled()) + Utils.setLogLevel(org.apache.log4j.Level.ERROR) + assert(!log.isInfoEnabled()) + assert(log.isErrorEnabled()) +} finally { + // Best effort at undoing changes this test made. + Utils.setLogLevel(current) +} } test("deleteRecursively") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
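The fix is a plain save-and-restore of global logger state around the assertions. The same pattern, written against Python's standard logging module purely as an illustration (log4j and the Spark Utils helper are not involved here):

```python
import logging

log = logging.getLogger("spark.test")

def test_log_level_change():
    # Remember the level the rest of the suite expects before mutating global state.
    current = logging.getLogger().getEffectiveLevel()
    try:
        logging.getLogger().setLevel(logging.DEBUG)
        assert log.isEnabledFor(logging.INFO)
        logging.getLogger().setLevel(logging.ERROR)
        assert not log.isEnabledFor(logging.INFO)
        assert log.isEnabledFor(logging.ERROR)
    finally:
        # Best effort at undoing changes this test made, mirroring the Scala fix.
        logging.getLogger().setLevel(current)

test_log_level_change()
```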
spark git commit: [SPARK-3444] [CORE] Restore INFO level after log4j test.
Repository: spark Updated Branches: refs/heads/branch-1.4 bcfb37bf6 -> 228aabe24 [SPARK-3444] [CORE] Restore INFO level after log4j test. Otherwise other tests don't log anything useful... Author: Marcelo Vanzin Closes #7140 from vanzin/SPARK-3444 and squashes the following commits: de14836 [Marcelo Vanzin] Better fix. 6cff13a [Marcelo Vanzin] [SPARK-3444] [core] Restore INFO level after log4j test. (cherry picked from commit 1ce6428907b4ddcf52dbf0c86196d82ab7392442) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/228aabe2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/228aabe2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/228aabe2 Branch: refs/heads/branch-1.4 Commit: 228aabe244d03886cd1c106c73df51054f882e73 Parents: bcfb37b Author: Marcelo Vanzin Authored: Wed Jul 1 20:40:47 2015 +0100 Committer: Sean Owen Committed: Wed Jul 1 20:41:00 2015 +0100 -- .../scala/org/apache/spark/util/UtilsSuite.scala| 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/228aabe2/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index a61ea39..6f17041 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -486,11 +486,17 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { // Test for using the util function to change our log levels. test("log4j log level change") { -Utils.setLogLevel(org.apache.log4j.Level.ALL) -assert(log.isInfoEnabled()) -Utils.setLogLevel(org.apache.log4j.Level.ERROR) -assert(!log.isInfoEnabled()) -assert(log.isErrorEnabled()) +val current = org.apache.log4j.Logger.getRootLogger().getLevel() +try { + Utils.setLogLevel(org.apache.log4j.Level.ALL) + assert(log.isInfoEnabled()) + Utils.setLogLevel(org.apache.log4j.Level.ERROR) + assert(!log.isInfoEnabled()) + assert(log.isErrorEnabled()) +} finally { + // Best effort at undoing changes this test made. + Utils.setLogLevel(current) +} } test("deleteRecursively") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [QUICKFIX] [SQL] fix copy of generated row
Repository: spark Updated Branches: refs/heads/master 9f7db3486 -> 3083e1764 [QUICKFIX] [SQL] fix copy of generated row copy() of generated Row doesn't check nullability of columns Author: Davies Liu Closes #7163 from davies/fix_copy and squashes the following commits: 661a206 [Davies Liu] fix copy of generated row Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3083e176 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3083e176 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3083e176 Branch: refs/heads/master Commit: 3083e17645e4b707646fe48e406e02c156a0f37b Parents: 9f7db34 Author: Davies Liu Authored: Wed Jul 1 12:39:57 2015 -0700 Committer: Davies Liu Committed: Wed Jul 1 12:39:57 2015 -0700 -- .../sql/catalyst/expressions/codegen/GenerateProjection.scala | 2 +- .../spark/sql/catalyst/expressions/ExpressionEvalHelper.scala | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3083e176/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala index 5be4717..3c7ee9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala @@ -148,7 +148,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] { }.mkString("\n") val copyColumns = expressions.zipWithIndex.map { case (e, i) => -s"""arr[$i] = c$i;""" +s"""if (!nullBits[$i]) arr[$i] = c$i;""" }.mkString("\n ") val code = s""" http://git-wip-us.apache.org/repos/asf/spark/blob/3083e176/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 7d95ef7..3171caf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -136,6 +136,9 @@ trait ExpressionEvalHelper { val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input") } +if (actual.copy() != expectedRow) { + fail(s"Copy of generated Row is wrong: actual: ${actual.copy()}, expected: $expectedRow") +} } protected def checkEvaluationWithOptimization( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-7820] [BUILD] Fix Java8-tests suite compile and test error under sbt
Repository: spark Updated Branches: refs/heads/branch-1.4 2f85d8ee0 -> bcfb37bf6 [SPARK-7820] [BUILD] Fix Java8-tests suite compile and test error under sbt Author: jerryshao Closes #7120 from jerryshao/SPARK-7820 and squashes the following commits: 6902439 [jerryshao] fix Java8-tests suite compile error under sbt (cherry picked from commit 9f7db3486fcb403cae8da9dfce8978373c3f47b7) Signed-off-by: Josh Rosen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcfb37bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcfb37bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcfb37bf Branch: refs/heads/branch-1.4 Commit: bcfb37bf6987f15be7d1c48ef97f1630e3481af8 Parents: 2f85d8e Author: jerryshao Authored: Wed Jul 1 12:33:24 2015 -0700 Committer: Josh Rosen Committed: Wed Jul 1 12:34:38 2015 -0700 -- extras/java8-tests/pom.xml | 8 project/SparkBuild.scala | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bcfb37bf/extras/java8-tests/pom.xml -- diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index 06f9b64..e249b56 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -41,6 +41,13 @@ org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark spark-streaming_${scala.binary.version} ${project.version} @@ -49,6 +56,7 @@ spark-streaming_${scala.binary.version} ${project.version} test-jar + test junit http://git-wip-us.apache.org/repos/asf/spark/blob/bcfb37bf/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bd0cf33..aa59d74 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -157,7 +157,7 @@ object SparkBuild extends PomBuild { // Note ordering of these settings matter. /* Enable shared settings on all projects */ (allProjects ++ optionallyEnabledProjects ++ assemblyProjects ++ Seq(spark, tools)) -.foreach(enable(sharedSettings ++ ExludedDependencies.settings)) +.foreach(enable(sharedSettings ++ ExcludedDependencies.settings)) /* Enable tests settings for all projects except examples, assembly and tools */ (allProjects ++ optionallyEnabledProjects).foreach(enable(TestSettings.settings)) @@ -246,7 +246,7 @@ object Flume { This excludes library dependencies in sbt, which are specified in maven but are not needed by sbt build. */ -object ExludedDependencies { +object ExcludedDependencies { lazy val settings = Seq( libraryDependencies ~= { libs => libs.filterNot(_.name == "groovy-all") } ) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-7820] [BUILD] Fix Java8-tests suite compile and test error under sbt
Repository: spark Updated Branches: refs/heads/master 75b9fe4c5 -> 9f7db3486 [SPARK-7820] [BUILD] Fix Java8-tests suite compile and test error under sbt Author: jerryshao Closes #7120 from jerryshao/SPARK-7820 and squashes the following commits: 6902439 [jerryshao] fix Java8-tests suite compile error under sbt Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f7db348 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f7db348 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f7db348 Branch: refs/heads/master Commit: 9f7db3486fcb403cae8da9dfce8978373c3f47b7 Parents: 75b9fe4 Author: jerryshao Authored: Wed Jul 1 12:33:24 2015 -0700 Committer: Josh Rosen Committed: Wed Jul 1 12:33:24 2015 -0700 -- extras/java8-tests/pom.xml | 8 project/SparkBuild.scala | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f7db348/extras/java8-tests/pom.xml -- diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index f138251..3636a90 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -41,6 +41,13 @@ org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark spark-streaming_${scala.binary.version} ${project.version} @@ -49,6 +56,7 @@ spark-streaming_${scala.binary.version} ${project.version} test-jar + test junit http://git-wip-us.apache.org/repos/asf/spark/blob/9f7db348/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 4ef4dc8..5f389bc 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -161,7 +161,7 @@ object SparkBuild extends PomBuild { // Note ordering of these settings matter. /* Enable shared settings on all projects */ (allProjects ++ optionallyEnabledProjects ++ assemblyProjects ++ Seq(spark, tools)) -.foreach(enable(sharedSettings ++ ExludedDependencies.settings ++ Revolver.settings)) +.foreach(enable(sharedSettings ++ ExcludedDependencies.settings ++ Revolver.settings)) /* Enable tests settings for all projects except examples, assembly and tools */ (allProjects ++ optionallyEnabledProjects).foreach(enable(TestSettings.settings)) @@ -246,7 +246,7 @@ object Flume { This excludes library dependencies in sbt, which are specified in maven but are not needed by sbt build. */ -object ExludedDependencies { +object ExcludedDependencies { lazy val settings = Seq( libraryDependencies ~= { libs => libs.filterNot(_.name == "groovy-all") } ) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8378] [STREAMING] Add the Python API for Flume
Repository: spark Updated Branches: refs/heads/master b8faa3287 -> 75b9fe4c5 [SPARK-8378] [STREAMING] Add the Python API for Flume Author: zsxwing Closes #6830 from zsxwing/flume-python and squashes the following commits: 78dfdac [zsxwing] Fix the compile error in the test code f1bf3c0 [zsxwing] Address TD's comments 0449723 [zsxwing] Add sbt goal streaming-flume-assembly/assembly e93736b [zsxwing] Fix the test case for determine_modules_to_test 9d5821e [zsxwing] Fix pyspark_core dependencies f9ee681 [zsxwing] Merge branch 'master' into flume-python 7a55837 [zsxwing] Add streaming_flume_assembly to run-tests.py b96b0de [zsxwing] Merge branch 'master' into flume-python ce85e83 [zsxwing] Fix incompatible issues for Python 3 01cbb3d [zsxwing] Add import sys 152364c [zsxwing] Fix the issue that StringIO doesn't work in Python 3 14ba0ff [zsxwing] Add flume-assembly for sbt building b8d5551 [zsxwing] Merge branch 'master' into flume-python 4762c34 [zsxwing] Fix the doc 0336579 [zsxwing] Refactor Flume unit tests and also add tests for Python API 9f33873 [zsxwing] Add the Python API for Flume Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/75b9fe4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/75b9fe4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/75b9fe4c Branch: refs/heads/master Commit: 75b9fe4c5ff6f206c6fc9100563d625b39f142ba Parents: b8faa32 Author: zsxwing Authored: Wed Jul 1 11:59:24 2015 -0700 Committer: Tathagata Das Committed: Wed Jul 1 11:59:24 2015 -0700 -- dev/run-tests.py| 7 +- dev/sparktestsupport/modules.py | 15 +- docs/streaming-flume-integration.md | 18 ++ docs/streaming-programming-guide.md | 2 +- .../main/python/streaming/flume_wordcount.py| 55 + external/flume-assembly/pom.xml | 135 .../spark/streaming/flume/FlumeTestUtils.scala | 116 ++ .../spark/streaming/flume/FlumeUtils.scala | 76 ++- .../streaming/flume/PollingFlumeTestUtils.scala | 209 +++ .../flume/FlumePollingStreamSuite.scala | 173 +++ .../streaming/flume/FlumeStreamSuite.scala | 106 ++ pom.xml | 1 + project/SparkBuild.scala| 6 +- python/pyspark/streaming/flume.py | 147 + python/pyspark/streaming/tests.py | 179 +++- 15 files changed, 1009 insertions(+), 236 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/75b9fe4c/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 4596e07..1f0d218 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -96,8 +96,8 @@ def determine_modules_to_test(changed_modules): ['examples', 'graphx'] >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) >>> x # doctest: +NORMALIZE_WHITESPACE -['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', 'pyspark-ml', \ - 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming', 'sparkr', 'sql'] +['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ + 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. 
No module depends on root, so if it appears then it will be @@ -293,7 +293,8 @@ def build_spark_sbt(hadoop_version): build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags sbt_goals = ["package", "assembly/assembly", - "streaming-kafka-assembly/assembly"] + "streaming-kafka-assembly/assembly", + "streaming-flume-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: ", http://git-wip-us.apache.org/repos/asf/spark/blob/75b9fe4c/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index efe3a89..993583e 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -203,7 +203,7 @@ streaming_flume_sink = Module( streaming_flume = Module( -name="streaming_flume", +name="streaming-flume", dependencies=[streaming], source_file_regexes=[ "external/flume", @@ -214,6 +214,15 @@ streaming_flume = Module( ) +streaming_flume_assembly = Module( +name="streaming-flume-assembly", +dependencies=[streaming_flume, streaming_flume_sink], +source_file_rege
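Condensed from the flume_wordcount.py example this patch adds under examples/src/main/python/streaming; the host and port are placeholders for a Flume agent pushing events to an Avro sink.

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="PythonStreamingFlumeWordCount")
ssc = StreamingContext(sc, 1)

# Each element of the stream is a (headers, body) pair; the body carries the event text.
events = FlumeUtils.createStream(ssc, "localhost", 9999)
counts = (events.map(lambda event: event[1])
                .flatMap(lambda line: line.split(" "))
                .map(lambda word: (word, 1))
                .reduceByKey(lambda a, b: a + b))
counts.pprint()

ssc.start()
ssc.awaitTermination()
```

The spark-streaming-flume-assembly jar built by this patch needs to be on the classpath (for example via bin/spark-submit --jars) so the Python API can reach its Scala helpers.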
spark git commit: [SPARK-8765] [MLLIB] [PYTHON] removed flaky python PIC test
Repository: spark Updated Branches: refs/heads/master 201291335 -> b8faa3287 [SPARK-8765] [MLLIB] [PYTHON] removed flaky python PIC test See failure: [https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/36133/console] CC yanboliang mengxr Author: Joseph K. Bradley Closes #7164 from jkbradley/pic-python-test and squashes the following commits: 156d55b [Joseph K. Bradley] removed flaky python PIC test Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8faa328 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8faa328 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8faa328 Branch: refs/heads/master Commit: b8faa32875aa560cdce340266d898902a920418d Parents: 2012913 Author: Joseph K. Bradley Authored: Wed Jul 1 11:57:52 2015 -0700 Committer: Xiangrui Meng Committed: Wed Jul 1 11:57:52 2015 -0700 -- python/pyspark/mllib/clustering.py | 4 1 file changed, 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8faa328/python/pyspark/mllib/clustering.py -- diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index e3c8a24..a3eab63 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -288,16 +288,12 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> model = PowerIterationClustering.train(rdd, 2, 100) >>> model.k 2 ->>> sorted(model.assignments().collect()) -[Assignment(id=0, cluster=1), Assignment(id=1, cluster=0), ... >>> import os, tempfile >>> path = tempfile.mkdtemp() >>> model.save(sc, path) >>> sameModel = PowerIterationClusteringModel.load(sc, path) >>> sameModel.k 2 ->>> sorted(sameModel.assignments().collect()) -[Assignment(id=0, cluster=1), Assignment(id=1, cluster=0), ... >>> from shutil import rmtree >>> try: ... rmtree(path) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8308] [MLLIB] add missing save load for python example
Repository: spark Updated Branches: refs/heads/master 184de91d1 -> 201291335 [SPARK-8308] [MLLIB] add missing save load for python example jira: https://issues.apache.org/jira/browse/SPARK-8308 1. add some missing save/load in python examples. , LogisticRegression, LinearRegression and NaiveBayes 2. tune down iterations for MatrixFactorization, since current number will trigger StackOverflow for default java configuration (>1M) Author: Yuhao Yang Closes #6760 from hhbyyh/docUpdate and squashes the following commits: 9bd3383 [Yuhao Yang] update scala example 8a44692 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into docUpdate 077cbb8 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into docUpdate 3e948dc [Yuhao Yang] add missing save load for python example Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20129133 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20129133 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20129133 Branch: refs/heads/master Commit: 2012913355993e6516e4c81dbc92e579977131da Parents: 184de91 Author: Yuhao Yang Authored: Wed Jul 1 11:17:56 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Jul 1 11:17:56 2015 -0700 -- docs/mllib-collaborative-filtering.md | 6 +++--- docs/mllib-linear-methods.md | 12 ++-- docs/mllib-naive-bayes.md | 6 +- 3 files changed, 18 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20129133/docs/mllib-collaborative-filtering.md -- diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index dfdf621..eedc234 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -77,7 +77,7 @@ val ratings = data.map(_.split(',') match { case Array(user, item, rate) => // Build the recommendation model using ALS val rank = 10 -val numIterations = 20 +val numIterations = 10 val model = ALS.train(ratings, rank, numIterations, 0.01) // Evaluate the model on rating data @@ -149,7 +149,7 @@ public class CollaborativeFiltering { // Build the recommendation model using ALS int rank = 10; -int numIterations = 20; +int numIterations = 10; MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01); // Evaluate the model on rating data @@ -210,7 +210,7 @@ ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l # Build the recommendation model using Alternating Least Squares rank = 10 -numIterations = 20 +numIterations = 10 model = ALS.train(ratings, rank, numIterations) # Evaluate the model on training data http://git-wip-us.apache.org/repos/asf/spark/blob/20129133/docs/mllib-linear-methods.md -- diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 2a2a7c1..3927d65 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -499,7 +499,7 @@ Note that the Python API does not yet support multiclass classification and mode will in the future. 
{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS +from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel from pyspark.mllib.regression import LabeledPoint from numpy import array @@ -518,6 +518,10 @@ model = LogisticRegressionWithLBFGS.train(parsedData) labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) print("Training Error = " + str(trainErr)) + +# Save and load model +model.save(sc, "myModelPath") +sameModel = LogisticRegressionModel.load(sc, "myModelPath") {% endhighlight %} @@ -668,7 +672,7 @@ values. We compute the mean squared error at the end to evaluate Note that the Python API does not yet support model save/load but will in the future. {% highlight python %} -from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD +from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel from numpy import array # Load and parse the data @@ -686,6 +690,10 @@ model = LinearRegressionWithSGD.train(parsedData) valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count() print("Mean Squared Error = " + str(MSE)) + +# Save and load model +model.save(sc, "myModelPath") +sameModel = LinearRegressionModel.load(sc, "myModelPath") {% endhighlight %}
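The NaiveBayes hunk mentioned in the commit message is cut off in this archive; it follows the same pattern as the snippets above. A sketch of how it reads, assuming a live SparkContext named sc as the surrounding doc examples do (the path and toy data are placeholders):

```python
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint

# Toy training set: label followed by a feature vector.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
])
model = NaiveBayes.train(data)

# Save and load model, mirroring the lines added for the other algorithms.
model.save(sc, "myModelPath")
sameModel = NaiveBayesModel.load(sc, "myModelPath")
```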
spark git commit: [SPARK-6263] [MLLIB] Python MLlib API missing items: Utils
Repository: spark Updated Branches: refs/heads/master 31b4a3d7f -> 184de91d1 [SPARK-6263] [MLLIB] Python MLlib API missing items: Utils Implement missing API in pyspark. MLUtils * appendBias * loadVectors `kFold` is also missing however I am not sure `ClassTag` can be passed or restored through python. Author: lewuathe Closes #5707 from Lewuathe/SPARK-6263 and squashes the following commits: 16863ea [lewuathe] Merge master 3fc27e7 [lewuathe] Merge branch 'master' into SPARK-6263 6084e9c [lewuathe] Resolv conflict d2aa2a0 [lewuathe] Resolv conflict 9c329d8 [lewuathe] Fix efficiency 3a12a2d [lewuathe] Merge branch 'master' into SPARK-6263 1d4714b [lewuathe] Fix style b29e2bc [lewuathe] Remove scipy dependencies e32eb40 [lewuathe] Merge branch 'master' into SPARK-6263 25d3c9d [lewuathe] Remove unnecessary imports 7ec04db [lewuathe] Resolv conflict 1502d13 [lewuathe] Resolv conflict d6bd416 [lewuathe] Check existence of scipy.sparse 5d555b1 [lewuathe] Construct scipy.sparse matrix c345a44 [lewuathe] Merge branch 'master' into SPARK-6263 b8b5ef7 [lewuathe] Fix unnecessary sort method d254be7 [lewuathe] Merge branch 'master' into SPARK-6263 62a9c7e [lewuathe] Fix appendBias return type 454c73d [lewuathe] Merge branch 'master' into SPARK-6263 a353354 [lewuathe] Remove unnecessary appendBias implementation 44295c2 [lewuathe] Merge branch 'master' into SPARK-6263 64f72ad [lewuathe] Merge branch 'master' into SPARK-6263 c728046 [lewuathe] Fix style 2980569 [lewuathe] [SPARK-6263] Python MLlib API missing items: Utils Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/184de91d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/184de91d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/184de91d Branch: refs/heads/master Commit: 184de91d15a4bfc5c014e8cf86211874bba4593f Parents: 31b4a3d Author: lewuathe Authored: Wed Jul 1 11:14:07 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Jul 1 11:14:07 2015 -0700 -- .../spark/mllib/api/python/PythonMLLibAPI.scala | 9 python/pyspark/mllib/tests.py | 43 python/pyspark/mllib/util.py| 22 ++ 3 files changed, 74 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/184de91d/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a66a404..458fab4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -75,6 +75,15 @@ private[python] class PythonMLLibAPI extends Serializable { minPartitions: Int): JavaRDD[LabeledPoint] = MLUtils.loadLabeledPoints(jsc.sc, path, minPartitions) + /** + * Loads and serializes vectors saved with `RDD#saveAsTextFile`. 
+ * @param jsc Java SparkContext + * @param path file or directory path in any Hadoop-supported file system URI + * @return serialized vectors in a RDD + */ + def loadVectors(jsc: JavaSparkContext, path: String): RDD[Vector] = +MLUtils.loadVectors(jsc.sc, path) + private def trainRegressionModel( learner: GeneralizedLinearAlgorithm[_ <: GeneralizedLinearModel], data: JavaRDD[LabeledPoint], http://git-wip-us.apache.org/repos/asf/spark/blob/184de91d/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index f0091d6..49ce125 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -54,6 +54,7 @@ from pyspark.mllib.feature import Word2Vec from pyspark.mllib.feature import IDF from pyspark.mllib.feature import StandardScaler, ElementwiseProduct from pyspark.mllib.util import LinearDataGenerator +from pyspark.mllib.util import MLUtils from pyspark.serializers import PickleSerializer from pyspark.streaming import StreamingContext from pyspark.sql import SQLContext @@ -1290,6 +1291,48 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase): self.assertTrue(mean_absolute_errors[1] - mean_absolute_errors[-1] > 2) +class MLUtilsTests(MLlibTestCase): +def test_append_bias(self): +data = [2.0, 2.0, 2.0] +ret = MLUtils.appendBias(data) +self.assertEqual(ret[3], 1.0) +self.assertEqual(type(ret), DenseVector) + +def test_append_bias_with_vector(self): +data = Vectors.dense([2.0, 2.0, 2.0]) +ret = MLUtils.appendBias(data) +self.assertEqual(ret[3], 1.0) +self.asser
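A usage sketch for the two new Python wrappers, modeled on the tests this patch adds (the temporary path and data are placeholders; sc is a live SparkContext):

```python
import shutil
import tempfile

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils

# appendBias: append a constant 1.0 as the last element of the feature vector.
print(MLUtils.appendBias(Vectors.dense([2.0, 2.0, 2.0])))   # [2.0,2.0,2.0,1.0]

# loadVectors: read vectors back from text written with RDD.saveAsTextFile.
base = tempfile.mkdtemp()
try:
    sc.parallelize([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]).saveAsTextFile(base + "/vectors")
    loaded = MLUtils.loadVectors(sc, base + "/vectors")
    print(sorted(v[0] for v in loaded.collect()))            # [1.0, 4.0]
finally:
    shutil.rmtree(base)
```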
spark git commit: [SPARK-8621] [SQL] support empty string as column name
Repository: spark Updated Branches: refs/heads/branch-1.4 214550b83 -> 2f85d8ee0 [SPARK-8621] [SQL] support empty string as column name improve the empty check in `parseAttributeName` so that we can allow empty string as column name. Close https://github.com/apache/spark/pull/7117 Author: Wenchen Fan Closes #7149 from cloud-fan/8621 and squashes the following commits: efa9e3e [Wenchen Fan] support empty string (cherry picked from commit 31b4a3d7f2be9053a041e5ae67418562a93d80d8) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f85d8ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f85d8ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f85d8ee Branch: refs/heads/branch-1.4 Commit: 2f85d8ee0c8106c99b9994bf0de2b86233c3f4b4 Parents: 214550b Author: Wenchen Fan Authored: Wed Jul 1 10:31:35 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 10:31:49 2015 -0700 -- .../apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala | 4 ++-- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 7 +++ 2 files changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2f85d8ee/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index dba6965..5567189 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -163,7 +163,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { if (tmp.nonEmpty) throw e inBacktick = true } else if (char == '.') { - if (tmp.isEmpty) throw e + if (name(i - 1) == '.' || i == name.length - 1) throw e nameParts += tmp.mkString tmp.clear() } else { @@ -172,7 +172,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { } i += 1 } -if (tmp.isEmpty || inBacktick) throw e +if (inBacktick) throw e nameParts += tmp.mkString nameParts.toSeq } http://git-wip-us.apache.org/repos/asf/spark/blob/2f85d8ee/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 8e71ef9..399ab2b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -638,4 +638,11 @@ class DataFrameSuite extends QueryTest { val res11 = TestSQLContext.range(-1).select("id") assert(res11.count == 0) } + + test("SPARK-8621: support empty string column name") { +val df = Seq(Tuple1(1)).toDF("").as("t") +// We should allow empty string as column name +df.col("") +df.col("t.``") + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8621] [SQL] support empty string as column name
Repository: spark Updated Branches: refs/heads/master 4137f769b -> 31b4a3d7f [SPARK-8621] [SQL] support empty string as column name improve the empty check in `parseAttributeName` so that we can allow empty string as column name. Close https://github.com/apache/spark/pull/7117 Author: Wenchen Fan Closes #7149 from cloud-fan/8621 and squashes the following commits: efa9e3e [Wenchen Fan] support empty string Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31b4a3d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31b4a3d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31b4a3d7 Branch: refs/heads/master Commit: 31b4a3d7f2be9053a041e5ae67418562a93d80d8 Parents: 4137f76 Author: Wenchen Fan Authored: Wed Jul 1 10:31:35 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 10:31:35 2015 -0700 -- .../apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala | 4 ++-- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 7 +++ 2 files changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/31b4a3d7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index b009a20..e911b90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -161,7 +161,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { if (tmp.nonEmpty) throw e inBacktick = true } else if (char == '.') { - if (tmp.isEmpty) throw e + if (name(i - 1) == '.' || i == name.length - 1) throw e nameParts += tmp.mkString tmp.clear() } else { @@ -170,7 +170,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { } i += 1 } -if (tmp.isEmpty || inBacktick) throw e +if (inBacktick) throw e nameParts += tmp.mkString nameParts.toSeq } http://git-wip-us.apache.org/repos/asf/spark/blob/31b4a3d7/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 50d324c..afb1cf5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -730,4 +730,11 @@ class DataFrameSuite extends QueryTest { val res11 = ctx.range(-1).select("id") assert(res11.count == 0) } + + test("SPARK-8621: support empty string column name") { +val df = Seq(Tuple1(1)).toDF("").as("t") +// We should allow empty string as column name +df.col("") +df.col("t.``") + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
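For readers who do not want to trace the Scala, here is a simplified Python port of parseAttributeName with the relaxed check, showing why "" and t.`` now parse while a trailing or doubled dot still fails. The i == 0 guard below only avoids Python's negative indexing and is not part of the patch.

```python
def parse_attribute_name(name):
    """Simplified port of LogicalPlan.parseAttributeName after SPARK-8621.

    Splits a column reference on dots, with backticks escaping embedded dots.
    The old code rejected any empty part, so "" could never be a column name;
    the new check only rejects a dot at the end or two dots in a row.
    """
    parts, tmp, in_backtick = [], [], False
    for i, ch in enumerate(name):
        if in_backtick:
            if ch == '`':
                in_backtick = False
                if i + 1 < len(name) and name[i + 1] != '.':
                    raise ValueError("syntax error in attribute name: " + name)
            else:
                tmp.append(ch)
        elif ch == '`':
            if tmp:
                raise ValueError("syntax error in attribute name: " + name)
            in_backtick = True
        elif ch == '.':
            if i == 0 or name[i - 1] == '.' or i == len(name) - 1:
                raise ValueError("syntax error in attribute name: " + name)
            parts.append(''.join(tmp))
            tmp = []
        else:
            tmp.append(ch)
    if in_backtick:
        raise ValueError("syntax error in attribute name: " + name)
    parts.append(''.join(tmp))
    return parts

print(parse_attribute_name(""))        # [''] -- the empty name is now legal
print(parse_attribute_name("t.``"))    # ['t', ''] -- a quoted empty name behind a qualifier
print(parse_attribute_name("a.b.c"))   # ['a', 'b', 'c']
```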
spark git commit: [SPARK-8752][SQL] Add ExpectsInputTypes trait for defining expected input types.
Repository: spark Updated Branches: refs/heads/master 69c5dee2f -> 4137f769b [SPARK-8752][SQL] Add ExpectsInputTypes trait for defining expected input types. This patch doesn't actually introduce any code that uses the new ExpectsInputTypes. It just adds the trait so others can use it. Also renamed the old expectsInputTypes function to just inputTypes. We should add implicit type casting also in the future. Author: Reynold Xin Closes #7151 from rxin/expects-input-types and squashes the following commits: 16cf07b [Reynold Xin] [SPARK-8752][SQL] Add ExpectsInputTypes trait for defining expected input types. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4137f769 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4137f769 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4137f769 Branch: refs/heads/master Commit: 4137f769b84300648ad933b0b3054d69a7316745 Parents: 69c5dee Author: Reynold Xin Authored: Wed Jul 1 10:30:54 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 10:30:54 2015 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 1 - .../catalyst/analysis/HiveTypeCoercion.scala| 8 +++--- .../sql/catalyst/expressions/Expression.scala | 29 +--- .../spark/sql/catalyst/expressions/math.scala | 6 ++-- .../spark/sql/catalyst/expressions/misc.scala | 8 +++--- .../sql/catalyst/expressions/predicates.scala | 6 ++-- .../catalyst/expressions/stringOperations.scala | 10 +++ 7 files changed, 44 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4137f769/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index a069b47..583338d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.types._ * Throws user facing errors when passed invalid queries that fail to analyze. */ trait CheckAnalysis { - self: Analyzer => /** * Override to provide additional checks for correct analysis. http://git-wip-us.apache.org/repos/asf/spark/blob/4137f769/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a9d396d..2ab5cb6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -45,7 +45,7 @@ object HiveTypeCoercion { IfCoercion :: Division :: PropagateTypes :: - AddCastForAutoCastInputTypes :: + ImplicitTypeCasts :: Nil // See https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types. @@ -705,13 +705,13 @@ object HiveTypeCoercion { * Casts types according to the expected input types for Expressions that have the trait * [[AutoCastInputTypes]]. */ - object AddCastForAutoCastInputTypes extends Rule[LogicalPlan] { + object ImplicitTypeCasts extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. 
case e if !e.childrenResolved => e - case e: AutoCastInputTypes if e.children.map(_.dataType) != e.expectedChildTypes => -val newC = (e.children, e.children.map(_.dataType), e.expectedChildTypes).zipped.map { + case e: AutoCastInputTypes if e.children.map(_.dataType) != e.inputTypes => +val newC = (e.children, e.children.map(_.dataType), e.inputTypes).zipped.map { case (child, actual, expected) => if (actual == expected) child else Cast(child, expected) } http://git-wip-us.apache.org/repos/asf/spark/blob/4137f769/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index b5063f3..e18a311 100644 --- a/sql/catalyst/src/main/scala/org/apache/spa
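A toy model of the rule this patch renames to ImplicitTypeCasts, using plain Python stand-ins for expressions and types (the class names and string type tags are illustrative, not Spark APIs): an expression declares the input types it expects, and the analyzer wraps any child whose actual type differs in a Cast.

```python
class Expr(object):
    def __init__(self, data_type, children=()):
        self.data_type = data_type
        self.children = list(children)

class Cast(Expr):
    def __init__(self, child, to_type):
        Expr.__init__(self, to_type, [child])

class Upper(Expr):
    input_types = ["string"]                    # the types this expression expects
    def __init__(self, child):
        Expr.__init__(self, "string", [child])

def implicit_type_casts(expr):
    """Insert casts so that children match the expression's declared input types."""
    expected = getattr(expr, "input_types", None)
    if expected is not None:
        expr.children = [
            child if child.data_type == want else Cast(child, want)
            for child, want in zip(expr.children, expected)
        ]
    return expr

col = Expr("int")                               # an int column fed to a string function
fixed = implicit_type_casts(Upper(col))
print(type(fixed.children[0]).__name__)         # Cast -- the int child was wrapped
```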
spark git commit: [SPARK-7714] [SPARKR] SparkR tests should use more specific expectations than expect_true
Repository: spark Updated Branches: refs/heads/master fdcad6ef4 -> 69c5dee2f [SPARK-7714] [SPARKR] SparkR tests should use more specific expectations than expect_true 1. Update the pattern 'expect_true(a == b)' to 'expect_equal(a, b)'. 2. Update the pattern 'expect_true(inherits(a, b))' to 'expect_is(a, b)'. 3. Update the pattern 'expect_true(identical(a, b))' to 'expect_identical(a, b)'. Author: Sun Rui Closes #7152 from sun-rui/SPARK-7714 and squashes the following commits: 8ad2440 [Sun Rui] Fix test case errors. 8fe9f0c [Sun Rui] Update the pattern 'expect_true(identical(a, b))' to 'expect_identical(a, b)'. f1b8005 [Sun Rui] Update the pattern 'expect_true(inherits(a, b))' to 'expect_is(a, b)'. f631e94 [Sun Rui] Update the pattern 'expect_true(a == b)' to 'expect_equal(a, b)'. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69c5dee2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69c5dee2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69c5dee2 Branch: refs/heads/master Commit: 69c5dee2f01b1ae35bd813d31d46429a32cb475d Parents: fdcad6e Author: Sun Rui Authored: Wed Jul 1 09:50:12 2015 -0700 Committer: Shivaram Venkataraman Committed: Wed Jul 1 09:50:12 2015 -0700 -- R/pkg/inst/tests/test_binaryFile.R | 2 +- R/pkg/inst/tests/test_binary_function.R | 4 +- R/pkg/inst/tests/test_includeJAR.R | 4 +- R/pkg/inst/tests/test_parallelize_collect.R | 2 +- R/pkg/inst/tests/test_rdd.R | 4 +- R/pkg/inst/tests/test_sparkSQL.R| 354 +++ R/pkg/inst/tests/test_take.R| 8 +- R/pkg/inst/tests/test_textFile.R| 6 +- R/pkg/inst/tests/test_utils.R | 4 +- 9 files changed, 194 insertions(+), 194 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69c5dee2/R/pkg/inst/tests/test_binaryFile.R -- diff --git a/R/pkg/inst/tests/test_binaryFile.R b/R/pkg/inst/tests/test_binaryFile.R index 4db7266..ccaea18 100644 --- a/R/pkg/inst/tests/test_binaryFile.R +++ b/R/pkg/inst/tests/test_binaryFile.R @@ -82,7 +82,7 @@ test_that("saveAsObjectFile()/objectFile() works with multiple paths", { saveAsObjectFile(rdd2, fileName2) rdd <- objectFile(sc, c(fileName1, fileName2)) - expect_true(count(rdd) == 2) + expect_equal(count(rdd), 2) unlink(fileName1, recursive = TRUE) unlink(fileName2, recursive = TRUE) http://git-wip-us.apache.org/repos/asf/spark/blob/69c5dee2/R/pkg/inst/tests/test_binary_function.R -- diff --git a/R/pkg/inst/tests/test_binary_function.R b/R/pkg/inst/tests/test_binary_function.R index a1e354e..3be8c65 100644 --- a/R/pkg/inst/tests/test_binary_function.R +++ b/R/pkg/inst/tests/test_binary_function.R @@ -38,13 +38,13 @@ test_that("union on two RDDs", { union.rdd <- unionRDD(rdd, text.rdd) actual <- collect(union.rdd) expect_equal(actual, c(as.list(nums), mockFile)) - expect_true(getSerializedMode(union.rdd) == "byte") + expect_equal(getSerializedMode(union.rdd), "byte") rdd<- map(text.rdd, function(x) {x}) union.rdd <- unionRDD(rdd, text.rdd) actual <- collect(union.rdd) expect_equal(actual, as.list(c(mockFile, mockFile))) - expect_true(getSerializedMode(union.rdd) == "byte") + expect_equal(getSerializedMode(union.rdd), "byte") unlink(fileName) }) http://git-wip-us.apache.org/repos/asf/spark/blob/69c5dee2/R/pkg/inst/tests/test_includeJAR.R -- diff --git a/R/pkg/inst/tests/test_includeJAR.R b/R/pkg/inst/tests/test_includeJAR.R index 8bc693b..844d86f 100644 --- a/R/pkg/inst/tests/test_includeJAR.R +++ b/R/pkg/inst/tests/test_includeJAR.R @@ -31,7 +31,7 @@ runScript <- function() { test_that("sparkJars tag in 
SparkContext", { testOutput <- runScript() helloTest <- testOutput[1] - expect_true(helloTest == "Hello, Dave") + expect_equal(helloTest, "Hello, Dave") basicFunction <- testOutput[2] - expect_true(basicFunction == 4L) + expect_equal(basicFunction, "4") }) http://git-wip-us.apache.org/repos/asf/spark/blob/69c5dee2/R/pkg/inst/tests/test_parallelize_collect.R -- diff --git a/R/pkg/inst/tests/test_parallelize_collect.R b/R/pkg/inst/tests/test_parallelize_collect.R index fff0286..2552127 100644 --- a/R/pkg/inst/tests/test_parallelize_collect.R +++ b/R/pkg/inst/tests/test_parallelize_collect.R @@ -57,7 +57,7 @@ test_that("parallelize() on simple vectors and lists returns an RDD", { strListRDD2) for (rdd in rdds) { -expect_true(inherits(rdd, "RDD")) +expect_is(rdd, "RDD") expec
spark git commit: [SPARK-8763] [PYSPARK] executing run-tests.py with Python 2.6 fails with absence of subprocess.check_output function
Repository: spark Updated Branches: refs/heads/master 97652416e -> fdcad6ef4 [SPARK-8763] [PYSPARK] executing run-tests.py with Python 2.6 fails with absence of subprocess.check_output function Running run-tests.py with Python 2.6 cause following error: ``` Running PySpark tests. Output is in python//Users/tomohiko/.jenkins/jobs/pyspark_test/workspace/python/unit-tests.log Will test against the following Python executables: ['python2.6', 'python3.4', 'pypy'] Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming'] Traceback (most recent call last): File "./python/run-tests.py", line 196, in main() File "./python/run-tests.py", line 159, in main python_implementation = subprocess.check_output( AttributeError: 'module' object has no attribute 'check_output' ... ``` The cause of this error is using subprocess.check_output function, which exists since Python 2.7. (ref. https://docs.python.org/2.7/library/subprocess.html#subprocess.check_output) Author: cocoatomo Closes #7161 from cocoatomo/issues/8763-test-fails-py26 and squashes the following commits: cf4f901 [cocoatomo] [SPARK-8763] backport process.check_output function from Python 2.7 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fdcad6ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fdcad6ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fdcad6ef Branch: refs/heads/master Commit: fdcad6ef48a9e790776c316124bd6478ab6bd5c8 Parents: 9765241 Author: cocoatomo Authored: Wed Jul 1 09:37:09 2015 -0700 Committer: Davies Liu Committed: Wed Jul 1 09:37:09 2015 -0700 -- python/run-tests.py | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fdcad6ef/python/run-tests.py -- diff --git a/python/run-tests.py b/python/run-tests.py index b773765..7638854 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -31,6 +31,23 @@ if sys.version < '3': import Queue else: import queue as Queue +if sys.version_info >= (2, 7): +subprocess_check_output = subprocess.check_output +else: +# SPARK-8763 +# backported from subprocess module in Python 2.7 +def subprocess_check_output(*popenargs, **kwargs): +if 'stdout' in kwargs: +raise ValueError('stdout argument not allowed, it will be overridden.') +process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) +output, unused_err = process.communicate() +retcode = process.poll() +if retcode: +cmd = kwargs.get("args") +if cmd is None: +cmd = popenargs[0] +raise subprocess.CalledProcessError(retcode, cmd, output=output) +return output # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module @@ -156,11 +173,11 @@ def main(): task_queue = Queue.Queue() for python_exec in python_execs: -python_implementation = subprocess.check_output( +python_implementation = subprocess_check_output( [python_exec, "-c", "import platform; print(platform.python_implementation())"], universal_newlines=True).strip() LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation) -LOGGER.debug("%s version is: %s", python_exec, subprocess.check_output( +LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output( [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip()) for module in modules_to_test: if python_implementation not in module.blacklisted_python_implementations: - To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
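The core of the fix is a feature test: use subprocess.check_output where it exists (Python 2.7+) and otherwise fall back to a small reimplementation. A condensed standalone version of the same idea, without the output= detail the full backport carries:

```python
import subprocess
import sys

if hasattr(subprocess, "check_output"):          # Python 2.7+ / 3.x
    check_output = subprocess.check_output
else:                                            # Python 2.6 fallback
    def check_output(*popenargs, **kwargs):
        if "stdout" in kwargs:
            raise ValueError("stdout argument not allowed, it will be overridden.")
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, _ = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args") or popenargs[0]
            raise subprocess.CalledProcessError(retcode, cmd)
        return output

print(check_output([sys.executable, "--version"],
                   stderr=subprocess.STDOUT, universal_newlines=True).strip())
```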
spark git commit: [SPARK-8750][SQL] Remove the closure in functions.callUdf.
Repository: spark Updated Branches: refs/heads/master 0eee06158 -> 97652416e [SPARK-8750][SQL] Remove the closure in functions.callUdf. Author: Reynold Xin Closes #7148 from rxin/calludf-closure and squashes the following commits: 00df372 [Reynold Xin] Fixed index out of bound exception. 4beba76 [Reynold Xin] [SPARK-8750][SQL] Remove the closure in functions.callUdf. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/97652416 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/97652416 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/97652416 Branch: refs/heads/master Commit: 97652416e22ae7d4c471178377a7dda61afb1f7a Parents: 0eee061 Author: Reynold Xin Authored: Wed Jul 1 01:08:20 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 01:08:20 2015 -0700 -- .../src/main/scala/org/apache/spark/sql/functions.scala | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/97652416/sql/core/src/main/scala/org/apache/spark/sql/functions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 5767668..4e8f3f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1829,7 +1829,15 @@ object functions { */ @deprecated("Use callUDF", "1.5.0") def callUdf(udfName: String, cols: Column*): Column = { - UnresolvedFunction(udfName, cols.map(_.expr)) +// Note: we avoid using closures here because on file systems that are case-insensitive, the +// compiled class file for the closure here will conflict with the one in callUDF (upper case). +val exprs = new Array[Expression](cols.size) +var i = 0 +while (i < cols.size) { + exprs(i) = cols(i).expr + i += 1 +} +UnresolvedFunction(udfName, exprs) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL] [MINOR] remove internalRowRDD in DataFrame
Repository: spark Updated Branches: refs/heads/master fc3a6fe67 -> 0eee06158 [SQL] [MINOR] remove internalRowRDD in DataFrame Developers have already familiar with `queryExecution.toRDD` as internal row RDD, and we should not add new concept. Author: Wenchen Fan Closes #7116 from cloud-fan/internal-rdd and squashes the following commits: 24756ca [Wenchen Fan] remove internalRowRDD Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0eee0615 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0eee0615 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0eee0615 Branch: refs/heads/master Commit: 0eee0615894cda8ae1b2c8e61b8bda0ff648a219 Parents: fc3a6fe Author: Wenchen Fan Authored: Wed Jul 1 01:02:33 2015 -0700 Committer: Michael Armbrust Committed: Wed Jul 1 01:02:33 2015 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 4 +--- .../org/apache/spark/sql/execution/stat/FrequentItems.scala | 2 +- .../org/apache/spark/sql/execution/stat/StatFunctions.scala | 2 +- .../src/main/scala/org/apache/spark/sql/sources/commands.scala | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0eee0615/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 8fe1f7e..caad2da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1469,14 +1469,12 @@ class DataFrame private[sql]( lazy val rdd: RDD[Row] = { // use a local variable to make sure the map closure doesn't capture the whole DataFrame val schema = this.schema -internalRowRdd.mapPartitions { rows => +queryExecution.toRdd.mapPartitions { rows => val converter = CatalystTypeConverters.createToScalaConverter(schema) rows.map(converter(_).asInstanceOf[Row]) } } - private[sql] def internalRowRdd = queryExecution.executedPlan.execute() - /** * Returns the content of the [[DataFrame]] as a [[JavaRDD]] of [[Row]]s. 
* @group rdd http://git-wip-us.apache.org/repos/asf/spark/blob/0eee0615/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala index 3ebbf96..4e2e2c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala @@ -90,7 +90,7 @@ private[sql] object FrequentItems extends Logging { (name, originalSchema.fields(index).dataType) } -val freqItems = df.select(cols.map(Column(_)) : _*).internalRowRdd.aggregate(countMaps)( +val freqItems = df.select(cols.map(Column(_)) : _*).queryExecution.toRdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { http://git-wip-us.apache.org/repos/asf/spark/blob/0eee0615/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index b624ef7..23ddfa9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -82,7 +82,7 @@ private[sql] object StatFunctions extends Logging { s"with dataType ${data.get.dataType} not supported.") } val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType))) -df.select(columns: _*).internalRowRdd.aggregate(new CovarianceCounter)( +df.select(columns: _*).queryExecution.toRdd.aggregate(new CovarianceCounter)( seqOp = (counter, row) => { counter.add(row.getDouble(0), row.getDouble(1)) }, http://git-wip-us.apache.org/repos/asf/spark/blob/0eee0615/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala index 42b51ca..7214eb0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/
spark git commit: [SPARK-8749][SQL] Remove HiveTypeCoercion trait.
Repository: spark Updated Branches: refs/heads/master 365c14055 -> fc3a6fe67 [SPARK-8749][SQL] Remove HiveTypeCoercion trait. Moved all the rules into the companion object. Author: Reynold Xin Closes #7147 from rxin/SPARK-8749 and squashes the following commits: c1c6dc0 [Reynold Xin] [SPARK-8749][SQL] Remove HiveTypeCoercion trait. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc3a6fe6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc3a6fe6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc3a6fe6 Branch: refs/heads/master Commit: fc3a6fe67f5aeda2443958c31f097daeba8549e5 Parents: 365c140 Author: Reynold Xin Authored: Wed Jul 1 00:08:16 2015 -0700 Committer: Reynold Xin Committed: Wed Jul 1 00:08:16 2015 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 4 +- .../catalyst/analysis/HiveTypeCoercion.scala| 59 +--- .../analysis/HiveTypeCoercionSuite.scala| 14 ++--- 3 files changed, 33 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc3a6fe6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 117c87a..15e84e6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -43,7 +43,7 @@ class Analyzer( registry: FunctionRegistry, conf: CatalystConf, maxIterations: Int = 100) - extends RuleExecutor[LogicalPlan] with HiveTypeCoercion with CheckAnalysis { + extends RuleExecutor[LogicalPlan] with CheckAnalysis { def resolver: Resolver = { if (conf.caseSensitiveAnalysis) { @@ -76,7 +76,7 @@ class Analyzer( ExtractWindowExpressions :: GlobalAggregates :: UnresolvedHavingClauseAttributes :: - typeCoercionRules ++ + HiveTypeCoercion.typeCoercionRules ++ extendedResolutionRules : _*) ) http://git-wip-us.apache.org/repos/asf/spark/blob/fc3a6fe6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index e525ad6..a9d396d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -22,7 +22,32 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.types._ + +/** + * A collection of [[Rule Rules]] that can be used to coerce differing types that + * participate in operations into compatible ones. Most of these rules are based on Hive semantics, + * but they do not introduce any dependencies on the hive codebase. For this reason they remain in + * Catalyst until we have a more standard set of coercions. 
+ */ object HiveTypeCoercion { + + val typeCoercionRules = +PropagateTypes :: + ConvertNaNs :: + InConversion :: + WidenTypes :: + PromoteStrings :: + DecimalPrecision :: + BooleanEquality :: + StringToIntegralCasts :: + FunctionArgumentConversion :: + CaseWhenCoercion :: + IfCoercion :: + Division :: + PropagateTypes :: + AddCastForAutoCastInputTypes :: + Nil + // See https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types. // The conversion for integral and floating point types have a linear widening hierarchy: private val numericPrecedence = @@ -79,7 +104,6 @@ object HiveTypeCoercion { }) } - /** * Find the tightest common type of a set of types by continuously applying * `findTightestCommonTypeOfTwo` on these types. @@ -90,34 +114,6 @@ object HiveTypeCoercion { case Some(d) => findTightestCommonTypeOfTwo(d, c) }) } -} - -/** - * A collection of [[Rule Rules]] that can be used to coerce differing types that - * participate in operations into compatible ones. Most of these rules are based on Hive semantics, - * but they do not introduce any dependencies on the hive codebase. For this reason they remain in - * Catalyst until we have a more standard set of coercions. - */ -trait HiveTypeCoercion { - - import HiveTypeCoercion._ - - val typeCoercio