spark git commit: [SPARK-6910] [SQL] Support for pushing predicates down to metastore for partition pruning
Repository: spark
Updated Branches: refs/heads/master b7bcbe25f -> 408b384de

[SPARK-6910] [SQL] Support for pushing predicates down to metastore for partition pruning

This PR supersedes my old one, #6921. Since my patch has changed quite a bit, I am opening a new PR to make it easier to review. The changes include:

* Implement a `toMetastoreFilter()` function in `HiveShim` that takes a `Seq[Expression]` and converts it into a filter string for the Hive metastore.
  * This function matches all the `AttributeReference` + `BinaryComparisonOp` + `Integral/StringType` patterns in the `Seq[Expression]` and folds them into a string.
* Change the `hiveQlPartitions` field in `MetastoreRelation` to a `getHiveQlPartitions()` function that takes a filter string parameter.
* Call `getHiveQlPartitions()` in `HiveTableScan` with a filter string.

There are, however, some cases in which predicate pushdown is disabled:

Case | Predicate pushdown
--- | ---
Hive integral and string types | Yes
Hive varchar type | No
Hive 0.13 and newer | Yes
Hive 0.12 and older | No
convertMetastoreParquet=false | Yes
convertMetastoreParquet=true | No

In the case of `convertMetastoreParquet=true`, predicates are not pushed down because the conversion happens in an `Analyzer` rule (`HiveMetastoreCatalog.ParquetConversions`). At that point, `HiveTableScan` hasn't run yet, so predicates are not available. But reading the source code, I think it is intentional to convert the entire Hive table, with all its partitions, into a `ParquetRelation`, because the `ParquetRelation` can then be cached and reused for any query against that table. Please correct me if I am wrong.

cc marmbrus

Author: Cheolsoo Park cheols...@netflix.com

Closes #7216 from piaozhexiu/SPARK-6910-2 and squashes the following commits:

aa1490f [Cheolsoo Park] Fix ordering of imports
c212c4d [Cheolsoo Park] Incorporate review comments
5e93f9d [Cheolsoo Park] Predicate pushdown into Hive metastore

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/408b384d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/408b384d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/408b384d

Branch: refs/heads/master
Commit: 408b384de96b9dbe94945753f7947fbe84272ae1
Parents: b7bcbe2
Author: Cheolsoo Park cheols...@netflix.com
Authored: Mon Jul 13 19:45:10 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Mon Jul 13 19:45:10 2015 -0700

--
 .../spark/sql/hive/HiveMetastoreCatalog.scala    | 58 +
 .../org/apache/spark/sql/hive/HiveShim.scala     |  1 +
 .../apache/spark/sql/hive/HiveStrategies.scala   |  4 +-
 .../spark/sql/hive/client/ClientInterface.scala  | 11 +++-
 .../spark/sql/hive/client/ClientWrapper.scala    | 22 ---
 .../apache/spark/sql/hive/client/HiveShim.scala  | 68 +++-
 .../sql/hive/execution/HiveTableScan.scala       |  7 +-
 .../spark/sql/hive/client/VersionsSuite.scala    |  8 +++
 .../spark/sql/hive/execution/PruningSuite.scala  |  2 +-
 9 files changed, 137 insertions(+), 44 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/408b384d/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 4b7a782..5bdf68c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -301,7 +301,9 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      val partitions = metastoreRelation.hiveQlPartitions.map { p =>
+      // We're converting the entire table into ParquetRelation, so predicates to Hive metastore
+      // are empty.
+      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
        val location = p.getLocation
        val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
          case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
@@ -644,32 +646,6 @@ private[hive] case class MetastoreRelation
     new Table(tTable)
   }

-  @transient val hiveQlPartitions: Seq[Partition] = table.getAllPartitions.map { p =>
-    val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
-    tPartition.setDbName(databaseName)
-    tPartition.setTableName(tableName)
-    tPartition.setValues(p.values)
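The heart of the change is the fold from Catalyst predicates into a metastore filter string. The following is a hedged, self-contained sketch of that idea, not the actual `HiveShim.toMetastoreFilter()` code: the tiny expression ADT stands in for Catalyst's `AttributeReference`, `BinaryComparison`, and literal classes, and unsupported predicate shapes are simply skipped so the scan can still prune them on the client side.

// Hypothetical mini-model of predicate -> metastore-filter folding.
// Catalyst's real expression tree is far richer; this only mirrors the
// AttributeReference + BinaryComparison + integral/string pattern that
// the patch pushes down.
object MetastoreFilterSketch {
  sealed trait Expr
  case class Attr(name: String) extends Expr                      // stand-in for AttributeReference
  case class IntLit(value: Long) extends Expr
  case class StrLit(value: String) extends Expr
  case class Cmp(op: String, left: Expr, right: Expr) extends Expr // stand-in for BinaryComparison

  // Unsupported predicates yield None rather than failing, so the
  // caller can fall back to client-side pruning for those.
  def toMetastoreFilter(predicates: Seq[Expr]): String =
    predicates.flatMap {
      case Cmp(op, Attr(a), IntLit(v)) => Some(s"$a $op $v")
      case Cmp(op, Attr(a), StrLit(v)) => Some(s"""$a $op "$v"""")
      case _                           => None
    }.mkString(" and ")

  def main(args: Array[String]): Unit = {
    val filter = toMetastoreFilter(Seq(
      Cmp(">", Attr("hour"), IntLit(10)),
      Cmp("=", Attr("ds"), StrLit("2015-07-13"))))
    println(filter) // hour > 10 and ds = "2015-07-13"
  }
}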
spark git commit: [SPARK-8954] [BUILD] Remove unneeded deb repository from Dockerfile to fix build error in docker.
Repository: spark
Updated Branches: refs/heads/master 79c35826e -> 5c41691fe

[SPARK-8954] [BUILD] Remove unneeded deb repository from Dockerfile to fix build error in docker.

[SPARK-8954] [Build]
1. Remove unneeded deb repository from Dockerfile to fix build error in docker.
2. Remove unneeded /var/lib/apt/lists/* after install to reduce the docker image size (by ~30MB).

Author: yongtang yongt...@users.noreply.github.com

Closes #7346 from yongtang/SPARK-8954 and squashes the following commits:

36024a1 [yongtang] [SPARK-8954] [Build] Remove unneeded /var/lib/apt/lists/* after install to reduce the docker image size (by ~30MB)
7084941 [yongtang] [SPARK-8954] [Build] Remove unneeded deb repository from Dockerfile to fix build error in docker.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c41691f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c41691f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c41691f

Branch: refs/heads/master
Commit: 5c41691fe4da0017ccc7e372e3f655dc1c815349
Parents: 79c3582
Author: yongtang yongt...@users.noreply.github.com
Authored: Mon Jul 13 12:01:23 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Mon Jul 13 12:01:23 2015 -0700

--
 docker/spark-test/base/Dockerfile | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5c41691f/docker/spark-test/base/Dockerfile
--
diff --git a/docker/spark-test/base/Dockerfile b/docker/spark-test/base/Dockerfile
index 5956d59..5dbdb8b 100644
--- a/docker/spark-test/base/Dockerfile
+++ b/docker/spark-test/base/Dockerfile
@@ -17,13 +17,13 @@ FROM ubuntu:precise

-RUN echo "deb http://archive.ubuntu.com/ubuntu precise main universe" >> /etc/apt/sources.list
-
 # Upgrade package index
-RUN apt-get update
-
 # install a few other useful packages plus Open Jdk 7
-RUN apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server
+# Remove unneeded /var/lib/apt/lists/* after install to reduce the
+# docker image size (by ~30MB)
+RUN apt-get update && \
+    apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server && \
+    rm -rf /var/lib/apt/lists/*

 ENV SCALA_VERSION 2.10.4
 ENV CDH_VERSION cdh4
spark git commit: [SPARK-8991] [ML] Update SharedParamsCodeGen's Generated Documentation
Repository: spark
Updated Branches: refs/heads/master 5c41691fe -> 714fc55f4

[SPARK-8991] [ML] Update SharedParamsCodeGen's Generated Documentation

Removed the "(private[ml])" prefix from the generated documentation.

Author: Vinod K C vinod...@huawei.com

Closes #7367 from vinodkc/fix_sharedparmascodegen and squashes the following commits:

4fa3c8f [Vinod K C] Adding auto generated code
7e19025 [Vinod K C] Removed private[ml]

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/714fc55f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/714fc55f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/714fc55f

Branch: refs/heads/master
Commit: 714fc55f4aadd5e7b7fb1e462910bfb6a82d9154
Parents: 5c41691
Author: Vinod K C vinod...@huawei.com
Authored: Mon Jul 13 12:03:39 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Mon Jul 13 12:03:39 2015 -0700

--
 .../ml/param/shared/SharedParamsCodeGen.scala |  3 +-
 .../spark/ml/param/shared/sharedParams.scala  | 37 ++--
 2 files changed, 19 insertions(+), 21 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/714fc55f/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 66b751a..f7ae1de 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -134,7 +134,7 @@ private[shared] object SharedParamsCodeGen {
     s"""
       |/**
-      | * (private[ml]) Trait for shared param $name$defaultValueDoc.
+      | * Trait for shared param $name$defaultValueDoc.
       | */
       |private[ml] trait Has$Name extends Params {
       |
@@ -173,7 +173,6 @@ private[shared] object SharedParamsCodeGen {
       |package org.apache.spark.ml.param.shared
       |
       |import org.apache.spark.ml.param._
-      |import org.apache.spark.util.Utils
       |
       |// DO NOT MODIFY THIS FILE! It was generated by SharedParamsCodeGen.
       |

http://git-wip-us.apache.org/repos/asf/spark/blob/714fc55f/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index f81bd76..65e48e4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -18,14 +18,13 @@
 package org.apache.spark.ml.param.shared

 import org.apache.spark.ml.param._
-import org.apache.spark.util.Utils

 // DO NOT MODIFY THIS FILE! It was generated by SharedParamsCodeGen.

 // scalastyle:off

 /**
- * (private[ml]) Trait for shared param regParam.
+ * Trait for shared param regParam.
  */
 private[ml] trait HasRegParam extends Params {

@@ -40,7 +39,7 @@ private[ml] trait HasRegParam extends Params {
 }

 /**
- * (private[ml]) Trait for shared param maxIter.
+ * Trait for shared param maxIter.
  */
 private[ml] trait HasMaxIter extends Params {

@@ -55,7 +54,7 @@ private[ml] trait HasMaxIter extends Params {
 }

 /**
- * (private[ml]) Trait for shared param featuresCol (default: "features").
+ * Trait for shared param featuresCol (default: "features").
  */
 private[ml] trait HasFeaturesCol extends Params {

@@ -72,7 +71,7 @@ private[ml] trait HasFeaturesCol extends Params {
 }

 /**
- * (private[ml]) Trait for shared param labelCol (default: "label").
+ * Trait for shared param labelCol (default: "label").
  */
 private[ml] trait HasLabelCol extends Params {

@@ -89,7 +88,7 @@ private[ml] trait HasLabelCol extends Params {
 }

 /**
- * (private[ml]) Trait for shared param predictionCol (default: "prediction").
+ * Trait for shared param predictionCol (default: "prediction").
  */
 private[ml] trait HasPredictionCol extends Params {

@@ -106,7 +105,7 @@ private[ml] trait HasPredictionCol extends Params {
 }

 /**
- * (private[ml]) Trait for shared param rawPredictionCol (default: "rawPrediction").
+ * Trait for shared param rawPredictionCol (default: "rawPrediction").
  */
 private[ml] trait HasRawPredictionCol extends Params {

@@ -123,7 +122,7 @@ private[ml] trait HasRawPredictionCol extends Params {
 }

 /**
- * (private[ml]) Trait for shared param probabilityCol (default: "probability").
+ * Trait for shared param probabilityCol (default: "probability").
  */
 private[ml] trait HasProbabilityCol extends Params {
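For readers unfamiliar with the generator being touched here: SharedParamsCodeGen emits each HasXxx trait as interpolated source text. Below is a minimal, runnable sketch of that pattern; the template content and names are illustrative, not the real generator's.

// Hedged sketch of the codegen pattern: each shared param becomes the
// source text of a HasXxx trait via a string-interpolated template.
object ParamTraitCodeGenSketch {
  def genTrait(name: String, doc: String): String = {
    val Name = name.capitalize
    s"""
       |/**
       | * Trait for shared param $name.
       | */
       |private[ml] trait Has$Name extends Params {
       |
       |  final val $name: Param[String] = new Param[String](this, "$name", "$doc")
       |
       |  final def get$Name: String = $$($name)
       |}""".stripMargin
  }

  def main(args: Array[String]): Unit =
    println(genTrait("labelCol", "label column name"))
}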
spark git commit: [SPARK-8950] [WEBUI] Correct the calculation of SchedulerDelay in StagePage
Repository: spark
Updated Branches: refs/heads/master 9b62e9375 -> 5ca26fb64

[SPARK-8950] [WEBUI] Correct the calculation of SchedulerDelay in StagePage

In StagePage, the SchedulerDelay is calculated as totalExecutionTime - executorRunTime - executorOverhead - gettingResultTime, but totalExecutionTime is calculated in a way that does not include the gettingResultTime.

Author: Carson Wang carson.w...@intel.com

Closes #7319 from carsonwang/SchedulerDelayTime and squashes the following commits:

f66fb6e [Carson Wang] Update the code style
7d971ae [Carson Wang] Correct the calculation of SchedulerDelay

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ca26fb6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ca26fb6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ca26fb6

Branch: refs/heads/master
Commit: 5ca26fb64de99fa414dc59ce4cf29a0171894793
Parents: 9b62e93
Author: Carson Wang carson.w...@intel.com
Authored: Mon Jul 13 11:20:04 2015 -0700
Committer: Kay Ousterhout kayousterh...@gmail.com
Committed: Mon Jul 13 11:20:04 2015 -0700

--
 .../org/apache/spark/ui/jobs/StagePage.scala | 45 ++--
 1 file changed, 22 insertions(+), 23 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5ca26fb6/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
--
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 60e3c63..ff0a339 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -332,7 +332,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
           </td> +:
           getFormattedTimeQuantiles(serializationTimes)

         val gettingResultTimes = validTasks.map { case TaskUIData(info, _, _) =>
-          getGettingResultTime(info).toDouble
+          getGettingResultTime(info, currentTime).toDouble
         }
         val gettingResultQuantiles =
           <td>
@@ -346,7 +346,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         // machine and to send back the result (but not the time to fetch the task result,
         // if it needed to be fetched from the block manager on the worker).
         val schedulerDelays = validTasks.map { case TaskUIData(info, metrics, _) =>
-          getSchedulerDelay(info, metrics.get).toDouble
+          getSchedulerDelay(info, metrics.get, currentTime).toDouble
         }
         val schedulerDelayTitle = <td><span data-toggle="tooltip"
           title={ToolTips.SCHEDULER_DELAY} data-placement="right">Scheduler Delay</span></td>
@@ -544,7 +544,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val serializationTimeProportion = toProportion(serializationTime)
       val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L)
       val deserializationTimeProportion = toProportion(deserializationTime)
-      val gettingResultTime = getGettingResultTime(taskUIData.taskInfo)
+      val gettingResultTime = getGettingResultTime(taskUIData.taskInfo, currentTime)
       val gettingResultTimeProportion = toProportion(gettingResultTime)
       val schedulerDelay =
         totalExecutionTime - (executorComputingTime + shuffleReadTime + shuffleWriteTime +
@@ -685,11 +685,11 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         else metrics.map(_.executorRunTime).getOrElse(1L)
       val formatDuration = if (info.status == "RUNNING") UIUtils.formatDuration(duration)
         else metrics.map(m => UIUtils.formatDuration(m.executorRunTime)).getOrElse("")
-      val schedulerDelay = metrics.map(getSchedulerDelay(info, _)).getOrElse(0L)
+      val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L)
       val gcTime = metrics.map(_.jvmGCTime).getOrElse(0L)
       val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L)
       val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L)
-      val gettingResultTime = getGettingResultTime(info)
+      val gettingResultTime = getGettingResultTime(info, currentTime)
       val maybeAccumulators = info.accumulables
       val accumulatorsReadable = maybeAccumulators.map { acc => s"${acc.name}: ${acc.update.get}" }
@@ -852,32 +852,31 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       <td>{errorSummary}{details}</td>
     }

-  private def getGettingResultTime(info: TaskInfo): Long = {
-    if (info.gettingResultTime > 0) {
-      if (info.finishTime > 0) {
+
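In other words, scheduler delay is whatever remains of a task's wall-clock duration after subtracting run time, (de)serialization overhead, and result-fetch time, and the duration of a still-running task must be measured against the current time so that getting-result time is accounted for consistently. A hedged sketch of the corrected arithmetic, with assumed field names rather than the real TaskInfo/TaskMetrics types:

// Sketch of the corrected scheduler-delay computation (assumed field
// names; the real code reads TaskInfo/TaskMetrics).
case class TaskTimesSketch(
    launchTime: Long,          // when the task was launched
    finishTime: Long,          // 0 while the task is still running
    runTime: Long,             // executorRunTime
    deserializeTime: Long,     // executorDeserializeTime
    serializeTime: Long,       // resultSerializationTime
    gettingResultTime: Long)   // time spent fetching the result

def schedulerDelaySketch(t: TaskTimesSketch, currentTime: Long): Long = {
  // The total duration now runs up to finishTime (or "now" for running
  // tasks), so it includes gettingResultTime, fixing the old mismatch.
  val totalExecutionTime =
    if (t.finishTime > 0) t.finishTime - t.launchTime
    else currentTime - t.launchTime
  math.max(0, totalExecutionTime -
    (t.runTime + t.deserializeTime + t.serializeTime + t.gettingResultTime))
}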
spark git commit: [SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark
Repository: spark
Updated Branches: refs/heads/master 7f487c8bd -> 9b62e9375

[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark

This adds Pylint checks to PySpark. For now this lazily installs pylint using easy_install to /dev/pylint (similar to the pep8 script). We still need to figure out which rules should be allowed.

Author: MechCoder manojkumarsivaraj...@gmail.com

Closes #7241 from MechCoder/pylint and squashes the following commits:

8496834 [MechCoder] Silence warnings and make pylint tests fail to check if it works in jenkins
57393a3 [MechCoder] undefined-variable
a8e2547 [MechCoder] Minor changes
7753810 [MechCoder] remove trailing whitespace
75c5d2b [MechCoder] Remove blacklisted arguments and pointless statements check
6bde250 [MechCoder] Disable all checks for now
3464666 [MechCoder] Add pylint configuration file
d28109f [MechCoder] [SPARK-8706] [PySpark] [Project infra] Add pylint checks to PySpark

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b62e937
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b62e937
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b62e937

Branch: refs/heads/master
Commit: 9b62e9375f032548d386aec7468e3d0f7c6da7b2
Parents: 7f487c8
Author: MechCoder manojkumarsivaraj...@gmail.com
Authored: Mon Jul 13 09:47:53 2015 -0700
Committer: Davies Liu davies@gmail.com
Committed: Mon Jul 13 09:47:53 2015 -0700

--
 dev/lint-python                   |  57 -
 pylintrc                          | 404 +
 python/pyspark/ml/param/shared.py |   4 +-
 3 files changed, 455 insertions(+), 10 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/9b62e937/dev/lint-python
--
diff --git a/dev/lint-python b/dev/lint-python
index 0c35864..e02dff2 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -21,12 +21,14 @@
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py"
-PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
+PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
+PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
+PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"

 cd "$SPARK_ROOT_DIR"

 # compileall: https://docs.python.org/2/library/compileall.html
-python -B -m compileall -q -l $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
+python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
 compile_status="${PIPESTATUS[0]}"

 # Get pep8 at runtime so that we don't rely on it being installed on the build server.
@@ -47,11 +49,36 @@
     fi
 fi

+# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should
+# be set to the directory.
+# dev/pylint should be appended to the PATH variable as well.
+# Jenkins by default installs the pylint3 version, so for now this just checks the code quality
+# of python3.
+export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
+export "PYLINT_HOME=$PYTHONPATH"
+export "PATH=$PYTHONPATH:$PATH"
+
+if [ ! -d "$PYLINT_HOME" ]; then
+    mkdir "$PYLINT_HOME"
+    # Redirect the annoying pylint installation output.
+    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
+    easy_install_status="$?"
+
+    if [ "$easy_install_status" -ne 0 ]; then
+        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
+        cat "$PYLINT_INSTALL_INFO"
+        exit "$easy_install_status"
+    fi
+
+    rm "$PYLINT_INSTALL_INFO"
+
+fi
+
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
 #+ be output before the report, like with the
 #+ scalastyle and RAT checks.
-python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
+python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
 pep8_status="${PIPESTATUS[0]}"

 if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then
@@ -61,13 +88,27 @@
 fi

 if [ "$lint_status" -ne 0 ]; then
-    echo "Python lint checks failed."
-    cat "$PYTHON_LINT_REPORT_PATH"
+    echo "PEP8 checks failed."
+    cat "$PEP8_REPORT_PATH"
+else
+    echo "PEP8 checks passed."
+fi
+
+rm "$PEP8_REPORT_PATH"
+
+for to_be_checked in $PATHS_TO_CHECK
+do
+    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
+done
+
+if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+    lint_status=1
+    echo "Pylint checks failed."
+    cat "$PYLINT_REPORT_PATH"
 else
-    echo "Python lint checks passed."
+    echo "Pylint checks passed."
 fi

-# rm $PEP8_SCRIPT_PATH
-rm "$PYTHON_LINT_REPORT_PATH"
spark git commit: Revert "[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark"
Repository: spark
Updated Branches: refs/heads/master 5ca26fb64 -> 79c35826e

Revert "[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark"

This reverts commit 9b62e9375f032548d386aec7468e3d0f7c6da7b2.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79c35826
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79c35826
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79c35826

Branch: refs/heads/master
Commit: 79c35826e626805c5df776b3d22ed37829d239b6
Parents: 5ca26fb
Author: Davies Liu davies@gmail.com
Authored: Mon Jul 13 11:30:36 2015 -0700
Committer: Davies Liu davies@gmail.com
Committed: Mon Jul 13 11:30:36 2015 -0700

--
 dev/lint-python                   |  57 +----
 pylintrc                          | 404 -
 python/pyspark/ml/param/shared.py |   4 +-
 3 files changed, 10 insertions(+), 455 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/79c35826/dev/lint-python
--
diff --git a/dev/lint-python b/dev/lint-python
index e02dff2..0c35864 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -21,14 +21,12 @@
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py"
-PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
-PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
-PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"
+PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"

 cd "$SPARK_ROOT_DIR"

 # compileall: https://docs.python.org/2/library/compileall.html
-python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
+python -B -m compileall -q -l $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
 compile_status="${PIPESTATUS[0]}"

 # Get pep8 at runtime so that we don't rely on it being installed on the build server.
@@ -49,36 +47,11 @@
     fi
 fi

-# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should
-# be set to the directory.
-# dev/pylint should be appended to the PATH variable as well.
-# Jenkins by default installs the pylint3 version, so for now this just checks the code quality
-# of python3.
-export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
-export "PYLINT_HOME=$PYTHONPATH"
-export "PATH=$PYTHONPATH:$PATH"
-
-if [ ! -d "$PYLINT_HOME" ]; then
-    mkdir "$PYLINT_HOME"
-    # Redirect the annoying pylint installation output.
-    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
-    easy_install_status="$?"
-
-    if [ "$easy_install_status" -ne 0 ]; then
-        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
-        cat "$PYLINT_INSTALL_INFO"
-        exit "$easy_install_status"
-    fi
-
-    rm "$PYLINT_INSTALL_INFO"
-
-fi
-
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
 #+ be output before the report, like with the
 #+ scalastyle and RAT checks.
-python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
+python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
 pep8_status="${PIPESTATUS[0]}"

 if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then
@@ -88,27 +61,13 @@
 fi

 if [ "$lint_status" -ne 0 ]; then
-    echo "PEP8 checks failed."
-    cat "$PEP8_REPORT_PATH"
-else
-    echo "PEP8 checks passed."
-fi
-
-rm "$PEP8_REPORT_PATH"
-
-for to_be_checked in $PATHS_TO_CHECK
-do
-    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
-done
-
-if [ "${PIPESTATUS[0]}" -ne 0 ]; then
-    lint_status=1
-    echo "Pylint checks failed."
-    cat "$PYLINT_REPORT_PATH"
+    echo "Python lint checks failed."
+    cat "$PYTHON_LINT_REPORT_PATH"
 else
-    echo "Pylint checks passed."
+    echo "Python lint checks passed."
 fi

-rm "$PYLINT_REPORT_PATH"
+# rm $PEP8_SCRIPT_PATH
+rm "$PYTHON_LINT_REPORT_PATH"

 exit $lint_status

http://git-wip-us.apache.org/repos/asf/spark/blob/79c35826/pylintrc
--
diff --git a/pylintrc b/pylintrc
deleted file mode 100644
index 0617759..000
--- a/pylintrc
+++ /dev/null
@@ -1,404 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
spark git commit: [SPARK-8596] Add module for rstudio link to spark
Repository: spark
Updated Branches: refs/heads/master 6b8994383 -> a5bc803b7

[SPARK-8596] Add module for rstudio link to spark

shivaram, added module for rstudio install

Author: Vincent D. Warmerdam vincentwarmer...@gmail.com

Closes #7366 from koaning/rstudio-install and squashes the following commits:

e47c2da [Vincent D. Warmerdam] added rstudio module

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5bc803b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5bc803b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5bc803b

Branch: refs/heads/master
Commit: a5bc803b7271028e34de1548d55b80ecfb812a7b
Parents: 6b89943
Author: Vincent D. Warmerdam vincentwarmer...@gmail.com
Authored: Mon Jul 13 08:15:54 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Mon Jul 13 08:15:54 2015 -0700

--
 ec2/spark_ec2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a5bc803b/ec2/spark_ec2.py
--
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ae4f2ec..7c83d68 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -793,7 +793,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
         ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar)

     modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs',
-               'mapreduce', 'spark-standalone', 'tachyon']
+               'mapreduce', 'spark-standalone', 'tachyon', 'rstudio']

     if opts.hadoop_major_version == "1":
         modules = list(filter(lambda x: x != "mapreduce", modules))
spark git commit: [SPARK-8636] [SQL] Fix equalNullSafe comparison
Repository: spark
Updated Branches: refs/heads/master 714fc55f4 -> 4c797f2b0

[SPARK-8636] [SQL] Fix equalNullSafe comparison

Author: Vinod K C vinod...@huawei.com

Closes #7040 from vinodkc/fix_CaseKeyWhen_equalNullSafe and squashes the following commits:

be5e641 [Vinod K C] Renamed equalNullSafe to threeValueEquals
aac9f67 [Vinod K C] Updated test suite and genCode method
f2d0b53 [Vinod K C] Fix equalNullSafe comparison

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c797f2b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c797f2b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c797f2b

Branch: refs/heads/master
Commit: 4c797f2b0989317a2d004e5f72a0e593919737ea
Parents: 714fc55
Author: Vinod K C vinod...@huawei.com
Authored: Mon Jul 13 12:51:33 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Mon Jul 13 12:51:33 2015 -0700

--
 .../spark/sql/catalyst/expressions/conditionals.scala    | 11 ---
 .../expressions/ConditionalExpressionSuite.scala         |  4 ++--
 2 files changed, 6 insertions(+), 9 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/4c797f2b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index e6a705f..84c28c2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -238,7 +238,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     // If all branches fail and an elseVal is not provided, the whole statement
     // defaults to null, according to Hive's semantics.
     while (i < len - 1) {
-      if (equalNullSafe(evaluatedKey, branchesArr(i).eval(input))) {
+      if (threeValueEquals(evaluatedKey, branchesArr(i).eval(input))) {
         return branchesArr(i + 1).eval(input)
       }
       i += 2
@@ -261,8 +261,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
       s"""
         if (!$got) {
           ${cond.code}
-          if (${keyEval.isNull} && ${cond.isNull} ||
-            !${keyEval.isNull} && !${cond.isNull} &&
+          if (!${keyEval.isNull} && !${cond.isNull} &&
             ${ctx.genEqual(key.dataType, keyEval.primitive, cond.primitive)}) {
             $got = true;
             ${res.code}
@@ -296,10 +295,8 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
   }

-  private def equalNullSafe(l: Any, r: Any) = {
-    if (l == null && r == null) {
-      true
-    } else if (l == null || r == null) {
+  private def threeValueEquals(l: Any, r: Any) = {
+    if (l == null || r == null) {
       false
     } else {
       l == r

http://git-wip-us.apache.org/repos/asf/spark/blob/4c797f2b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index aaf40cc..adadc8c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -125,7 +125,7 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     val literalString = Literal("a")

     checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, c5)), "b", row)
-    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "b", row)
+    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "c", row)
     checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row)
     checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row)
@@ -134,7 +134,7 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row)
     checkEvaluation(CaseKeyWhen(c6, Seq(c5, c2, c4, c3)), null, row)
-    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
+    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), null, row)
   }
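The semantics being fixed are easy to state: under three-valued equality a null key matches nothing and a null branch condition matches nothing, so CASE null WHEN null THEN ... END falls through to the ELSE branch, matching Hive. A small, self-contained model of that contract (not Spark's CaseKeyWhen itself):

// Hedged sketch of three-valued equality: null never equals anything,
// not even null.
def threeValueEquals(l: Any, r: Any): Boolean =
  l != null && r != null && l == r

def caseKeyWhenSketch(key: Any, branches: Seq[(Any, Any)], default: Any): Any =
  branches.collectFirst {
    case (cond, value) if threeValueEquals(key, cond) => value
  }.getOrElse(default)

// caseKeyWhenSketch(null, Seq((null, "matched")), "else") == "else"
// caseKeyWhenSketch(1,    Seq((1, "one")),        "else") == "one"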
spark git commit: [SPARK-8533] [STREAMING] Upgrade Flume to 1.6.0
Repository: spark
Updated Branches: refs/heads/master 4c797f2b0 -> 0aed38e44

[SPARK-8533] [STREAMING] Upgrade Flume to 1.6.0

Author: Hari Shreedharan hshreedha...@apache.org

Closes #6939 from harishreedharan/upgrade-flume-1.6.0 and squashes the following commits:

94b80ae [Hari Shreedharan] [SPARK-8533][Streaming] Upgrade Flume to 1.6.0

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0aed38e4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0aed38e4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0aed38e4

Branch: refs/heads/master
Commit: 0aed38e4498b24d372bfdc7001959e78536369a1
Parents: 4c797f2
Author: Hari Shreedharan hshreedha...@apache.org
Authored: Mon Jul 13 14:15:31 2015 -0700
Committer: Tathagata Das tathagata.das1...@gmail.com
Committed: Mon Jul 13 14:15:31 2015 -0700

--
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/0aed38e4/pom.xml
--
diff --git a/pom.xml b/pom.xml
index c2ebc1a..370c95d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -128,7 +128,7 @@
     <yarn.version>${hadoop.version}</yarn.version>
     <hbase.version>0.98.7-hadoop2</hbase.version>
     <hbase.artifact>hbase</hbase.artifact>
-    <flume.version>1.4.0</flume.version>
+    <flume.version>1.6.0</flume.version>
     <zookeeper.version>3.4.5</zookeeper.version>
     <curator.version>2.4.0</curator.version>
     <hive.group>org.spark-project.hive</hive.group>
spark git commit: [SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming when StreamingContext is closed
Repository: spark
Updated Branches: refs/heads/branch-1.4 898e5f76f -> 50607eca5

[SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming when StreamingContext is closed

The issue link: https://issues.apache.org/jira/browse/SPARK-8743
Deregister Codahale metrics for streaming when the StreamingContext is closed.

Design: Add the method calls in the appropriate start() and stop() methods of StreamingContext.

Actions in the PullRequest:
1) Added the registerSource method call to the start() method of StreamingContext.
2) Added the removeSource method call to the stop() method.
3) Added comments for both 1 and 2, and a comment to mark the initialization of the StreamingSource.
4) Added a test case to check for both registration and de-registration of metrics.

Previous closed PR for reference: https://github.com/apache/spark/pull/7250

Author: Neelesh Srinivas Salian nsal...@cloudera.com

Closes #7362 from nssalian/branch-SPARK-8743 and squashes the following commits:

7d998a3 [Neelesh Srinivas Salian] Removed the Thread.sleep() call
8b26397 [Neelesh Srinivas Salian] Moved the scalatest.{} import
0e8007a [Neelesh Srinivas Salian] moved import org.apache.spark{} to correct place
daedaa5 [Neelesh Srinivas Salian] Corrected Ordering of imports
8873180 [Neelesh Srinivas Salian] Removed redundancy in imports
59227a4 [Neelesh Srinivas Salian] Changed the ordering of the imports to classify scala and spark imports
d8cb577 [Neelesh Srinivas Salian] Added registerSource to start() and removeSource to stop(). Wrote a test to check the registration and de-registration

(cherry picked from commit b7bcbe25f90ba4e78b548465bc80d4de1d2c4a4a)
Signed-off-by: Tathagata Das tathagata.das1...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50607eca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50607eca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50607eca

Branch: refs/heads/branch-1.4
Commit: 50607eca59568db47ff3a1c8a19ec96b919131d7
Parents: 898e5f7
Author: Neelesh Srinivas Salian nsal...@cloudera.com
Authored: Mon Jul 13 15:46:51 2015 -0700
Committer: Tathagata Das tathagata.das1...@gmail.com
Committed: Mon Jul 13 15:47:18 2015 -0700

--
 .../spark/streaming/StreamingContext.scala      | 10 +++--
 .../spark/streaming/StreamingContextSuite.scala | 41 +++-
 2 files changed, 45 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/50607eca/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
--
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index ec49d0f..6b78a82 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -192,11 +192,8 @@ class StreamingContext private[streaming] (
       None
     }

-  /** Register streaming source to metrics system */
+  /* Initializing a streamingSource to register metrics */
   private val streamingSource = new StreamingSource(this)
-  assert(env != null)
-  assert(env.metricsSystem != null)
-  env.metricsSystem.registerSource(streamingSource)

   private var state: StreamingContextState = INITIALIZED

@@ -606,6 +603,9 @@ class StreamingContext private[streaming] (
         }
         shutdownHookRef = Utils.addShutdownHook(
           StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
+        // Registering Streaming Metrics at the start of the StreamingContext
+        assert(env.metricsSystem != null)
+        env.metricsSystem.registerSource(streamingSource)
         uiTab.foreach(_.attach())
         logInfo("StreamingContext started")
       case ACTIVE =>
@@ -682,6 +682,8 @@ class StreamingContext private[streaming] (
         logWarning("StreamingContext has already been stopped")
       case ACTIVE =>
         scheduler.stop(stopGracefully)
+        // Removing the streamingSource to de-register the metrics on stop()
+        env.metricsSystem.removeSource(streamingSource)
         uiTab.foreach(_.detach())
         StreamingContext.setActiveContext(null)
         waiter.notifyStop()

http://git-wip-us.apache.org/repos/asf/spark/blob/50607eca/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
--
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 56b4ce5..289a159 100644
---
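The lifecycle pattern adopted here, register the source when the context actually starts and deregister it on stop, can be sketched directly against the Codahale API. This is an illustrative sketch only: Spark routes through env.metricsSystem rather than a raw MetricRegistry, and the gauge below is a placeholder, not a real streaming metric.

import com.codahale.metrics.{Gauge, MetricRegistry}

// Hedged sketch: tie metric registration to the component lifecycle so
// that stop() leaves no stale gauges behind in the shared registry.
class StreamingSourceSketch(registry: MetricRegistry) {
  private val gaugeName = "streaming.sketch.receivers" // illustrative name

  def start(): Unit =
    registry.register(gaugeName, new Gauge[Int] {
      override def getValue: Int = 0 // placeholder; real sources read live state
    })

  def stop(): Unit = {
    // Deregister on stop; otherwise a later context registering the same
    // names would collide with these leftover metrics.
    registry.remove(gaugeName)
  }
}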
spark git commit: [SPARK-6797] [SPARKR] Add support for YARN cluster mode.
Repository: spark
Updated Branches: refs/heads/master a5bc803b7 -> 7f487c8bd

[SPARK-6797] [SPARKR] Add support for YARN cluster mode.

This PR enables SparkR to dynamically ship the SparkR binary package to the AM node in YARN cluster mode, so it is no longer required that the SparkR package be installed on each worker node. This PR uses the JDK jar tool to package the SparkR package, because jar is thought to be available on both Linux and Windows platforms where a JDK has been installed.

This PR does not address the R worker involved in the RDD API; that will be addressed in a separate JIRA issue. This PR also does not address the SBT build; SparkR installation and packaging by SBT will be addressed in a separate JIRA issue.

R/install-dev.bat is not tested. shivaram, could you help to test it?

Author: Sun Rui rui@intel.com

Closes #6743 from sun-rui/SPARK-6797 and squashes the following commits:

ca63c86 [Sun Rui] Adjust MimaExcludes after rebase.
7313374 [Sun Rui] Fix unit test errors.
72695fb [Sun Rui] Fix unit test failures.
193882f [Sun Rui] Fix Mima test error.
fe25a33 [Sun Rui] Fix Mima test error.
35ecfa3 [Sun Rui] Fix comments.
c38a005 [Sun Rui] Unzipped SparkR binary package is still required for standalone and Mesos modes.
b05340c [Sun Rui] Fix scala style.
2ca5048 [Sun Rui] Fix comments.
1acefd1 [Sun Rui] Fix scala style.
0aa1e97 [Sun Rui] Fix scala style.
41d4f17 [Sun Rui] Add support for locating SparkR package for R workers required by RDD APIs.
49ff948 [Sun Rui] Invoke jar.exe with full path in install-dev.bat.
7b916c5 [Sun Rui] Use 'rem' consistently.
3bed438 [Sun Rui] Add a comment.
681afb0 [Sun Rui] Fix a bug that RRunner does not handle client deployment modes.
cedfbe2 [Sun Rui] [SPARK-6797][SPARKR] Add support for YARN cluster mode.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f487c8b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f487c8b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f487c8b

Branch: refs/heads/master
Commit: 7f487c8bde14dbdd244a3493ad11a129ef2bb327
Parents: a5bc803
Author: Sun Rui rui@intel.com
Authored: Mon Jul 13 08:21:47 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Mon Jul 13 08:21:47 2015 -0700

--
 R/install-dev.bat                               |  5 ++
 R/install-dev.sh                                |  8 ++-
 R/pkg/DESCRIPTION                               |  1 -
 R/pkg/R/RDD.R                                   |  2 -
 R/pkg/R/pairRDD.R                               |  1 -
 R/pkg/R/sparkR.R                                | 10 ---
 R/pkg/R/zzz.R                                   | 20 --
 R/pkg/inst/profile/general.R                    |  4 +-
 .../scala/org/apache/spark/api/r/RRDD.scala     | 21 +++
 .../scala/org/apache/spark/api/r/RUtils.scala   | 65 
 .../scala/org/apache/spark/deploy/RRunner.scala |  7 ++-
 .../org/apache/spark/deploy/SparkSubmit.scala   | 27 
 .../apache/spark/deploy/SparkSubmitSuite.scala  |  3 +-
 make-distribution.sh                            |  1 +
 project/MimaExcludes.scala                      | 12
 15 files changed, 133 insertions(+), 54 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7f487c8b/R/install-dev.bat
--
diff --git a/R/install-dev.bat b/R/install-dev.bat
index 008a5c6..f32670b 100644
--- a/R/install-dev.bat
+++ b/R/install-dev.bat
@@ -25,3 +25,8 @@ set SPARK_HOME=%~dp0..

 MKDIR %SPARK_HOME%\R\lib

 R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" "%SPARK_HOME%\R\pkg\"
+
+rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
+pushd %SPARK_HOME%\R\lib
+%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
+popd

http://git-wip-us.apache.org/repos/asf/spark/blob/7f487c8b/R/install-dev.sh
--
diff --git a/R/install-dev.sh b/R/install-dev.sh
index 1edd551..4972bb9 100755
--- a/R/install-dev.sh
+++ b/R/install-dev.sh
@@ -34,7 +34,7 @@ LIB_DIR="$FWDIR/lib"

 mkdir -p $LIB_DIR

-pushd $FWDIR
+pushd $FWDIR > /dev/null

 # Generate Rd files if devtools is installed
 Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
@@ -42,4 +42,8 @@ Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtoo
 # Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/

-popd
+# Zip the SparkR package so that it can be distributed to worker nodes on YARN
+cd $LIB_DIR
+jar cfM "$LIB_DIR/sparkr.zip" SparkR
+
+popd > /dev/null

http://git-wip-us.apache.org/repos/asf/spark/blob/7f487c8b/R/pkg/DESCRIPTION
spark git commit: [SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming when StreamingContext is closed
Repository: spark
Updated Branches: refs/heads/master 0aed38e44 -> b7bcbe25f

[SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming when StreamingContext is closed

The issue link: https://issues.apache.org/jira/browse/SPARK-8743
Deregister Codahale metrics for streaming when the StreamingContext is closed.

Design: Add the method calls in the appropriate start() and stop() methods of StreamingContext.

Actions in the PullRequest:
1) Added the registerSource method call to the start() method of StreamingContext.
2) Added the removeSource method call to the stop() method.
3) Added comments for both 1 and 2, and a comment to mark the initialization of the StreamingSource.
4) Added a test case to check for both registration and de-registration of metrics.

Previous closed PR for reference: https://github.com/apache/spark/pull/7250

Author: Neelesh Srinivas Salian nsal...@cloudera.com

Closes #7362 from nssalian/branch-SPARK-8743 and squashes the following commits:

7d998a3 [Neelesh Srinivas Salian] Removed the Thread.sleep() call
8b26397 [Neelesh Srinivas Salian] Moved the scalatest.{} import
0e8007a [Neelesh Srinivas Salian] moved import org.apache.spark{} to correct place
daedaa5 [Neelesh Srinivas Salian] Corrected Ordering of imports
8873180 [Neelesh Srinivas Salian] Removed redundancy in imports
59227a4 [Neelesh Srinivas Salian] Changed the ordering of the imports to classify scala and spark imports
d8cb577 [Neelesh Srinivas Salian] Added registerSource to start() and removeSource to stop(). Wrote a test to check the registration and de-registration

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7bcbe25
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7bcbe25
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7bcbe25

Branch: refs/heads/master
Commit: b7bcbe25f90ba4e78b548465bc80d4de1d2c4a4a
Parents: 0aed38e
Author: Neelesh Srinivas Salian nsal...@cloudera.com
Authored: Mon Jul 13 15:46:51 2015 -0700
Committer: Tathagata Das tathagata.das1...@gmail.com
Committed: Mon Jul 13 15:46:51 2015 -0700

--
 .../spark/streaming/StreamingContext.scala      | 10 +++--
 .../spark/streaming/StreamingContextSuite.scala | 41 +++-
 2 files changed, 45 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/b7bcbe25/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
--
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index ec49d0f..6b78a82 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -192,11 +192,8 @@ class StreamingContext private[streaming] (
       None
     }

-  /** Register streaming source to metrics system */
+  /* Initializing a streamingSource to register metrics */
   private val streamingSource = new StreamingSource(this)
-  assert(env != null)
-  assert(env.metricsSystem != null)
-  env.metricsSystem.registerSource(streamingSource)

   private var state: StreamingContextState = INITIALIZED

@@ -606,6 +603,9 @@ class StreamingContext private[streaming] (
         }
         shutdownHookRef = Utils.addShutdownHook(
           StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
+        // Registering Streaming Metrics at the start of the StreamingContext
+        assert(env.metricsSystem != null)
+        env.metricsSystem.registerSource(streamingSource)
         uiTab.foreach(_.attach())
         logInfo("StreamingContext started")
       case ACTIVE =>
@@ -682,6 +682,8 @@ class StreamingContext private[streaming] (
         logWarning("StreamingContext has already been stopped")
       case ACTIVE =>
         scheduler.stop(stopGracefully)
+        // Removing the streamingSource to de-register the metrics on stop()
+        env.metricsSystem.removeSource(streamingSource)
         uiTab.foreach(_.detach())
         StreamingContext.setActiveContext(null)
         waiter.notifyStop()

http://git-wip-us.apache.org/repos/asf/spark/blob/b7bcbe25/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
--
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 56b4ce5..289a159 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -20,20 +20,23
spark git commit: [SPARK-8203] [SPARK-8204] [SQL] conditional function: least/greatest
Repository: spark
Updated Branches: refs/heads/master 20b474335 -> 92540d22e

[SPARK-8203] [SPARK-8204] [SQL] conditional function: least/greatest

cc chenghao-intel zhichao-li qiansl127

Author: Daoyuan Wang daoyuan.w...@intel.com

Closes #6851 from adrian-wang/udflg and squashes the following commits:

0f1bff2 [Daoyuan Wang] address comments from davis
7a6bdbb [Daoyuan Wang] add '.' for hex()
c1f6824 [Daoyuan Wang] add codegen, test for all types
ec625b0 [Daoyuan Wang] conditional function: least/greatest

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92540d22
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92540d22
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92540d22

Branch: refs/heads/master
Commit: 92540d22e45f9300f413f520a1770e9f36cfa833
Parents: 20b4743
Author: Daoyuan Wang daoyuan.w...@intel.com
Authored: Mon Jul 13 00:14:32 2015 -0700
Committer: Davies Liu davies@gmail.com
Committed: Mon Jul 13 00:14:32 2015 -0700

--
 .../catalyst/analysis/FunctionRegistry.scala    |   2 +
 .../sql/catalyst/expressions/conditionals.scala | 103 ++-
 .../ConditionalExpressionSuite.scala            |  81 +++
 .../scala/org/apache/spark/sql/functions.scala  |  60 ++-
 .../spark/sql/DataFrameFunctionsSuite.scala     |  22 
 5 files changed, 263 insertions(+), 5 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/92540d22/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index f62d79f..ed69c42 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -76,9 +76,11 @@ object FunctionRegistry {
     expression[CreateArray]("array"),
     expression[Coalesce]("coalesce"),
     expression[Explode]("explode"),
+    expression[Greatest]("greatest"),
     expression[If]("if"),
     expression[IsNull]("isnull"),
     expression[IsNotNull]("isnotnull"),
+    expression[Least]("least"),
     expression[Coalesce]("nvl"),
     expression[Rand]("rand"),
     expression[Randn]("randn"),

http://git-wip-us.apache.org/repos/asf/spark/blob/92540d22/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 395e84f..e6a705f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.types.{BooleanType, DataType}
+import org.apache.spark.sql.catalyst.util.TypeUtils
+import org.apache.spark.sql.types.{NullType, BooleanType, DataType}

 case class If(predicate: Expression, trueValue: Expression, falseValue: Expression)

@@ -312,3 +313,103 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     }.mkString
   }
 }
+
+case class Least(children: Expression*) extends Expression {
+  require(children.length > 1, "LEAST requires at least 2 arguments, got " + children.length)
+
+  override def nullable: Boolean = children.forall(_.nullable)
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  private lazy val ordering = TypeUtils.getOrdering(dataType)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
+      TypeCheckResult.TypeCheckFailure(
+        s"The expressions should all have the same type," +
+          s" got LEAST (${children.map(_.dataType)}).")
+    } else {
+      TypeUtils.checkForOrderingExpr(dataType, "function " + prettyName)
+    }
+  }
+
+  override def dataType: DataType = children.head.dataType
+
+  override def eval(input: InternalRow): Any = {
+    children.foldLeft[Any](null)((r, c) => {
+      val evalc = c.eval(input)
+      if (evalc != null) {
+        if (r == null || ordering.lt(evalc, r)) evalc else r
+      } else {
+        r
+      }
+    })
+  }
+
+  override def genCode(ctx:
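The eval above gives least/greatest their SQL null semantics: null arguments are skipped, and the result is null only when every argument is null. A hedged, Spark-free model of the same contract, using Option to play the role of SQL NULL:

// Hedged model of LEAST's null handling; the real expression works over
// Catalyst rows and any ordered data type, not just Int.
def leastSketch(values: Option[Int]*): Option[Int] = {
  val nonNull = values.flatten   // drop the nulls
  if (nonNull.isEmpty) None else Some(nonNull.min)
}

// leastSketch(Some(3), None, Some(1)) == Some(1)
// leastSketch(None, None)             == None
// In SQL: SELECT least(3, NULL, 1)    -- returns 1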
spark git commit: [SPARK-8944][SQL] Support casting between IntervalType and StringType
Repository: spark
Updated Branches: refs/heads/master 92540d22e -> 6b8994383

[SPARK-8944][SQL] Support casting between IntervalType and StringType

Author: Wenchen Fan cloud0...@outlook.com

Closes #7355 from cloud-fan/fromString and squashes the following commits:

3bbb9d6 [Wenchen Fan] fix code gen
7dab957 [Wenchen Fan] naming fix
0fbbe19 [Wenchen Fan] address comments
ac1f3d1 [Wenchen Fan] Support casting between IntervalType and StringType

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b899438
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b899438
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b899438

Branch: refs/heads/master
Commit: 6b89943834a8d9d5d0ecfd97efcc10056d08532a
Parents: 92540d2
Author: Wenchen Fan cloud0...@outlook.com
Authored: Mon Jul 13 00:49:39 2015 -0700
Committer: Reynold Xin r...@databricks.com
Committed: Mon Jul 13 00:49:39 2015 -0700

--
 .../spark/sql/catalyst/expressions/Cast.scala   | 17 ++-
 .../sql/catalyst/expressions/CastSuite.scala    | 10 
 .../org/apache/spark/unsafe/types/Interval.java | 48 
 .../spark/unsafe/types/IntervalSuite.java       | 46 +++
 4 files changed, 120 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/6b899438/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 7f2383d..ab02add 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types.{Interval, UTF8String}

 object Cast {

@@ -55,6 +55,9 @@ object Cast {
     case (_, DateType) => true

+    case (StringType, IntervalType) => true
+    case (IntervalType, StringType) => true
+
     case (StringType, _: NumericType) => true
     case (BooleanType, _: NumericType) => true
     case (DateType, _: NumericType) => true
@@ -232,6 +235,13 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case _ => _ => null
   }

+  // IntervalConverter
+  private[this] def castToInterval(from: DataType): Any => Any = from match {
+    case StringType =>
+      buildCast[UTF8String](_, s => Interval.fromString(s.toString))
+    case _ => _ => null
+  }
+
   // LongConverter
   private[this] def castToLong(from: DataType): Any => Any = from match {
     case StringType =>
@@ -405,6 +415,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType => castToDate(from)
     case decimal: DecimalType => castToDecimal(from, decimal)
     case TimestampType => castToTimestamp(from)
+    case IntervalType => castToInterval(from)
     case BooleanType => castToBoolean(from)
     case ByteType => castToByte(from)
     case ShortType => castToShort(from)
@@ -442,6 +453,10 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case (_, StringType) =>
       defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")

+    case (StringType, IntervalType) =>
+      defineCodeGen(ctx, ev, c =>
+        s"org.apache.spark.unsafe.types.Interval.fromString($c.toString())")
+
     // fallback for DecimalType, this must be before other numeric types
     case (_, dt: DecimalType) =>
       super.genCode(ctx, ev)

http://git-wip-us.apache.org/repos/asf/spark/blob/6b899438/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 919fdd4..1de161c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -563,4 +563,14 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         InternalRow(0L)))
   }

+  test("case between string and interval") {
+    import org.apache.spark.unsafe.types.Interval
+
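To illustrate what the new cast does: a string such as "interval 3 days" parses to an interval value, and malformed input casts to null. The exact accepted grammar is defined by Interval.fromString, so both the SQL line and the toy parser below are illustrative sketches, not Spark's implementation.

// In SQL (illustrative): SELECT CAST("interval 3 days" AS interval)
// A toy, Spark-free parser mirroring the cast's shape.
def parseDayIntervalSketch(s: String): Option[Long] = {
  val DayPattern = """(?i)interval\s+(-?\d+)\s+days?""".r
  s.trim match {
    case DayPattern(d) => Some(d.toLong) // number of days
    case _             => None           // the cast result is null
  }
}

// parseDayIntervalSketch("interval 3 days") == Some(3)
// parseDayIntervalSketch("3 days")          == None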