ZEPPELIN-160 Working with provided Spark, Hadoop. Zeppelin currently embeds all Spark dependencies under interpreter/spark and loads them at runtime.
This is useful because a user can try Zeppelin + Spark in local mode without installing and configuring Spark. However, when a user already has a Spark and Hadoop installation, it is far more convenient to simply point Zeppelin at it than to build Zeppelin for a specific Spark and Hadoop version combination.

This PR implements the ability to use an external Spark and Hadoop installation by doing the following:
* The spark-dependencies module packages the Spark/Hadoop dependencies under interpreter/spark/dep, to support local mode (current behavior).
* When SPARK_HOME and HADOOP_HOME are defined, bin/interpreter.sh excludes interpreter/spark/dep from the classpath and adds the system-installed Spark and Hadoop to the classpath.

This patch makes the Zeppelin binary independent of the Spark version. Once Zeppelin has been built, SPARK_HOME can point to any supported version of Spark.

Author: Lee moon soo <[email protected]>

Closes #244 from Leemoonsoo/spark_provided and squashes the following commits:

654c378 [Lee moon soo] use consistant, simpler expressions
57b3f96 [Lee moon soo] Add comment
eb4ec09 [Lee moon soo] fix reading spark-*.conf file
bacfd93 [Lee moon soo] Update readme
3a88c77 [Lee moon soo] Test use explicitly %spark
5a17d9c [Lee moon soo] Call sqlContext.sql using reflection
615c395 [Lee moon soo] get correct method
0c28561 [Lee moon soo] call listenerBus() using reflection
62b8c45 [Lee moon soo] Print all logs
5edb6fd [Lee moon soo] Use reflection to call addListener
af7a925 [Lee moon soo] add pyspark flag
5f8a734 [Lee moon soo] test -> package
a0150cf [Lee moon soo] not use travis-install for mvn test
cd4519c [Lee moon soo] try sys.stdout.write instead of print
6304180 [Lee moon soo] enable 1.2.x test
797c0e2 [Lee moon soo] enable 1.3.x test
8de7add [Lee moon soo] trying to find why travis is not closing the test
cf0a61e [Lee moon soo] rm -rf only interpreter directory instead of mvn clean
2606c04 [Lee moon soo] bringing travis-install.sh back
df8f0ba [Lee moon soo] test more efficiently
9d6b40f [Lee moon soo] Update .travis
2ca3d95 [Lee moon soo] set SPARK_HOME
2a61ecd [Lee moon soo] Clear interpreter directory on mvn clean
f1e8789 [Lee moon soo] update travis config
9e812e7 [Lee moon soo] Use reflection not to use import org.apache.spark.scheduler.Stage
c3d96c1 [Lee moon soo] Handle ZEPPELIN_CLASSPATH proper way
0f9598b [Lee moon soo] py4j version as a property
1b7f951 [Lee moon soo] Add dependency for compile and test
b1d62a5 [Lee moon soo] Add scala-library in test scope
c49be62 [Lee moon soo] Add hadoop jar and spark jar from HADOOP_HOME, SPARK_HOME when they are defined
2052aa3 [Lee moon soo] Load interpreter/spark/dep only when SPARK_HOME is undefined
54fdf0d [Lee moon soo] Separate spark-dependency into submodule

Project: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/commit/5de01c68
Tree: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/tree/5de01c68
Diff: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/diff/5de01c68

Branch: refs/heads/master
Commit: 5de01c6800466ee6ab5de7d714461c287df13513
Parents: 6a096c9
Author: Lee moon soo <[email protected]>
Authored: Mon Aug 31 12:12:54 2015 -0700
Committer: Lee moon soo <[email protected]>
Committed: Tue Sep 1 10:05:41 2015 -0700

----------------------------------------------------------------------
 .travis.yml                | 26 +-
 README.md                  | 51 +-
 bin/common.sh              | 14 +-
 bin/interpreter.sh         | 81 +-
 dev/travis/save-logs.py    |  3 +-
 pom.xml                    |  1 +
 spark-dependencies/pom.xml | 791
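In short, the interpreter launcher chooses between the bundled dependencies and a system installation along the lines of the simplified sketch below. This is illustrative only, not the literal script; the actual logic is in the bin/interpreter.sh changes further down, and addJarInDir / addEachJarInDir are the helpers defined in bin/common.sh.

  # simplified sketch of the classpath selection in bin/interpreter.sh
  if [[ -n "${SPARK_HOME}" ]]; then
    addJarInDir "${SPARK_HOME}/lib"            # use the system-provided Spark jars
  else
    addJarInDir "${INTERPRETER_DIR}/dep"       # use jars packaged by the spark-dependencies module
  fi

  if [[ -n "${HADOOP_HOME}" ]]; then
    addEachJarInDir "${HADOOP_HOME}/share"     # Apache Hadoop layout
    addJarInDir "${HADOOP_HOME}"               # CDH layout
    addJarInDir "${HADOOP_HOME}/lib"
  fi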
+++++++++++++++++++ spark/pom.xml | 783 +++--------------- .../apache/zeppelin/spark/SparkInterpreter.java | 95 ++- .../zeppelin/spark/SparkSqlInterpreter.java | 137 +--- .../zeppelin/rest/AbstractTestRestApi.java | 16 +- .../zeppelin/rest/ZeppelinSparkClusterTest.java | 12 +- 12 files changed, 1103 insertions(+), 907 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index e1b3c5d..a1b467d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,34 +22,34 @@ before_install: - "sh -e /etc/init.d/xvfb start" install: - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn package -DskipTests -Phadoop-2.3 -Ppyspark -B + - mvn package -DskipTests -Phadoop-2.3 -Ppyspark -B before_script: - script: # spark 1.4 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn package -Pbuild-distr -Phadoop-2.3 -Ppyspark -B + - mvn package -Pbuild-distr -Phadoop-2.3 -Ppyspark -B - ./testing/startSparkCluster.sh 1.4.0 2.3 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn verify -Pusing-packaged-distr -Phadoop-2.3 -Ppyspark -B + - mvn verify -Pusing-packaged-distr -Phadoop-2.3 -Ppyspark -B - ./testing/stopSparkCluster.sh 1.4.0 2.3 # spark 1.3 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn clean package -DskipTests -Pspark-1.3 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn package -Pbuild-distr -Pspark-1.3 -Phadoop-2.3 -B + - rm -rf `pwd`/interpreter/spark + - mvn package -DskipTests -Pspark-1.3 -Phadoop-2.3 -Ppyspark -B -pl 'zeppelin-interpreter,spark-dependencies,spark' - ./testing/startSparkCluster.sh 1.3.1 2.3 - - mvn verify -Pspark-1.3 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' + - mvn package -Pspark-1.3 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,zeppelin-zengine,zeppelin-server' -Dtest=org.apache.zeppelin.rest.*Test -DfailIfNoTests=false - ./testing/stopSparkCluster.sh 1.3.1 2.3 -# spark 1.2 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn clean package -DskipTests -Pspark-1.2 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn package -Pbuild-distr -Pspark-1.2 -Phadoop-2.3 -B + # spark 1.2 + - rm -rf `pwd`/interpreter/spark + - mvn package -Pspark-1.2 -Phadoop-2.3 -Ppyspark -B -pl 'zeppelin-interpreter,spark-dependencies,spark' - ./testing/startSparkCluster.sh 1.2.1 2.3 - - mvn verify -Pspark-1.2 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' + - mvn package -Pspark-1.2 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,zeppelin-zengine,zeppelin-server' -Dtest=org.apache.zeppelin.rest.*Test -DfailIfNoTests=false - ./testing/stopSparkCluster.sh 1.2.1 2.3 # spark 1.1 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn clean package -DskipTests -Pspark-1.1 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' - - mvn package -Pbuild-distr -Pspark-1.1 -Phadoop-2.3 -B + - rm -rf `pwd`/interpreter/spark + - mvn package -Pspark-1.1 -Phadoop-2.3 -Ppyspark -B -pl 'zeppelin-interpreter,spark-dependencies,spark' - ./testing/startSparkCluster.sh 1.1.1 2.3 - - /bin/bash ./dev/travis/travis-install.sh `pwd` mvn verify -Pspark-1.1 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark' + - mvn package -Pspark-1.1 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,zeppelin-zengine,zeppelin-server' -Dtest=org.apache.zeppelin.rest.*Test -DfailIfNoTests=false - ./testing/stopSparkCluster.sh 1.1.1 2.3 
after_failure: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/README.md ---------------------------------------------------------------------- diff --git a/README.md b/README.md index d565858..a3bc0ad 100644 --- a/README.md +++ b/README.md @@ -38,50 +38,52 @@ sudo apt-get install npm ### Build If you want to build Zeppelin from the source, please first clone this repository. And then: ``` -mvn clean package +mvn clean package -DskipTests ``` -Build with specific version -Spark 1.1.x -``` -mvn clean package -Pspark-1.1 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests -``` -Spark 1.2.x +Build with specific Spark version + +Spark 1.4.x ``` -mvn clean package -Pspark-1.2 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests +mvn clean package -Pspark-1.4 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests ``` Spark 1.3.x ``` mvn clean package -Pspark-1.3 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests ``` -Spark 1.4.x +Spark 1.2.x ``` -mvn clean package -Pspark-1.4 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests +mvn clean package -Pspark-1.2 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests +``` +Spark 1.1.x +``` +mvn clean package -Pspark-1.1 -Dhadoop.version=2.2.0 -Phadoop-2.2 -DskipTests ``` CDH 5.X ``` mvn clean package -Pspark-1.2 -Dhadoop.version=2.5.0-cdh5.3.0 -Phadoop-2.4 -DskipTests ``` -Yarn (Hadoop 2.2.x) +Yarn (Hadoop 2.7.x) ``` -mvn clean package -Pspark-1.1 -Dhadoop.version=2.2.0 -Phadoop-2.2 -Pyarn -DskipTests +mvn clean package -Pspark-1.4 -Dspark.version=1.4.1 -Dhadoop.version=2.7.0 -Phadoop-2.6 -Pyarn -DskipTests ``` -Yarn (Hadoop 2.3.x) +Yarn (Hadoop 2.6.x) ``` -mvn clean package -Pspark-1.1 -Dhadoop.version=2.3.0 -Phadoop-2.3 -Pyarn -DskipTests +mvn clean package -Pspark-1.1 -Dhadoop.version=2.6.0 -Phadoop-2.6 -Pyarn -DskipTests ``` Yarn (Hadoop 2.4.x) ``` mvn clean package -Pspark-1.1 -Dhadoop.version=2.4.0 -Phadoop-2.4 -Pyarn -DskipTests ``` -Yarn (Hadoop 2.6.x) +Yarn (Hadoop 2.3.x) ``` -mvn clean package -Pspark-1.1 -Dhadoop.version=2.6.0 -Phadoop-2.6 -Pyarn -DskipTests +mvn clean package -Pspark-1.1 -Dhadoop.version=2.3.0 -Phadoop-2.3 -Pyarn -DskipTests ``` -Yarn (Hadoop 2.7.x) +Yarn (Hadoop 2.2.x) ``` -mvn clean package -Pspark-1.4 -Dspark.version=1.4.1 -Dhadoop.version=2.7.0 -Phadoop-2.6 -Pyarn -DskipTests +mvn clean package -Pspark-1.1 -Dhadoop.version=2.2.0 -Phadoop-2.2 -Pyarn -DskipTests ``` + Ignite (1.1.0-incubating and later) ``` mvn clean package -Dignite.version=1.1.0-incubating -DskipTests @@ -96,6 +98,19 @@ If you wish to configure Zeppelin option (like port number), configure the follo (You can copy ```./conf/zeppelin-env.sh.template``` into ```./conf/zeppelin-env.sh```. Same for ```zeppelin-site.xml```.) + +#### Setting SPARK_HOME and HADOOP_HOME + +Without SPARK_HOME and HADOOP_HOME, Zeppelin uses embedded Spark and Hadoop binaries that you have specified with mvn build option. +If you want to use system provided Spark and Hadoop, export SPARK_HOME and HADOOP_HOME in zeppelin-env.sh +You can use any supported version of spark without rebuilding Zeppelin. + +``` +# ./conf/zeppelin-env.sh +export SPARK_HOME=... +export HADOOP_HOME=... 
+``` + #### External cluster configuration Mesos http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/bin/common.sh ---------------------------------------------------------------------- diff --git a/bin/common.sh b/bin/common.sh index 7aab870..188ff86 100644 --- a/bin/common.sh +++ b/bin/common.sh @@ -80,22 +80,10 @@ function addEachJarInDir(){ function addJarInDir(){ if [[ -d "${1}" ]]; then - export ZEPPELIN_CLASSPATH="${1}/*:${ZEPPELIN_CLASSPATH}" + ZEPPELIN_CLASSPATH="${1}/*:${ZEPPELIN_CLASSPATH}" fi } -if [[ ! -z "${SPARK_HOME}" ]] && [[ -d "${SPARK_HOME}" ]]; then - addJarInDir "${SPARK_HOME}" -fi - -if [[ ! -z "${HADOOP_HOME}" ]] && [[ -d "${HADOOP_HOME}" ]]; then - addJarInDir "${HADOOP_HOME}" -fi - -if [[ ! -z "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then - ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}" -fi - export ZEPPELIN_CLASSPATH # Text encoding for http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/bin/interpreter.sh ---------------------------------------------------------------------- diff --git a/bin/interpreter.sh b/bin/interpreter.sh index 93ae1e5..61dd249 100755 --- a/bin/interpreter.sh +++ b/bin/interpreter.sh @@ -57,9 +57,6 @@ fi addJarInDir "${ZEPPELIN_HOME}/zeppelin-interpreter/target/lib" addJarInDir "${INTERPRETER_DIR}" -export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}" -CLASSPATH+=":${ZEPPELIN_CLASSPATH}" - HOSTNAME=$(hostname) ZEPPELIN_SERVER=org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer @@ -73,19 +70,77 @@ if [[ ! -d "${ZEPPELIN_LOG_DIR}" ]]; then $(mkdir -p "${ZEPPELIN_LOG_DIR}") fi -if [[ ! -z "${SPARK_HOME}" ]]; then - PYSPARKPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip" -else - PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip" -fi +# set spark related env variables +if [[ "${INTERPRETER_ID}" == "spark" ]]; then + # add Hadoop jars into classpath + if [[ -n "${HADOOP_HOME}" ]]; then + # Apache + addEachJarInDir "${HADOOP_HOME}/share" -if [[ x"" == x"${PYTHONPATH}" ]]; then - export PYTHONPATH="${PYSPARKPATH}" -else - export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}" + # CDH + addJarInDir "${HADOOP_HOME}" + addJarInDir "${HADOOP_HOME}/lib" + fi + + # autodetect HADOOP_CONF_HOME by heuristic + if [[ -n "${HADOOP_HOME}" ]] && [[ -z "${HADOOP_CONF_DIR}" ]]; then + if [[ -d "${HADOOP_HOME}/etc/hadoop" ]]; then + export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop" + elif [[ -d "/etc/hadoop/conf" ]]; then + export HADOOP_CONF_DIR="/etc/hadoop/conf" + fi + fi + + if [[ -n "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then + ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}" + fi + + # add Spark jars into classpath + if [[ -n "${SPARK_HOME}" ]]; then + addJarInDir "${SPARK_HOME}/lib" + PYSPARKPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip" + else + addJarInDir "${INTERPRETER_DIR}/dep" + PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip" + fi + + # autodetect SPARK_CONF_DIR + if [[ -n "${SPARK_HOME}" ]] && [[ -z "${SPARK_CONF_DIR}" ]]; then + if [[ -d "${SPARK_HOME}/conf" ]]; then + SPARK_CONF_DIR="${SPARK_HOME}/conf" + fi + fi + + # read spark-*.conf if exists + if [[ -d "${SPARK_CONF_DIR}" ]]; then + ls ${SPARK_CONF_DIR}/spark-*.conf > /dev/null 2>&1 + if [[ "$?" 
-eq 0 ]]; then + for file in ${SPARK_CONF_DIR}/spark-*.conf; do + while read -r line; do + echo "${line}" | grep -e "^spark[.]" > /dev/null + if [ "$?" -ne 0 ]; then + # skip the line not started with 'spark.' + continue; + fi + SPARK_CONF_KEY=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\1/g'` + SPARK_CONF_VALUE=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\2/g'` + export ZEPPELIN_JAVA_OPTS+=" -D${SPARK_CONF_KEY}=\"${SPARK_CONF_VALUE}\"" + done < "${file}" + done + fi + fi + + if [[ -z "${PYTHONPATH}" ]]; then + export PYTHONPATH="${PYSPARKPATH}" + else + export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}" + fi + + unset PYSPARKPATH fi -unset PYSPARKPATH +export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}" +CLASSPATH+=":${ZEPPELIN_CLASSPATH}" ${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} & pid=$! http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/dev/travis/save-logs.py ---------------------------------------------------------------------- diff --git a/dev/travis/save-logs.py b/dev/travis/save-logs.py index 5f4ad28..d0480e8 100755 --- a/dev/travis/save-logs.py +++ b/dev/travis/save-logs.py @@ -42,8 +42,7 @@ def main(file, cmd): errcode = process.wait() diff = datetime.now() - start sys.stdout.write("\r%d seconds %d log lines"%(diff.seconds, count)) - print - print cmd, "done", errcode + sys.stdout.write("\n" + str(cmd) + " done " + str(errcode) + "\n") return errcode if __name__ == "__main__": http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 9b7cd46..ff65160 100755 --- a/pom.xml +++ b/pom.xml @@ -86,6 +86,7 @@ <modules> <module>zeppelin-interpreter</module> <module>zeppelin-zengine</module> + <module>spark-dependencies</module> <module>spark</module> <module>markdown</module> <module>angular</module> http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/spark-dependencies/pom.xml ---------------------------------------------------------------------- diff --git a/spark-dependencies/pom.xml b/spark-dependencies/pom.xml new file mode 100644 index 0000000..c451c39 --- /dev/null +++ b/spark-dependencies/pom.xml @@ -0,0 +1,791 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <artifactId>zeppelin</artifactId> + <groupId>org.apache.zeppelin</groupId> + <version>0.6.0-incubating-SNAPSHOT</version> + <relativePath>..</relativePath> + </parent> + + <groupId>org.apache.zeppelin</groupId> + <artifactId>zeppelin-spark-dependencies</artifactId> + <packaging>jar</packaging> + <version>0.6.0-incubating-SNAPSHOT</version> + <name>Zeppelin: Spark dependencies</name> + <description>Zeppelin spark support</description> + <url>http://zeppelin.incubator.apache.org</url> + + + <properties> + <spark.version>1.4.1</spark.version> + <scala.version>2.10.4</scala.version> + <scala.binary.version>2.10</scala.binary.version> + + <hadoop.version>2.3.0</hadoop.version> + <yarn.version>${hadoop.version}</yarn.version> + <avro.version>1.7.7</avro.version> + <avro.mapred.classifier></avro.mapred.classifier> + <jets3t.version>0.7.1</jets3t.version> + <protobuf.version>2.4.1</protobuf.version> + + <akka.group>org.spark-project.akka</akka.group> + <akka.version>2.3.4-spark</akka.version> + + <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url> + <py4j.version>0.8.2.1</py4j.version> + </properties> + + <repositories> + <repository> + <id>cloudera</id> + <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> + </repository> + </repositories> + + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro</artifactId> + <version>${avro.version}</version> + </dependency> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro-ipc</artifactId> + <version>${avro.version}</version> + <exclusions> + <exclusion> + <groupId>io.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.velocity</groupId> + <artifactId>velocity</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro-mapred</artifactId> + <version>${avro.version}</version> + <classifier>${avro.mapred.classifier}</classifier> + <exclusions> + <exclusion> + <groupId>io.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.velocity</groupId> + <artifactId>velocity</artifactId> + </exclusion> + </exclusions> + </dependency> + + <!-- See SPARK-1556 for info on this dependency: --> + <dependency> + <groupId>net.java.dev.jets3t</groupId> + <artifactId>jets3t</artifactId> + <version>${jets3t.version}</version> + <scope>runtime</scope> + <exclusions> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + 
</exclusions> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-yarn-api</artifactId> + <version>${yarn.version}</version> + <exclusions> + <exclusion> + <groupId>asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.ow2.asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.jboss.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-yarn-common</artifactId> + <version>${yarn.version}</version> + <exclusions> + <exclusion> + <groupId>asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.ow2.asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.jboss.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-yarn-server-web-proxy</artifactId> + <version>${yarn.version}</version> + <exclusions> + <exclusion> + <groupId>asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.ow2.asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.jboss.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-yarn-client</artifactId> + <version>${yarn.version}</version> + <exclusions> + <exclusion> + <groupId>asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.ow2.asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + <exclusion> + <groupId>org.jboss.netty</groupId> + <artifactId>netty</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + </exclusions> + </dependency> + </dependencies> + </dependencyManagement> + + <dependencies> + <!-- Spark --> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-core_2.10</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-repl_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-hive_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + + <dependency> + <groupId>org.apache.spark</groupId> + 
<artifactId>spark-streaming_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-streaming-twitter_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-catalyst_${scala.binary.version}</artifactId> + <version>${spark.version}</version> + </dependency> + + + <!-- hadoop --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <version>${hadoop.version}</version> + </dependency> + + + <dependency> + <groupId>com.google.protobuf</groupId> + <artifactId>protobuf-java</artifactId> + <version>${protobuf.version}</version> + </dependency> + + <dependency> + <groupId>${akka.group}</groupId> + <artifactId>akka-actor_${scala.binary.version}</artifactId> + <version>${akka.version}</version> + </dependency> + <dependency> + <groupId>${akka.group}</groupId> + <artifactId>akka-remote_${scala.binary.version}</artifactId> + <version>${akka.version}</version> + </dependency> + <dependency> + <groupId>${akka.group}</groupId> + <artifactId>akka-slf4j_${scala.binary.version}</artifactId> + <version>${akka.version}</version> + </dependency> + <dependency> + <groupId>${akka.group}</groupId> + <artifactId>akka-testkit_${scala.binary.version}</artifactId> + <version>${akka.version}</version> + </dependency> + <dependency> + <groupId>${akka.group}</groupId> + <artifactId>akka-zeromq_${scala.binary.version}</artifactId> + <version>${akka.version}</version> + <exclusions> + <exclusion> + <groupId>${akka.group}</groupId> + <artifactId>akka-actor_${scala.binary.version}</artifactId> + </exclusion> + </exclusions> + </dependency> + + </dependencies> + + <profiles> + <profile> + <id>spark-1.1</id> + <dependencies> + + </dependencies> + <properties> + <spark.version>1.1.1</spark.version> + <akka.version>2.2.3-shaded-protobuf</akka.version> + </properties> + </profile> + + <profile> + <id>cassandra-spark-1.1</id> + <dependencies> + <dependency> + <groupId>com.datastax.spark</groupId> + <artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> + <version>1.1.1</version> + <exclusions> + <exclusion> + <groupId>org.joda</groupId> + <artifactId>joda-convert</artifactId> + </exclusion> + </exclusions> + </dependency> + </dependencies> + <properties> + <spark.version>1.1.1</spark.version> + <akka.version>2.2.3-shaded-protobuf</akka.version> + </properties> + </profile> + + <profile> + <id>spark-1.2</id> + <dependencies> + </dependencies> + <properties> + <spark.version>1.2.1</spark.version> + </properties> + </profile> + + <profile> + <id>cassandra-spark-1.2</id> + <properties> + <spark.version>1.2.1</spark.version> + </properties> + <dependencies> + <dependency> + <groupId>com.datastax.spark</groupId> + <artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> + <version>1.2.1</version> + <exclusions> + <exclusion> + <groupId>org.joda</groupId> + <artifactId>joda-convert</artifactId> + </exclusion> + </exclusions> + </dependency> + </dependencies> + </profile> + + <profile> + <id>spark-1.3</id> + + <properties> + <spark.version>1.3.1</spark.version> + </properties> + + <dependencies> + </dependencies> + + </profile> + + <profile> + <id>cassandra-spark-1.3</id> + <properties> + <spark.version>1.3.0</spark.version> + </properties> + + <dependencies> + <dependency> + <groupId>com.datastax.spark</groupId> + 
<artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> + <!--You need to build your own version of Spark Cassandra connector 1.3.0-SNAPSHOT + because it is not yet released--> + <version>1.3.0-SNAPSHOT</version> + <exclusions> + <exclusion> + <groupId>org.joda</groupId> + <artifactId>joda-convert</artifactId> + </exclusion> + </exclusions> + </dependency> + </dependencies> + </profile> + + <profile> + <id>spark-1.4</id> + <properties> + <spark.version>1.4.1</spark.version> + </properties> + + <dependencies> + </dependencies> + </profile> + + <profile> + <id>hadoop-0.23</id> + <!-- SPARK-1121: Adds an explicit dependency on Avro to work around a + Hadoop 0.23.X issue --> + <dependencies> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro</artifactId> + </dependency> + </dependencies> + <properties> + <hadoop.version>0.23.10</hadoop.version> + </properties> + </profile> + + <profile> + <id>hadoop-1</id> + <properties> + <hadoop.version>1.0.4</hadoop.version> + <avro.mapred.classifier>hadoop1</avro.mapred.classifier> + <codehaus.jackson.version>1.8.8</codehaus.jackson.version> + <akka.group>org.spark-project.akka</akka.group> + </properties> + </profile> + + <profile> + <id>hadoop-2.2</id> + <properties> + <hadoop.version>2.2.0</hadoop.version> + <protobuf.version>2.5.0</protobuf.version> + <avro.mapred.classifier>hadoop2</avro.mapred.classifier> + </properties> + </profile> + + <profile> + <id>hadoop-2.3</id> + <properties> + <hadoop.version>2.3.0</hadoop.version> + <protobuf.version>2.5.0</protobuf.version> + <jets3t.version>0.9.3</jets3t.version> + <avro.mapred.classifier>hadoop2</avro.mapred.classifier> + </properties> + </profile> + + <profile> + <id>hadoop-2.4</id> + <properties> + <hadoop.version>2.4.0</hadoop.version> + <protobuf.version>2.5.0</protobuf.version> + <jets3t.version>0.9.3</jets3t.version> + <avro.mapred.classifier>hadoop2</avro.mapred.classifier> + </properties> + </profile> + + <profile> + <id>hadoop-2.6</id> + <properties> + <hadoop.version>2.6.0</hadoop.version> + <protobuf.version>2.5.0</protobuf.version> + <jets3t.version>0.9.3</jets3t.version> + <avro.mapred.classifier>hadoop2</avro.mapred.classifier> + </properties> + </profile> + + <profile> + <id>mapr3</id> + <activation> + <activeByDefault>false</activeByDefault> + </activation> + <properties> + <hadoop.version>1.0.3-mapr-3.0.3</hadoop.version> + <yarn.version>2.3.0-mapr-4.0.0-FCS</yarn.version> + <jets3t.version>0.7.1</jets3t.version> + </properties> + </profile> + + <profile> + <id>mapr4</id> + <activation> + <activeByDefault>false</activeByDefault> + </activation> + <properties> + <hadoop.version>2.3.0-mapr-4.0.0-FCS</hadoop.version> + <yarn.version>2.3.0-mapr-4.0.0-FCS</yarn.version> + <jets3t.version>0.7.1</jets3t.version> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.curator</groupId> + <artifactId>curator-recipes</artifactId> + <version>2.4.0</version> + <exclusions> + <exclusion> + <groupId>org.apache.zookeeper</groupId> + <artifactId>zookeeper</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.zookeeper</groupId> + <artifactId>zookeeper</artifactId> + <version>3.4.5-mapr-1406</version> + </dependency> + </dependencies> + </profile> + + <profile> + <id>yarn</id> + <dependencies> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-yarn_2.10</artifactId> + <version>${spark.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + 
<artifactId>hadoop-yarn-api</artifactId> + <version>${yarn.version}</version> + </dependency> + </dependencies> + </profile> + + <profile> + <id>pyspark</id> + <properties> + <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz + </spark.download.url> + </properties> + <build> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <version>1.2.1</version> + <executions> + <execution> + <id>download-pyspark-files</id> + <phase>validate</phase> + <goals> + <goal>wget</goal> + </goals> + <configuration> + <url>${spark.download.url}</url> + <unpack>true</unpack> + <outputDirectory>${project.build.directory}/spark-dist</outputDirectory> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-clean-plugin</artifactId> + <configuration> + <filesets> + <fileset> + <directory>${basedir}/../python/build</directory> + </fileset> + <fileset> + <directory>${project.build.directory}/spark-dist</directory> + </fileset> + </filesets> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-antrun-plugin</artifactId> + <version>1.7</version> + <executions> + <execution> + <id>download-and-zip-pyspark-files</id> + <phase>generate-resources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <target> + <delete dir="../interpreter/spark/pyspark"/> + <copy todir="../interpreter/spark/pyspark" + file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-${py4j.version}-src.zip"/> + <zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip" + basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python" + includes="pyspark/*.py,pyspark/**/*.py"/> + </target> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> + + <build> + <plugins> + <plugin> + <groupId>org.apache.rat</groupId> + <artifactId>apache-rat-plugin</artifactId> + <configuration> + <excludes> + <exclude>**/.idea/</exclude> + <exclude>**/*.iml</exclude> + <exclude>.gitignore</exclude> + <exclude>**/.settings/*</exclude> + <exclude>**/.classpath</exclude> + <exclude>**/.project</exclude> + <exclude>**/target/**</exclude> + <exclude>**/derby.log</exclude> + <exclude>**/metastore_db/</exclude> + <exclude>**/README.md</exclude> + <exclude>dependency-reduced-pom.xml</exclude> + </excludes> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-deploy-plugin</artifactId> + <version>2.7</version> + <configuration> + <skip>true</skip> + </configuration> + </plugin> + + <plugin> + <artifactId>maven-enforcer-plugin</artifactId> + <version>1.3.1</version> + <executions> + <execution> + <id>enforce</id> + <phase>none</phase> + </execution> + </executions> + </plugin> + + <plugin> + <artifactId>maven-clean-plugin</artifactId> + <configuration> + <filesets> + <fileset> + <directory>../interpreter/spark/dep</directory> + </fileset> + </filesets> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.17</version> + <configuration> + <forkCount>1</forkCount> + <reuseForks>false</reuseForks> + <argLine>-Xmx1024m -XX:MaxPermSize=256m</argLine> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + 
<artifactId>maven-shade-plugin</artifactId> + <version>2.3</version> + <configuration> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>org/datanucleus/**</exclude> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> + <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> + <resource>reference.conf</resource> + </transformer> + </transformers> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + </execution> + </executions> + </plugin> + + <!-- Deploy datanucleus jars to the interpreter/spark directory --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <version>2.8</version> + <executions> + <execution> + <id>copy-dependencies</id> + <phase>package</phase> + <goals> + <goal>copy-dependencies</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/../../interpreter/spark/dep</outputDirectory> + <overWriteReleases>false</overWriteReleases> + <overWriteSnapshots>false</overWriteSnapshots> + <overWriteIfNewer>true</overWriteIfNewer> + <includeGroupIds>org.datanucleus</includeGroupIds> + </configuration> + </execution> + <execution> + <phase>package</phase> + <goals> + <goal>copy</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/../../interpreter/spark/dep</outputDirectory> + <overWriteReleases>false</overWriteReleases> + <overWriteSnapshots>false</overWriteSnapshots> + <overWriteIfNewer>true</overWriteIfNewer> + <artifactItems> + <artifactItem> + <groupId>${project.groupId}</groupId> + <artifactId>${project.artifactId}</artifactId> + <version>${project.version}</version> + <type>${project.packaging}</type> + </artifactItem> + </artifactItems> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/spark/pom.xml ---------------------------------------------------------------------- diff --git a/spark/pom.xml b/spark/pom.xml index aa07687..59da081 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -34,23 +34,13 @@ <description>Zeppelin spark support</description> <url>http://zeppelin.incubator.apache.org</url> - <properties> <spark.version>1.4.1</spark.version> <scala.version>2.10.4</scala.version> <scala.binary.version>2.10</scala.binary.version> <hadoop.version>2.3.0</hadoop.version> - <yarn.version>${hadoop.version}</yarn.version> - <avro.version>1.7.7</avro.version> - <avro.mapred.classifier></avro.mapred.classifier> - <jets3t.version>0.7.1</jets3t.version> - <protobuf.version>2.4.1</protobuf.version> - - <akka.group>org.spark-project.akka</akka.group> - <akka.version>2.3.4-spark</akka.version> - - <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url> + <py4j.version>0.8.2.1</py4j.version> </properties> <repositories> @@ -59,192 +49,6 @@ <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> </repository> </repositories> - - <dependencyManagement> - <dependencies> - <dependency> - <groupId>org.apache.avro</groupId> - <artifactId>avro</artifactId> - <version>${avro.version}</version> - </dependency> - <dependency> - 
<groupId>org.apache.avro</groupId> - <artifactId>avro-ipc</artifactId> - <version>${avro.version}</version> - <exclusions> - <exclusion> - <groupId>io.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty-util</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.apache.velocity</groupId> - <artifactId>velocity</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.avro</groupId> - <artifactId>avro-mapred</artifactId> - <version>${avro.version}</version> - <classifier>${avro.mapred.classifier}</classifier> - <exclusions> - <exclusion> - <groupId>io.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty-util</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.apache.velocity</groupId> - <artifactId>velocity</artifactId> - </exclusion> - </exclusions> - </dependency> - - <!-- See SPARK-1556 for info on this dependency: --> - <dependency> - <groupId>net.java.dev.jets3t</groupId> - <artifactId>jets3t</artifactId> - <version>${jets3t.version}</version> - <scope>runtime</scope> - <exclusions> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-api</artifactId> - <version>${yarn.version}</version> - <exclusions> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.ow2.asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.jboss.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-common</artifactId> - <version>${yarn.version}</version> - <exclusions> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.ow2.asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.jboss.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-server-web-proxy</artifactId> - <version>${yarn.version}</version> - <exclusions> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.ow2.asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.jboss.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - 
</exclusion> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-client</artifactId> - <version>${yarn.version}</version> - <exclusions> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.ow2.asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - <exclusion> - <groupId>org.jboss.netty</groupId> - <artifactId>netty</artifactId> - </exclusion> - <exclusion> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - </exclusions> - </dependency> - </dependencies> - </dependencyManagement> <dependencies> <dependency> @@ -264,104 +68,19 @@ <scope>provided</scope> </dependency> - <!-- Spark --> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-core_2.10</artifactId> - <version>${spark.version}</version> - <exclusions> - <exclusion> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - </exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-repl_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-sql_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-hive_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-streaming_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-streaming-twitter_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-catalyst_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - </dependency> - - - <!-- hadoop --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - <version>${hadoop.version}</version> - </dependency> - - <dependency> - <groupId>com.google.protobuf</groupId> - <artifactId>protobuf-java</artifactId> - <version>${protobuf.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>zeppelin-spark-dependencies</artifactId> + <version>${project.version}</version> + <scope>provided</scope> </dependency> <dependency> - <groupId>${akka.group}</groupId> - <artifactId>akka-actor_${scala.binary.version}</artifactId> - <version>${akka.version}</version> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>14.0.1</version> </dependency> - <dependency> - <groupId>${akka.group}</groupId> - <artifactId>akka-remote_${scala.binary.version}</artifactId> - <version>${akka.version}</version> - </dependency> - <dependency> - <groupId>${akka.group}</groupId> - <artifactId>akka-slf4j_${scala.binary.version}</artifactId> - <version>${akka.version}</version> - </dependency> - <dependency> - <groupId>${akka.group}</groupId> - <artifactId>akka-testkit_${scala.binary.version}</artifactId> - <version>${akka.version}</version> - </dependency> - <dependency> - <groupId>${akka.group}</groupId> - 
<artifactId>akka-zeromq_${scala.binary.version}</artifactId> - <version>${akka.version}</version> - <exclusions> - <exclusion> - <groupId>${akka.group}</groupId> - <artifactId>akka-actor_${scala.binary.version}</artifactId> - </exclusion> - </exclusions> - </dependency> - + <!-- Aether :: maven dependency resolution --> <dependency> <groupId>org.apache.maven</groupId> @@ -482,6 +201,92 @@ <version>1.1</version> </dependency> + <!-- to compile and test code. + Runtime dependency is provided by either spark-dependencies submodule or SPARK_HOME + --> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-core_2.10</artifactId> + <version>${spark.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-repl_2.10</artifactId> + <version>${spark.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql_2.10</artifactId> + <version>${spark.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-hive_2.10</artifactId> + <version>${spark.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-catalyst_2.10</artifactId> + <version>${spark.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <version>${scala.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-compiler</artifactId> + <version>${scala.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-reflect</artifactId> + <version>${scala.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.9</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>net.sf.py4j</groupId> + <artifactId>py4j</artifactId> + <version>${py4j.version}</version> + </dependency> + + <!--TEST--> <dependency> <groupId>org.scalatest</groupId> @@ -497,356 +302,6 @@ </dependency> </dependencies> - <profiles> - <profile> - <id>spark-1.1</id> - <dependencies> - - </dependencies> - <properties> - <spark.version>1.1.1</spark.version> - <akka.version>2.2.3-shaded-protobuf</akka.version> - </properties> - </profile> - - <profile> - <id>cassandra-spark-1.1</id> - <dependencies> - <dependency> - <groupId>com.datastax.spark</groupId> - <artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> - <version>1.1.1</version> - <exclusions> - <exclusion> - <groupId>org.joda</groupId> - <artifactId>joda-convert</artifactId> - </exclusion> - </exclusions> - </dependency> - </dependencies> - <properties> - <spark.version>1.1.1</spark.version> - <akka.version>2.2.3-shaded-protobuf</akka.version> - </properties> - </profile> - - <profile> - <id>spark-1.2</id> - <dependencies> - </dependencies> - <properties> - 
<spark.version>1.2.1</spark.version> - </properties> - </profile> - - <profile> - <id>cassandra-spark-1.2</id> - <properties> - <spark.version>1.2.1</spark.version> - </properties> - <dependencies> - <dependency> - <groupId>com.datastax.spark</groupId> - <artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> - <version>1.2.1</version> - <exclusions> - <exclusion> - <groupId>org.joda</groupId> - <artifactId>joda-convert</artifactId> - </exclusion> - </exclusions> - </dependency> - </dependencies> - </profile> - - <profile> - <id>spark-1.3</id> - - <properties> - <spark.version>1.3.1</spark.version> - </properties> - - <dependencies> - </dependencies> - - </profile> - - <profile> - <id>cassandra-spark-1.3</id> - <properties> - <spark.version>1.3.0</spark.version> - </properties> - - <dependencies> - <dependency> - <groupId>com.datastax.spark</groupId> - <artifactId>spark-cassandra-connector_${scala.binary.version}</artifactId> - <!--You need to build your own version of Spark Cassandra connector 1.3.0-SNAPSHOT - because it is not yet released--> - <version>1.3.0-SNAPSHOT</version> - <exclusions> - <exclusion> - <groupId>org.joda</groupId> - <artifactId>joda-convert</artifactId> - </exclusion> - </exclusions> - </dependency> - </dependencies> - </profile> - - <profile> - <id>spark-1.4</id> - <properties> - <spark.version>1.4.1</spark.version> - </properties> - - <dependencies> - </dependencies> - </profile> - - <profile> - <id>hadoop-0.23</id> - <!-- SPARK-1121: Adds an explicit dependency on Avro to work around a - Hadoop 0.23.X issue --> - <dependencies> - <dependency> - <groupId>org.apache.avro</groupId> - <artifactId>avro</artifactId> - </dependency> - </dependencies> - <properties> - <hadoop.version>0.23.10</hadoop.version> - </properties> - </profile> - - <profile> - <id>hadoop-1</id> - <properties> - <hadoop.version>1.0.4</hadoop.version> - <avro.mapred.classifier>hadoop1</avro.mapred.classifier> - <codehaus.jackson.version>1.8.8</codehaus.jackson.version> - <akka.group>org.spark-project.akka</akka.group> - </properties> - </profile> - - <profile> - <id>hadoop-2.2</id> - <properties> - <hadoop.version>2.2.0</hadoop.version> - <protobuf.version>2.5.0</protobuf.version> - <avro.mapred.classifier>hadoop2</avro.mapred.classifier> - </properties> - </profile> - - <profile> - <id>hadoop-2.3</id> - <properties> - <hadoop.version>2.3.0</hadoop.version> - <protobuf.version>2.5.0</protobuf.version> - <jets3t.version>0.9.3</jets3t.version> - <avro.mapred.classifier>hadoop2</avro.mapred.classifier> - </properties> - </profile> - - <profile> - <id>hadoop-2.4</id> - <properties> - <hadoop.version>2.4.0</hadoop.version> - <protobuf.version>2.5.0</protobuf.version> - <jets3t.version>0.9.3</jets3t.version> - <avro.mapred.classifier>hadoop2</avro.mapred.classifier> - </properties> - </profile> - - <profile> - <id>hadoop-2.6</id> - <properties> - <hadoop.version>2.6.0</hadoop.version> - <protobuf.version>2.5.0</protobuf.version> - <jets3t.version>0.9.3</jets3t.version> - <avro.mapred.classifier>hadoop2</avro.mapred.classifier> - </properties> - </profile> - - <profile> - <id>mapr3</id> - <activation> - <activeByDefault>false</activeByDefault> - </activation> - <properties> - <hadoop.version>1.0.3-mapr-3.0.3</hadoop.version> - <yarn.version>2.3.0-mapr-4.0.0-FCS</yarn.version> - <jets3t.version>0.7.1</jets3t.version> - </properties> - </profile> - - <profile> - <id>mapr4</id> - <activation> - <activeByDefault>false</activeByDefault> - </activation> - <properties> - 
<hadoop.version>2.3.0-mapr-4.0.0-FCS</hadoop.version> - <yarn.version>2.3.0-mapr-4.0.0-FCS</yarn.version> - <jets3t.version>0.7.1</jets3t.version> - </properties> - <dependencies> - <dependency> - <groupId>org.apache.curator</groupId> - <artifactId>curator-recipes</artifactId> - <version>2.4.0</version> - <exclusions> - <exclusion> - <groupId>org.apache.zookeeper</groupId> - <artifactId>zookeeper</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.zookeeper</groupId> - <artifactId>zookeeper</artifactId> - <version>3.4.5-mapr-1406</version> - </dependency> - </dependencies> - </profile> - - <profile> - <id>yarn</id> - <dependencies> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-yarn_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-api</artifactId> - <version>${yarn.version}</version> - </dependency> - </dependencies> - </profile> - - <profile> - <id>pyspark</id> - <properties> - <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz - </spark.download.url> - </properties> - <build> - <plugins> - <plugin> - <groupId>com.googlecode.maven-download-plugin</groupId> - <artifactId>download-maven-plugin</artifactId> - <version>1.2.1</version> - <executions> - <execution> - <id>download-pyspark-files</id> - <phase>validate</phase> - <goals> - <goal>wget</goal> - </goals> - <configuration> - <url>${spark.download.url}</url> - <unpack>true</unpack> - <outputDirectory>${project.build.directory}/spark-dist</outputDirectory> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <artifactId>maven-clean-plugin</artifactId> - <configuration> - <filesets> - <fileset> - <directory>${basedir}/../python/build</directory> - </fileset> - <fileset> - <directory>${project.build.directory}/spark-dist</directory> - </fileset> - </filesets> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-antrun-plugin</artifactId> - <version>1.7</version> - <executions> - <execution> - <id>download-and-zip-pyspark-files</id> - <phase>generate-resources</phase> - <goals> - <goal>run</goal> - </goals> - <configuration> - <target> - <delete dir="../interpreter/spark/pyspark"/> - <copy todir="../interpreter/spark/pyspark" - file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-0.8.2.1-src.zip"/> - <zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip" - basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python" - includes="pyspark/*.py,pyspark/**/*.py"/> - </target> - </configuration> - </execution> - </executions> - </plugin> - </plugins> - </build> - </profile> - - <!-- Build without Hadoop dependencies that are included in some runtime environments. 
--> - <profile> - <id>hadoop-provided</id> - <activation> - <activeByDefault>false</activeByDefault> - </activation> - <dependencies> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-api</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-common</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-server-web-proxy</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-yarn-client</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.avro</groupId> - <artifactId>avro</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.avro</groupId> - <artifactId>avro-ipc</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.zookeeper</groupId> - <artifactId>zookeeper</artifactId> - <version>${zookeeper.version}</version> - <scope>provided</scope> - </dependency> - </dependencies> - </profile> - </profiles> - <build> <plugins> <plugin> @@ -890,6 +345,17 @@ </plugin> <plugin> + <artifactId>maven-clean-plugin</artifactId> + <configuration> + <filesets> + <fileset> + <directory>../interpreter/spark</directory> + </fileset> + </filesets> + </configuration> + </plugin> + + <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> <version>2.17</version> @@ -902,40 +368,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-shade-plugin</artifactId> - <version>2.3</version> - <configuration> - <filters> - <filter> - <artifact>*:*</artifact> - <excludes> - <exclude>org/datanucleus/**</exclude> - <exclude>META-INF/*.SF</exclude> - <exclude>META-INF/*.DSA</exclude> - <exclude>META-INF/*.RSA</exclude> - </excludes> - </filter> - </filters> - <transformers> - <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> - <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> - <resource>reference.conf</resource> - </transformer> - </transformers> - </configuration> - <executions> - <execution> - <phase>package</phase> - <goals> - <goal>shade</goal> - </goals> - </execution> - </executions> - </plugin> - - <!-- Deploy datanucleus jars to the interpreter/spark directory --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-dependency-plugin</artifactId> <version>2.8</version> <executions> @@ -950,7 +382,7 @@ <overWriteReleases>false</overWriteReleases> <overWriteSnapshots>false</overWriteSnapshots> <overWriteIfNewer>true</overWriteIfNewer> - <includeGroupIds>org.datanucleus</includeGroupIds> + <includeScope>runtime</includeScope> </configuration> </execution> <execution> @@ -963,6 +395,7 @@ <overWriteReleases>false</overWriteReleases> <overWriteSnapshots>false</overWriteSnapshots> <overWriteIfNewer>true</overWriteIfNewer> + <includeScope>runtime</includeScope> <artifactItems> <artifactItem> <groupId>${project.groupId}</groupId> http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java ---------------------------------------------------------------------- diff 
--git a/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java b/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java index a4ff494..e684c52 100644 --- a/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java +++ b/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java @@ -29,6 +29,7 @@ import java.net.URLClassLoader; import java.util.*; import com.google.common.base.Joiner; + import org.apache.spark.HttpServer; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; @@ -40,7 +41,7 @@ import org.apache.spark.repl.SparkJLineCompletion; import org.apache.spark.scheduler.ActiveJob; import org.apache.spark.scheduler.DAGScheduler; import org.apache.spark.scheduler.Pool; -import org.apache.spark.scheduler.Stage; +import org.apache.spark.scheduler.SparkListener; import org.apache.spark.sql.SQLContext; import org.apache.spark.ui.jobs.JobProgressListener; import org.apache.zeppelin.interpreter.Interpreter; @@ -67,6 +68,7 @@ import scala.Tuple2; import scala.collection.Iterator; import scala.collection.JavaConversions; import scala.collection.JavaConverters; +import scala.collection.Seq; import scala.collection.mutable.HashMap; import scala.collection.mutable.HashSet; import scala.tools.nsc.Settings; @@ -156,7 +158,14 @@ public class SparkInterpreter extends Interpreter { private static JobProgressListener setupListeners(SparkContext context) { JobProgressListener pl = new JobProgressListener(context.getConf()); - context.listenerBus().addListener(pl); + try { + Object listenerBus = context.getClass().getMethod("listenerBus").invoke(context); + Method m = listenerBus.getClass().getMethod("addListener", SparkListener.class); + m.invoke(listenerBus, pl); + } catch (NoSuchMethodException | SecurityException | IllegalAccessException + | IllegalArgumentException | InvocationTargetException e) { + e.printStackTrace(); + } return pl; } @@ -271,7 +280,7 @@ public class SparkInterpreter extends Interpreter { } //TODO(jongyoul): Move these codes into PySparkInterpreter.java - + String pysparkBasePath = getSystemDefault("SPARK_HOME", "spark.home", null); File pysparkPath; if (null == pysparkBasePath) { @@ -606,7 +615,7 @@ public class SparkInterpreter extends Interpreter { String incomplete = ""; for (int l = 0; l < linesToRun.length; l++) { - String s = linesToRun[l]; + String s = linesToRun[l]; // check if next line starts with "." (but not ".." 
or "./") it is treated as an invocation if (l + 1 < linesToRun.length) { String nextLine = linesToRun[l + 1].trim(); @@ -671,18 +680,26 @@ public class SparkInterpreter extends Interpreter { if (jobGroup.equals(g)) { int[] progressInfo = null; - if (sc.version().startsWith("1.0")) { - progressInfo = getProgressFromStage_1_0x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.1")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.2")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.3")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.4")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else { - continue; + try { + Object finalStage = job.getClass().getMethod("finalStage").invoke(job); + if (sc.version().startsWith("1.0")) { + progressInfo = getProgressFromStage_1_0x(sparkListener, finalStage); + } else if (sc.version().startsWith("1.1")) { + progressInfo = getProgressFromStage_1_1x(sparkListener, finalStage); + } else if (sc.version().startsWith("1.2")) { + progressInfo = getProgressFromStage_1_1x(sparkListener, finalStage); + } else if (sc.version().startsWith("1.3")) { + progressInfo = getProgressFromStage_1_1x(sparkListener, finalStage); + } else if (sc.version().startsWith("1.4")) { + progressInfo = getProgressFromStage_1_1x(sparkListener, finalStage); + } else { + continue; + } + } catch (IllegalAccessException | IllegalArgumentException + | InvocationTargetException | NoSuchMethodException + | SecurityException e) { + logger.error("Can't get progress info", e); + return 0; } totalTasks += progressInfo[0]; completedTasks += progressInfo[1]; @@ -695,33 +712,27 @@ public class SparkInterpreter extends Interpreter { return completedTasks * 100 / totalTasks; } - private int[] getProgressFromStage_1_0x(JobProgressListener sparkListener, Stage stage) { - int numTasks = stage.numTasks(); + private int[] getProgressFromStage_1_0x(JobProgressListener sparkListener, Object stage) + throws IllegalAccessException, IllegalArgumentException, + InvocationTargetException, NoSuchMethodException, SecurityException { + int numTasks = (int) stage.getClass().getMethod("numTasks").invoke(stage); int completedTasks = 0; - Method method; + int id = (int) stage.getClass().getMethod("id").invoke(stage); + Object completedTaskInfo = null; - try { - method = sparkListener.getClass().getMethod("stageIdToTasksComplete"); - completedTaskInfo = - JavaConversions.asJavaMap((HashMap<Object, Object>) method.invoke(sparkListener)).get( - stage.id()); - } catch (NoSuchMethodException | SecurityException e) { - logger.error("Error while getting progress", e); - } catch (IllegalAccessException e) { - logger.error("Error while getting progress", e); - } catch (IllegalArgumentException e) { - logger.error("Error while getting progress", e); - } catch (InvocationTargetException e) { - logger.error("Error while getting progress", e); - } + + completedTaskInfo = JavaConversions.asJavaMap( + (HashMap<Object, Object>) sparkListener.getClass() + .getMethod("stageIdToTasksComplete").invoke(sparkListener)).get(id); if (completedTaskInfo != null) { completedTasks += (int) completedTaskInfo; } - List<Stage> parents = JavaConversions.asJavaList(stage.parents()); + List<Object> parents = JavaConversions.asJavaList((Seq<Object>) stage.getClass() + 
.getMethod("parents").invoke(stage)); if (parents != null) { - for (Stage s : parents) { + for (Object s : parents) { int[] p = getProgressFromStage_1_0x(sparkListener, s); numTasks += p[0]; completedTasks += p[1]; @@ -731,9 +742,12 @@ public class SparkInterpreter extends Interpreter { return new int[] {numTasks, completedTasks}; } - private int[] getProgressFromStage_1_1x(JobProgressListener sparkListener, Stage stage) { - int numTasks = stage.numTasks(); + private int[] getProgressFromStage_1_1x(JobProgressListener sparkListener, Object stage) + throws IllegalAccessException, IllegalArgumentException, + InvocationTargetException, NoSuchMethodException, SecurityException { + int numTasks = (int) stage.getClass().getMethod("numTasks").invoke(stage); int completedTasks = 0; + int id = (int) stage.getClass().getMethod("id").invoke(stage); try { Method stageIdToData = sparkListener.getClass().getMethod("stageIdToData"); @@ -747,7 +761,7 @@ public class SparkInterpreter extends Interpreter { Set<Tuple2<Object, Object>> keys = JavaConverters.asJavaSetConverter(stageIdData.keySet()).asJava(); for (Tuple2<Object, Object> k : keys) { - if (stage.id() == (int) k._1()) { + if (id == (int) k._1()) { Object uiData = stageIdData.get(k).get(); completedTasks += (int) numCompletedTasks.invoke(uiData); } @@ -756,9 +770,10 @@ public class SparkInterpreter extends Interpreter { logger.error("Error on getting progress information", e); } - List<Stage> parents = JavaConversions.asJavaList(stage.parents()); + List<Object> parents = JavaConversions.asJavaList((Seq<Object>) stage.getClass() + .getMethod("parents").invoke(stage)); if (parents != null) { - for (Stage s : parents) { + for (Object s : parents) { int[] p = getProgressFromStage_1_1x(sparkListener, s); numTasks += p[0]; completedTasks += p[1]; http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/spark/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java ---------------------------------------------------------------------- diff --git a/spark/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java b/spark/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java index d3bda44..053b887 100644 --- a/spark/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java +++ b/spark/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java @@ -21,21 +21,15 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.List; import java.util.Properties; -import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import org.apache.spark.SparkContext; -import org.apache.spark.scheduler.ActiveJob; -import org.apache.spark.scheduler.DAGScheduler; -import org.apache.spark.scheduler.Stage; import org.apache.spark.sql.SQLContext; -import org.apache.spark.ui.jobs.JobProgressListener; import org.apache.zeppelin.interpreter.Interpreter; import org.apache.zeppelin.interpreter.InterpreterContext; import org.apache.zeppelin.interpreter.InterpreterException; import org.apache.zeppelin.interpreter.InterpreterPropertyBuilder; import org.apache.zeppelin.interpreter.InterpreterResult; -import org.apache.zeppelin.interpreter.InterpreterUtils; import org.apache.zeppelin.interpreter.InterpreterResult.Code; import org.apache.zeppelin.interpreter.LazyOpenInterpreter; import org.apache.zeppelin.interpreter.WrappedInterpreter; @@ -44,13 +38,6 @@ import org.apache.zeppelin.scheduler.SchedulerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import 
scala.Tuple2; -import scala.collection.Iterator; -import scala.collection.JavaConversions; -import scala.collection.JavaConverters; -import scala.collection.mutable.HashMap; -import scala.collection.mutable.HashSet; - /** * Spark SQL interpreter for Zeppelin. * @@ -129,8 +116,19 @@ public class SparkSqlInterpreter extends Interpreter { sc.setLocalProperty("spark.scheduler.pool", null); } + Object rdd = null; + try { + // method signature of sqlc.sql() is changed + // from def sql(sqlText: String): SchemaRDD (1.2 and prior) + // to def sql(sqlText: String): DataFrame (1.3 and later). + // Therefore need to use reflection to keep binary compatibility for all spark versions. + Method sqlMethod = sqlc.getClass().getMethod("sql", String.class); + rdd = sqlMethod.invoke(sqlc, st); + } catch (NoSuchMethodException | SecurityException | IllegalAccessException + | IllegalArgumentException | InvocationTargetException e) { + throw new InterpreterException(e); + } - Object rdd = sqlc.sql(st); String msg = ZeppelinContext.showDF(sc, context, rdd, maxResult); return new InterpreterResult(Code.SUCCESS, msg); } @@ -151,117 +149,10 @@ public class SparkSqlInterpreter extends Interpreter { @Override public int getProgress(InterpreterContext context) { - String jobGroup = getJobGroup(context); - SQLContext sqlc = getSparkInterpreter().getSQLContext(); - SparkContext sc = sqlc.sparkContext(); - JobProgressListener sparkListener = getSparkInterpreter().getJobProgressListener(); - int completedTasks = 0; - int totalTasks = 0; - - DAGScheduler scheduler = sc.dagScheduler(); - HashSet<ActiveJob> jobs = scheduler.activeJobs(); - Iterator<ActiveJob> it = jobs.iterator(); - while (it.hasNext()) { - ActiveJob job = it.next(); - String g = (String) job.properties().get("spark.jobGroup.id"); - if (jobGroup.equals(g)) { - int[] progressInfo = null; - if (sc.version().startsWith("1.0")) { - progressInfo = getProgressFromStage_1_0x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.1")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.2")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.3")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else if (sc.version().startsWith("1.4")) { - progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage()); - } else { - logger.warn("Spark {} getting progress information not supported" + sc.version()); - continue; - } - totalTasks += progressInfo[0]; - completedTasks += progressInfo[1]; - } - } - - if (totalTasks == 0) { - return 0; - } - return completedTasks * 100 / totalTasks; - } - - private int[] getProgressFromStage_1_0x(JobProgressListener sparkListener, Stage stage) { - int numTasks = stage.numTasks(); - int completedTasks = 0; - - Method method; - Object completedTaskInfo = null; - try { - method = sparkListener.getClass().getMethod("stageIdToTasksComplete"); - completedTaskInfo = - JavaConversions.asJavaMap((HashMap<Object, Object>) method.invoke(sparkListener)).get( - stage.id()); - } catch (NoSuchMethodException | SecurityException e) { - logger.error("Error while getting progress", e); - } catch (IllegalAccessException e) { - logger.error("Error while getting progress", e); - } catch (IllegalArgumentException e) { - logger.error("Error while getting progress", e); - } catch (InvocationTargetException e) { - logger.error("Error while getting progress", e); - } - - if 
(completedTaskInfo != null) { - completedTasks += (int) completedTaskInfo; - } - List<Stage> parents = JavaConversions.asJavaList(stage.parents()); - if (parents != null) { - for (Stage s : parents) { - int[] p = getProgressFromStage_1_0x(sparkListener, s); - numTasks += p[0]; - completedTasks += p[1]; - } - } - - return new int[] {numTasks, completedTasks}; + SparkInterpreter sparkInterpreter = getSparkInterpreter(); + return sparkInterpreter.getProgress(context); } - private int[] getProgressFromStage_1_1x(JobProgressListener sparkListener, Stage stage) { - int numTasks = stage.numTasks(); - int completedTasks = 0; - - try { - Method stageIdToData = sparkListener.getClass().getMethod("stageIdToData"); - HashMap<Tuple2<Object, Object>, Object> stageIdData = - (HashMap<Tuple2<Object, Object>, Object>) stageIdToData.invoke(sparkListener); - Class<?> stageUIDataClass = - this.getClass().forName("org.apache.spark.ui.jobs.UIData$StageUIData"); - - Method numCompletedTasks = stageUIDataClass.getMethod("numCompleteTasks"); - - Set<Tuple2<Object, Object>> keys = - JavaConverters.asJavaSetConverter(stageIdData.keySet()).asJava(); - for (Tuple2<Object, Object> k : keys) { - if (stage.id() == (int) k._1()) { - Object uiData = stageIdData.get(k).get(); - completedTasks += (int) numCompletedTasks.invoke(uiData); - } - } - } catch (Exception e) { - logger.error("Error on getting progress information", e); - } - - List<Stage> parents = JavaConversions.asJavaList(stage.parents()); - if (parents != null) { - for (Stage s : parents) { - int[] p = getProgressFromStage_1_1x(sparkListener, s); - numTasks += p[0]; - completedTasks += p[1]; - } - } - return new int[] {numTasks, completedTasks}; - } @Override public Scheduler getScheduler() { http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/5de01c68/zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java ---------------------------------------------------------------------- diff --git a/zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java b/zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java index aa395aa..aab8043 100644 --- a/zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java +++ b/zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java @@ -108,8 +108,12 @@ public abstract class AbstractTestRestApi { // ci environment runs spark cluster for testing // so configure zeppelin use spark cluster if ("true".equals(System.getenv("CI"))) { - // assume first one is spark - InterpreterSetting sparkIntpSetting = ZeppelinServer.notebook.getInterpreterFactory().get().get(0); + InterpreterSetting sparkIntpSetting = null; + for(InterpreterSetting intpSetting : ZeppelinServer.notebook.getInterpreterFactory().get()) { + if (intpSetting.getGroup().equals("spark")) { + sparkIntpSetting = intpSetting; + } + } // set spark master sparkIntpSetting.getProperties().setProperty("master", "spark://" + getHostname() + ":7071"); @@ -120,8 +124,12 @@ public abstract class AbstractTestRestApi { ZeppelinServer.notebook.getInterpreterFactory().restart(sparkIntpSetting.id()); } else { - // assume first one is spark - InterpreterSetting sparkIntpSetting = ZeppelinServer.notebook.getInterpreterFactory().get().get(0); + InterpreterSetting sparkIntpSetting = null; + for(InterpreterSetting intpSetting : ZeppelinServer.notebook.getInterpreterFactory().get()) { + if (intpSetting.getGroup().equals("spark")) { + sparkIntpSetting = intpSetting; + } + } String 
sparkHome = getSparkHome(); if (sparkHome != null) {
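
Note on the interpreter changes above: the recurring edit in both SparkInterpreter.java and SparkSqlInterpreter.java is the same pattern. Direct calls against Spark classes whose visibility or signatures differ across the 1.x releases the interpreter supports (SparkContext.listenerBus(), ActiveJob.finalStage(), the Stage accessors, SQLContext.sql()) are replaced with reflective lookups, so the compiled interpreter no longer links against any single Spark build. Below is a minimal, self-contained sketch of that pattern; the helper name invokeNoArg is illustrative only and does not appear in the patch.

    import java.lang.reflect.InvocationTargetException;
    import java.lang.reflect.Method;

    class ReflectiveCall {
      // Invoke a no-argument method by name on the given object.
      // Returns null if the method is missing or cannot be invoked,
      // instead of failing to link at class-load time.
      static Object invokeNoArg(Object target, String methodName) {
        try {
          Method m = target.getClass().getMethod(methodName);
          return m.invoke(target);
        } catch (NoSuchMethodException | IllegalAccessException
            | InvocationTargetException e) {
          return null;
        }
      }
    }

With a helper like this, invokeNoArg(job, "finalStage") returns the stage object without ever importing org.apache.spark.scheduler.Stage, which is why that import could be dropped from both interpreters while the version-specific progress logic still works against whatever Spark is on the classpath.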

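The AbstractTestRestApi change at the end of the diff stops assuming that the first InterpreterSetting returned by the factory is the Spark one and instead scans for the setting whose group is "spark". A sketch of that selection as a standalone helper is shown below; it is illustrative only (the patch inlines the loop at both call sites), and the Iterable parameter type is assumed from the enhanced for-loop in the diff.

    import org.apache.zeppelin.interpreter.InterpreterSetting;

    class SparkSettingLookup {
      // Return the interpreter setting whose group is "spark", or null
      // if none is registered; registration order no longer matters.
      static InterpreterSetting findSparkSetting(Iterable<InterpreterSetting> settings) {
        for (InterpreterSetting setting : settings) {
          if ("spark".equals(setting.getGroup())) {
            return setting;
          }
        }
        return null;
      }
    }

Selecting by group name keeps the REST API tests working even when another interpreter happens to be registered ahead of Spark, which matters once the test setup configures SPARK_HOME or a CI Spark master on that specific setting.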