Ottomata has uploaded a new change for review.
https://gerrit.wikimedia.org/r/191665
Change subject: Puppetizing spark (in YARN)
......................................................................
Puppetizing spark (in YARN)
:D
Change-Id: I832e96840d89fa93b796f93e89d40f13f8387adc
---
M TODO.md
A manifests/spark.pp
A templates/spark/spark-env.sh.erb
3 files changed, 173 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet/cdh
refs/changes/65/191665/1
diff --git a/TODO.md b/TODO.md
index f58b32a..4ad9c00 100644
--- a/TODO.md
+++ b/TODO.md
@@ -12,6 +12,9 @@
- Make hadoop-metrics2.properties more configurable.
- Support HA automatic failover.
- HA NameNode Fencing support.
+- YARN HA
+- Create one variable for namenode address independent of nameservice_id and
primary_namenode_host_
+- Spark History Server?
## Zookeeper
diff --git a/manifests/spark.pp b/manifests/spark.pp
new file mode 100644
index 0000000..26b7f46
--- /dev/null
+++ b/manifests/spark.pp
@@ -0,0 +1,75 @@
+# == Class cdh::spark
+# Installs Spark set up to work in YARN mode.
+# You should include this on your client nodes.
+# This does not need to be on all worker nodes.
+#
+# Requires that cdh::hadoop is also included, since Spark-on-YARN
+# reads the Hadoop client configuration.
+#
+class cdh::spark {
+    # Spark requires Hadoop configs installed.
+    Class['cdh::hadoop'] -> Class['cdh::spark']
+
+    package { ['spark-core', 'spark-python']:
+        ensure => 'installed',
+    }
+
+    # Config files live in a cluster-named directory; /etc/spark/conf
+    # is pointed at it via update-alternatives so multiple cluster
+    # configs can coexist on one host.
+    $config_directory = "/etc/spark/conf.${cdh::hadoop::cluster_name}"
+    file { $config_directory:
+        ensure  => 'directory',
+        require => Package['spark-core'],
+    }
+    cdh::alternative { 'spark-conf':
+        link => '/etc/spark/conf',
+        path => $config_directory,
+    }
+
+    # HDFS directories used by Spark:
+    #   /user/spark                      - spark user's home
+    #   /user/spark/share/lib            - shared assembly jar (see exec below)
+    #   /user/spark/applicationHistory   - job history (group-writable, sticky)
+    cdh::hadoop::directory { '/user/spark':
+        owner   => 'spark',
+        group   => 'spark',
+        mode    => '0755',
+        require => Package['spark-core'],
+    }
+    cdh::hadoop::directory { '/user/spark/share':
+        owner   => 'spark',
+        group   => 'spark',
+        mode    => '0755',
+        require => Cdh::Hadoop::Directory['/user/spark'],
+    }
+    cdh::hadoop::directory { '/user/spark/share/lib':
+        owner   => 'spark',
+        group   => 'spark',
+        mode    => '0755',
+        require => Cdh::Hadoop::Directory['/user/spark/share'],
+    }
+    cdh::hadoop::directory { '/user/spark/applicationHistory':
+        owner   => 'spark',
+        group   => 'spark',
+        mode    => '1775',
+        require => Cdh::Hadoop::Directory['/user/spark'],
+    }
+
+    # Put the Spark assembly jar into HDFS so that it
+    # doesn't have to be uploaded for each Spark job submission.
+    #
+    # Bug fix: $ha_enabled was previously referenced without ever being
+    # defined, so the selector always fell through to
+    # primary_namenode_host even on HA clusters.  Infer HA from whether
+    # a nameservice_id is configured.
+    # NOTE(review): assumes cdh::hadoop leaves nameservice_id undef when
+    # HA is not enabled -- confirm against the cdh::hadoop class.
+    $ha_enabled = $cdh::hadoop::nameservice_id ? {
+        undef   => false,
+        default => true,
+    }
+    $namenode_address = $ha_enabled ? {
+        true    => $cdh::hadoop::nameservice_id,
+        default => $cdh::hadoop::primary_namenode_host,
+    }
+    $spark_jar_hdfs_path = "hdfs://${namenode_address}/user/spark/share/lib/spark-assembly.jar"
+    exec { 'spark_assembly_jar_install':
+        command => "/usr/bin/hdfs dfs -put -f /usr/lib/spark/lib/spark-assembly.jar ${spark_jar_hdfs_path}",
+        unless  => '/usr/bin/hdfs dfs -ls /user/spark/share/lib/spark-assembly.jar | grep -q /user/spark/share/lib/spark-assembly.jar',
+        user    => 'spark',
+        require => Cdh::Hadoop::Directory['/user/spark/share/lib'],
+    }
+
+    # spark-env.sh references $spark_jar_hdfs_path, so render it after
+    # the jar has been installed into HDFS.
+    file { "${config_directory}/spark-env.sh":
+        content => template('cdh/spark/spark-env.sh.erb'),
+        require => Exec['spark_assembly_jar_install'],
+    }
+}
diff --git a/templates/spark/spark-env.sh.erb b/templates/spark/spark-env.sh.erb
new file mode 100755
index 0000000..f89738f
--- /dev/null
+++ b/templates/spark/spark-env.sh.erb
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+# Note: This file is managed by Puppet.
+
+# This file is sourced when running various Spark programs.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
+# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
+# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
+# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
+# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
+# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
+
+# Options for the daemons used in the standalone deploy mode
+# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+# Generic options for the daemons used in the standalone deploy mode
+# - SPARK_CONF_DIR      Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_LOG_DIR       Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_PID_DIR       Where the pid file is stored. (Default: /tmp)
+# - SPARK_IDENT_STRING  A string representing this instance of spark. (Default: $USER)
+# - SPARK_NICENESS      The scheduling priority for daemons. (Default: 0)
+
+###
+### === IMPORTANT ===
+### Change the following to specify a real cluster's Master host
+###
+export STANDALONE_SPARK_MASTER_HOST=`hostname`
+
+export SPARK_MASTER_IP=$STANDALONE_SPARK_MASTER_HOST
+
+### Let's run everything with JVM runtime, instead of Scala
+export SPARK_LAUNCH_WITH_SCALA=0
+export SPARK_LIBRARY_PATH=${SPARK_HOME}/lib
+export SCALA_LIBRARY_PATH=${SPARK_HOME}/lib
+export SPARK_MASTER_WEBUI_PORT=18080
+export SPARK_MASTER_PORT=7077
+export SPARK_WORKER_PORT=7078
+export SPARK_WORKER_WEBUI_PORT=18081
+export SPARK_WORKER_DIR=/var/run/spark/work
+export SPARK_LOG_DIR=/var/log/spark
+export SPARK_PID_DIR='/var/run/spark/'
+
+if [ -n "$HADOOP_HOME" ]; then
+    export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/libfakeroot:/usr/lib64/libfakeroot:/usr/lib32/libfakeroot:/lib/native
+fi
+
+# NOTE: This is a hack to fix a bug in the spark-env.sh currently shipped with CDH.
+# See: https://issues.cloudera.org/browse/DISTRO-664 and
+#      https://groups.google.com/a/cloudera.org/forum/#!topic/cdh-user/Q9wjkdoTqKg
+#
+# export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-etc/hadoop/conf}
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
+
+### Comment above 2 lines and uncomment the following if
+### you want to run with scala version, that is included with the package
+#export SCALA_HOME=${SCALA_HOME:-/usr/lib/spark/scala}
+#export PATH=$PATH:$SCALA_HOME/bin
+
+# Set SPARK_JAR to the spark-assembly.jar in HDFS. This makes it so
+# that the spark jar doesn't have to be uploaded to HDFS every time
+# a user submits a job.
+# See: http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/cdh_ig_running_spark_apps.html
+# If you upgrade spark, be sure to upload the new spark-assembly.jar to this HDFS path.
+#
+# Bug fix: @spark_jar_hdfs_path is already a fully qualified
+# "hdfs://..." URL (built in manifests/spark.pp), so prefixing it with
+# "hdfs://" again would render the invalid URL "hdfs://hdfs://...".
+export SPARK_JAR=<%= @spark_jar_hdfs_path %>
--
To view, visit https://gerrit.wikimedia.org/r/191665
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I832e96840d89fa93b796f93e89d40f13f8387adc
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet/cdh
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits