Ottomata has submitted this change and it was merged.

Change subject: Remove unused analytics role classes
......................................................................
Remove unused analytics role classes Bug: T109859 Change-Id: I48f7d76255ca0bf04322cd4e6e306e1a60ebf374 --- M hieradata/eqiad.yaml D manifests/role/analytics.pp D manifests/role/analytics/hadoop.pp D manifests/role/analytics/hive.pp D manifests/role/analytics/hue.pp D manifests/role/analytics/impala.pp D manifests/role/analytics/mahout.pp D manifests/role/analytics/oozie.pp D manifests/role/analytics/pig.pp D manifests/role/analytics/refinery.pp D manifests/role/analytics/spark.pp D manifests/role/analytics/sqoop.pp M modules/camus/manifests/job.pp 13 files changed, 2 insertions(+), 1,574 deletions(-) Approvals: Ottomata: Looks good to me, approved jenkins-bot: Verified diff --git a/hieradata/eqiad.yaml b/hieradata/eqiad.yaml index 8d10be5..756caef 100644 --- a/hieradata/eqiad.yaml +++ b/hieradata/eqiad.yaml @@ -141,11 +141,4 @@ labs_baremetal_servers: - '10.64.20.12' -# Used in role::analytics::hive::config -hive_server_host: analytics1027.eqiad.wmnet -hive_metastore_host: analytics1027.eqiad.wmnet - -# Used in role::analytics::oozie::config -oozie_host: analytics1027.eqiad.wmnet - ldap_labs_hostname: ldap-labs.eqiad.wikimedia.org diff --git a/manifests/role/analytics.pp b/manifests/role/analytics.pp deleted file mode 100644 index 81a9b5c..0000000 --- a/manifests/role/analytics.pp +++ /dev/null @@ -1,27 +0,0 @@ -# analytics servers (RT-1985) -# == Class role::analytics -# Base class for all analytics nodes. -# All analytics nodes should include this. -class role::analytics { - system::role { 'role::analytics': description => 'analytics server' } - - require_package('openjdk-7-jdk') - - # This packages conflicts with the hadoop-fuse-dfs - # and with impalad in that two libjvm.so files get added - # to LD_LIBRARY_PATH. We dont't need this - # package anyway, so ensure it is absent. - package { 'icedtea-7-jre-jamvm': - ensure => 'absent' - } -} - -# == Class role::analytics::hadoop::monitor_disks -# Installs monitoring plugins for disks -# -class role::analytics::monitor_disks { - if $::standard::has_ganglia { - ganglia::plugin::python { 'diskstat': } - } - -} diff --git a/manifests/role/analytics/hadoop.pp b/manifests/role/analytics/hadoop.pp deleted file mode 100644 index 071ac98..0000000 --- a/manifests/role/analytics/hadoop.pp +++ /dev/null @@ -1,826 +0,0 @@ - # role/analytics/hadoop.pp -# -# Role classes for Analytics Hadoop nodes. -# These role classes will configure Hadoop properly in either -# the Labs or Production environments. -# -# -# Production configs are hardcoded here. Labs has a few parameters -# that need to be specified as global variables via the Manage Instances GUI: -# -# $cluster_name - Logical name of this cluster. Required. -# -# $hadoop_namenodes - Comma separated list of FQDNs that should be NameNodes -# for this cluster. The first entry in the list -# is assumed to be the preferred primary NameNode. Required. -# This list will also be used as $resourcemanager_hosts. -# If hiera('zookeeper_hosts') is set, and this list has more -# than one entry, and $journalnode_hosts is also set, then -# HA YARN ResourceManager will be configured. -# TODO: Change the name of this variable to hadoop_masters -# When we make this work better with hiera. -# -# $journalnode_hosts - Comma separated list of FQDNs that should be JournalNodes -# for this cluster. Optional. If not specified, HA will not be configured. -# -# $heapsize - Optional. Set this to a value in MB to limit the JVM -# heapsize for all Hadoop daemons. Optional. 
-# -# -# Usage: -# -# To install only hadoop client packages and configs: -# include role::analytics::hadoop::client -# -# To install a Hadoop Master (NameNode + ResourceManager, etc.): -# include role::analytics::hadoop::master -# -# To install a Hadoop Worker (DataNode + NodeManager + etc.): -# include role::analytics::hadoop::worker -# - -# == Class role::analytics::hadoop::config -# This is just a config class. You can include this -# anywhere if you need to infer Hadoop configs. It -# only sets variables, it will not install or configure -# any packages. hadoop::client inherits from this class. -# -class role::analytics::hadoop::config { - - # Configs common to both Production and Labs. - $hadoop_var_directory = '/var/lib/hadoop' - $hadoop_name_directory = "${hadoop_var_directory}/name" - $hadoop_data_directory = "${hadoop_var_directory}/data" - $hadoop_journal_directory = "${hadoop_var_directory}/journal" - $dfs_block_size = 268435456 # 256 MB - $io_file_buffer_size = 131072 - # Turn on Snappy compression by default for maps and final outputs - $mapreduce_intermediate_compression_codec = 'org.apache.hadoop.io.compress.SnappyCodec' - $mapreduce_output_compression = true - $mapreduce_output_compression_codec = 'org.apache.hadoop.io.compress.SnappyCodec' - $mapreduce_output_compression_type = 'BLOCK' - $mapreduce_job_reuse_jvm_num_tasks = 1 - $fair_scheduler_template = 'hadoop/fair-scheduler.xml.erb' - # setting this to false or undef interferes with defining it within a node - $gelf_logging_enabled = false - - # This needs to be set in order to use Impala - $dfs_datanode_hdfs_blocks_metadata_enabled = true - - # Yarn App Master possible port ranges - $yarn_app_mapreduce_am_job_client_port_range = '55000-55199' - - # Look up zookeeper_hosts from hiera. - $zookeeper_hosts = keys(hiera('zookeeper_hosts', undef)) - - # Configs specific to Production. - if $::realm == 'production' { - # This is the logical name of the Analytics Hadoop cluster. - $cluster_name = 'analytics-hadoop' - - $namenode_hosts = [ - 'analytics1001.eqiad.wmnet', - 'analytics1002.eqiad.wmnet', - ] - $resourcemanager_hosts = $namenode_hosts - - # JournalNodes are colocated on worker DataNodes. - $journalnode_hosts = [ - 'analytics1052.eqiad.wmnet', # Row A3 - 'analytics1028.eqiad.wmnet', # Row C2 - 'analytics1035.eqiad.wmnet', # Row D2 - ] - - # analytics1011-analytics1020 have 12 mounts on disks sda - sdl. - if $::hostname =~ /analytics10(1[1-9]|20)/ { - $datanode_mounts = [ - "${hadoop_data_directory}/a", - "${hadoop_data_directory}/b", - "${hadoop_data_directory}/c", - "${hadoop_data_directory}/d", - "${hadoop_data_directory}/e", - "${hadoop_data_directory}/f", - "${hadoop_data_directory}/g", - "${hadoop_data_directory}/h", - "${hadoop_data_directory}/i", - "${hadoop_data_directory}/j", - "${hadoop_data_directory}/k", - "${hadoop_data_directory}/l", - ] - } - # analytics1028-analytics1041 have mounts on disks sdb - sdm. - # (sda is hardware raid on the 2 2.5 drives in the flex bays.) 
- else { - $datanode_mounts = [ - "${hadoop_data_directory}/b", - "${hadoop_data_directory}/c", - "${hadoop_data_directory}/d", - "${hadoop_data_directory}/e", - "${hadoop_data_directory}/f", - "${hadoop_data_directory}/g", - "${hadoop_data_directory}/h", - "${hadoop_data_directory}/i", - "${hadoop_data_directory}/j", - "${hadoop_data_directory}/k", - "${hadoop_data_directory}/l", - "${hadoop_data_directory}/m", - ] - } - - $mapreduce_reduce_shuffle_parallelcopies = 10 - $mapreduce_task_io_sort_mb = 200 - $mapreduce_task_io_sort_factor = 10 - - - # Configure memory based on these recommendations and then adjusted: - # http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.0.6.0/bk_installing_manually_book/content/rpm-chap1-11.html - - ### These Map/Reduce and YARN ApplicationMaster master settings are - # settable per job, and the defaults when clients submit them are often - # picked up from the local versions of the /etc/hadoop/conf/{mapred,yarn}-site.xml files. - # That means they should not be set relative to the local node facter variables, and as such - # use a hardcoded value of memory_per_container to work from. Otherwise a job - # submitted from a relatively small client node will use bad job defaults. - # - # We currently run two different types of worker nodes in production. - # The older Dells have 48G of RAM, and the newer ones have 64G. - # - # Using + 0 here ensures that these variables are - # integers (Fixnums) and won't throw errors - # when used with min()/max() functions. - - # Worker nodes are heterogenous, so I don't want to use a variable - # memory per container size across the cluster. Larger nodes will just - # allocate a few more containers. Setting this to 2G. - $memory_per_container_mb = 2048 + 0 - - # Map container size and JVM max heap size (-XmX) - $mapreduce_map_memory_mb = floor($memory_per_container_mb) - $mapreduce_reduce_memory_mb = floor(2 * $memory_per_container_mb) - $map_jvm_heap_size = floor(0.8 * $memory_per_container_mb) - # Reduce container size and JVM max heap size (-Xmx) - $mapreduce_map_java_opts = "-Xmx${map_jvm_heap_size}m" - $reduce_jvm_heap_size = floor(0.8 * 2 * $memory_per_container_mb) - $mapreduce_reduce_java_opts = "-Xmx${reduce_jvm_heap_size}m" - - # Yarn ApplicationMaster container size and max heap size (-Xmx) - $yarn_app_mapreduce_am_resource_mb = floor(2 * $memory_per_container_mb) - $mapreduce_am_heap_size = floor(0.8 * 2 * $memory_per_container_mb) - $yarn_app_mapreduce_am_command_opts = "-Xmx${mapreduce_am_heap_size}m" - - ### The amount of RAM for NodeManagers will only be be used by NodeManager - # processes running on the worker nodes themselves. Client nodes that submit - # jobs will ignore these settings. These are safe to set relative to the - # node currently evaluating puppet's facter variables. - - # Select a 'reserve' memory size for the - # OS and other Hadoop processes. 
- if $::memorysize_mb <= 1024 { - $reserve_memory_mb = 256 - } - elsif $::memorysize_mb <= 2048 { - $reserve_memory_mb = 512 - } - elsif $::memorysize_mb <= 4096 { - $reserve_memory_mb = 1024 - } - elsif $::memorysize_mb <= 16384 { - $reserve_memory_mb = 2048 - } - elsif $::memorysize_mb <= 24576 { - $reserve_memory_mb = 4096 - } - elsif $::memorysize_mb <= 49152 { - $reserve_memory_mb = 6144 - } - elsif $::memorysize_mb <= 73728 { - $reserve_memory_mb = 8192 - } - elsif $::memorysize_mb <= 98304 { - $reserve_memory_mb = 12288 - } - elsif $::memorysize_mb <= 131072 { - $reserve_memory_mb = 24576 - } - elsif $::memorysize_mb <= 262144 { - $reserve_memory_mb = 32768 - } - else { - $reserve_memory_mb = 65536 - } - - # Memory available for use by Hadoop jobs. - $available_memory_mb = $::memorysize_mb - $reserve_memory_mb - - # Since I have chosen a static $memory_per_container of 2048 across all - # node sizes, we should just choose to give NodeManagers - # $available_memory_mb to work with. - # This will give nodes with 48G of memory about 21 containers, and - # nodes with 64G memory about 28 containers. - # - # This is the total amount of memory that NodeManagers - # will use for allocation to containers. - $yarn_nodemanager_resource_memory_mb = floor($available_memory_mb) - - # Setting _minimum_allocation_mb to 0 to allow Impala to submit small reservation requests. - $yarn_scheduler_minimum_allocation_mb = 0 - $yarn_scheduler_maximum_allocation_mb = $yarn_nodemanager_resource_memory_mb - # Setting minimum_allocation_vcores to 0 to allow Impala to submit small reservation requests. - $yarn_scheduler_minimum_allocation_vcores = 0 - - # use net-topology.py.erb to map hostname to /datacenter/rack/row id. - $net_topology_script_template = 'hadoop/net-topology.py.erb' - $hadoop_heapsize = undef - # Increase NameNode heapsize independent from other daemons - $hadoop_namenode_opts = '-Xmx4096m' - - $yarn_heapsize = undef - - # TODO: use variables from new ganglia module once it is finished. - $ganglia_host = '208.80.154.10' - $ganglia_port = 9681 - $gelf_logging_host = 'logstash1002.eqiad.wmnet' - $gelf_logging_port = 12201 - # In production, make sure that HDFS user directories are - # created for everyone in these groups. - $hadoop_users_posix_groups = 'analytics-users analytics-privatedata-users analytics-admins analytics-search-users' - } - - # Configs specific to Labs. - elsif $::realm == 'labs' { - # These variables are configurable via the - # Labs Manage Instances GUI. - $namenode_hosts = $::hadoop_namenodes ? { - undef => [$::fqdn], - default => split($::hadoop_namenodes, ','), - } - $resourcemanager_hosts = $namenode_hosts - - $journalnode_hosts = $::hadoop_journalnodes ? { - undef => undef, - default => split($::hadoop_journalnodes, ','), - } - - $cluster_name = $::hadoop_cluster_name ? { - undef => undef, - default => $::hadoop_cluster_name, - } - - # Allow labs users to configure their Hadoop daemon - # Heapsize. NOTE: This will be applied to - # All Hadoop related services on this node. - $heapsize = $::hadoop_heapsize ? { - undef => undef, - default => $::hadoop_heapsize, - } - - $datanode_mounts = [ - "${hadoop_data_directory}/a", - "${hadoop_data_directory}/b", - ] - - # Labs sets these at undef, which lets the Hadoop defaults stick. 
- $hadoop_namenode_opts = undef - $mapreduce_reduce_shuffle_parallelcopies = undef - $mapreduce_task_io_sort_mb = undef - $mapreduce_task_io_sort_factor = undef - $mapreduce_map_memory_mb = undef - $mapreduce_reduce_memory_mb = undef - $mapreduce_map_java_opts = undef - $mapreduce_reduce_java_opts = undef - $yarn_app_mapreduce_am_resource_mb = undef - $yarn_app_mapreduce_am_command_opts = undef - $yarn_nodemanager_resource_memory_mb = undef - $yarn_scheduler_minimum_allocation_mb = 0 - $yarn_scheduler_maximum_allocation_mb = undef - $yarn_scheduler_minimum_allocation_vcores = 0 - - $net_topology_script_template = undef - - $ganglia_host = 'aggregator.eqiad.wmflabs' - $ganglia_port = 50090 - $gelf_logging_host = '127.0.0.1' - $gelf_logging_port = 12201 - # In labs, make sure that HDFS user directories are - # created for everyone in the current labs project. - $hadoop_users_posix_groups = $::labsproject - - # Hadoop directories in labs should be automatically created. - # This conditional could be added to each of the main classes - # below, but since it doesn't hurt to have these directories - # in labs, and since I don't want to add $::realm conditionals - # below, I just create them here. - file { [ - $hadoop_var_directory, - $hadoop_data_directory, - ]: - ensure => 'directory', - } - } -} - -# == Class role::analytics::hadoop::ferm::namenode -# -class role::analytics::hadoop::ferm::namenode { - ferm::service{ 'hadoop-hdfs-namenode': - proto => 'tcp', - port => '8020', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-hdfs-namenode-http-ui': - proto => 'tcp', - port => '50070', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-hdfs-httpfs': - proto => 'tcp', - port => '14000', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-hdfs-namenode-jmx': - proto => 'tcp', - port => '9980', - srange => '$ANALYTICS_NETWORKS', - } -} - -# == Class role::analytics::hadoop::ferm::resourcemanager -# - -class role::analytics::hadoop::ferm::resourcemanager { - - ferm::service{ 'hadoop-yarn-resourcemanager-scheduler': - proto => 'tcp', - port => '8030', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-yarn-resourcemanager-tracker': - proto => 'tcp', - port => '8031', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-yarn-resourcemanager': - proto => 'tcp', - port => '8032', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-yarn-resourcemanager-admin': - proto => 'tcp', - port => '8033', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-yarn-resourcemanager-http-ui': - proto => 'tcp', - port => '8088', - srange => '$INTERNAL', - } - - ferm::service{ 'hadoop-mapreduce-historyserver': - proto => 'tcp', - port => '10020', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-mapreduce-historyserver-admin': - proto => 'tcp', - port => '10033', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-mapreduce-historyserver-http-ui': - proto => 'tcp', - port => '19888', - srange => '$ANALYTICS_NETWORKS', - } - - ferm::service{ 'hadoop-yarn-resourcemanager-jmx': - proto => 'tcp', - port => '9983', - srange => '$ANALYTICS_NETWORKS', - } - - -} - - -# == Class role::analytics::hadoop -# Installs Hadoop client pacakges and configuration. 
-# -class role::analytics::hadoop::client inherits role::analytics::hadoop::config { - # need java before hadoop is installed - require_package('openjdk-7-jdk') - - class { 'cdh::hadoop': - cluster_name => $cluster_name, - namenode_hosts => $namenode_hosts, - journalnode_hosts => $journalnode_hosts, - resourcemanager_hosts => $resourcemanager_hosts, - zookeeper_hosts => $zookeeper_hosts, - datanode_mounts => $datanode_mounts, - dfs_name_dir => [$hadoop_name_directory], - dfs_journalnode_edits_dir => $hadoop_journal_directory, - dfs_block_size => $dfs_block_size, - io_file_buffer_size => $io_file_buffer_size, - mapreduce_intermediate_compression_codec => $mapreduce_intermediate_compression_codec, - mapreduce_output_compression => $mapreduce_output_compression, - mapreduce_output_compression_codec => $mapreduce_output_compression_codec, - mapreduce_output_compression_type => $mapreduce_output_compression_type, - - mapreduce_job_reuse_jvm_num_tasks => $mapreduce_job_reuse_jvm_num_tasks, - mapreduce_reduce_shuffle_parallelcopies => $mapreduce_reduce_shuffle_parallelcopies, - mapreduce_task_io_sort_mb => $mapreduce_task_io_sort_mb, - mapreduce_task_io_sort_factor => $mapreduce_task_io_sort_factor, - - mapreduce_map_memory_mb => $mapreduce_map_memory_mb, - mapreduce_reduce_memory_mb => $mapreduce_reduce_memory_mb, - mapreduce_map_java_opts => $mapreduce_map_java_opts, - mapreduce_reduce_java_opts => $mapreduce_reduce_java_opts, - yarn_app_mapreduce_am_resource_mb => $yarn_app_mapreduce_am_resource_mb, - yarn_app_mapreduce_am_command_opts => $yarn_app_mapreduce_am_command_opts, - yarn_app_mapreduce_am_job_client_port_range => $yarn_app_mapreduce_am_job_client_port_range, - - yarn_nodemanager_resource_memory_mb => $yarn_nodemanager_resource_memory_mb, - yarn_scheduler_minimum_allocation_mb => $yarn_scheduler_minimum_allocation_mb, - yarn_scheduler_maximum_allocation_mb => $yarn_scheduler_maximum_allocation_mb, - yarn_scheduler_minimum_allocation_vcores => $yarn_scheduler_minimum_allocation_vcores, - - dfs_datanode_hdfs_blocks_metadata_enabled => $dfs_datanode_hdfs_blocks_metadata_enabled, - - - # Use net-topology.py.erb to map hostname to /datacenter/rack/row id. - net_topology_script_template => $net_topology_script_template, - # Use fair-scheduler.xml.erb to define FairScheduler queues. - fair_scheduler_template => $fair_scheduler_template, - - yarn_site_extra_properties => { - # Enable FairScheduler preemption. This will allow the essential queue - # to preempt non-essential jobs. - 'yarn.scheduler.fair.preemption' => true, - # Let YARN wait for at least 1/3 of nodes to present scheduling - # opportunties before scheduling a job for certain data - # on a node on which that data is not present. - 'yarn.scheduler.fair.locality.threshold.node' => '0.33', - # After upgrading to CDH 5.4.0, we are encountering this bug: - # https://issues.apache.org/jira/browse/MAPREDUCE-5799 - # This should work around the problem. - 'yarn.app.mapreduce.am.env' => 'LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native', - # The default of 90.0 for this was marking older dells as unhealthy when they still - # had 2TB of space left. 99% will mark them at unhealthy with they still have - # > 200G free. 
- 'yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage' => '99.0', - }, - - gelf_logging_enabled => $gelf_logging_enabled, - gelf_logging_host => $gelf_logging_host, - gelf_logging_port => $gelf_logging_port, - - hadoop_namenode_opts => $hadoop_namenode_opts, - } - - # If in production AND the current node is a journalnode, then - # go ahead and include an icinga alert for the JournalNode process. - if $::realm == 'production' and member($journalnode_hosts, $::fqdn) { - nrpe::monitor_service { 'hadoop-hdfs-journalnode': - description => 'Hadoop JournalNode', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.qjournal.server.JournalNode"', - require => Class['cdh::hadoop'], - critical => true, - } - } - file { '/usr/local/bin/hadoop-yarn-logging-helper.sh': - content => template('hadoop/hadoop-yarn-logging-helper.erb'), - mode => '0744', - } - if $gelf_logging_enabled { - ensure_packages([ - # library dependency - 'libjson-simple-java', - # the libary itself: logstash-gelf.jar - 'liblogstash-gelf-java', - ]) - # symlink into hadoop classpath - file { '/usr/lib/hadoop/lib/json_simple.jar': - ensure => 'link', - target => '/usr/share/java/json_simple.jar', - require => Package['libjson-simple-java'], - } - - # symlink into hadoop classpath - file { '/usr/lib/hadoop/lib/logstash-gelf.jar': - ensure => 'link', - target => '/usr/share/java/logstash-gelf.jar', - require => Package['liblogstash-gelf-java'], - } - # Patch container-log4j.properties inside nodemanager jar - # See script source for details - exec { 'hadoop-yarn-logging-helper-set': - command => '/usr/local/bin/hadoop-yarn-logging-helper.sh set', - subscribe => File['/usr/local/bin/hadoop-yarn-logging-helper.sh'], - } - } else { - # Revert to original unmodified jar - exec { 'hadoop-yarn-logging-helper-reset': - command => '/usr/local/bin/hadoop-yarn-logging-helper.sh reset', - subscribe => File['/usr/local/bin/hadoop-yarn-logging-helper.sh'], - } - } - - # Temporarily hardode DNS CNAMES into /etc/hosts. - # jobs are failing around the cluster because these - # are cached in DNS. I need to fix now. Will remove - # this after new DNS has propogated. - file_line { 'hadoop_master_cname_dns_override': - ensure => 'absent', - path => '/etc/hosts', - line => '10.64.36.118 namenode.analytics.eqiad.wmnet resoucemanager.analytics.eqiad.wmnet', - } - - # Install packages that are useful for distributed - # computation in Hadoop, and thus should be available on - # any Hadoop nodes. - ensure_packages([ - # Need python3 on Hadoop nodes in order to run - # Hadoop Streaming python jobs. - 'python3', - 'python-numpy', - 'python-pandas', - 'python-scipy', - 'python-requests', - 'python-matplotlib', - 'python-dateutil', - 'python-sympy', - ]) -} - - - -# == Class role::analytics::hadoop::master -# Includes cdh::hadoop::master classes -# -class role::analytics::hadoop::master inherits role::analytics::hadoop::client { - system::role { 'role::analytics::hadoop::master': - description => 'Hadoop Master (NameNode & ResourceManager)', - } - - class { 'cdh::hadoop::master': } - - # Master should run httpfs daemon. - class { 'cdh::hadoop::httpfs': - require => Class['cdh::hadoop::master'], - } - - # Hadoop nodes are spread across multiple rows - # and need to be able to send multicast packets - # multiple network hops. Hadoop GangliaContext - # does not support this. See: - # https://issues.apache.org/jira/browse/HADOOP-10181 - # We use jmxtrans instead. 
- # Use jmxtrans for sending metrics to ganglia and statsd - - # TODO: use variables for stats server from somewhere? - $statsd = 'statsd.eqiad.wmnet:8125' - - class { 'cdh::hadoop::jmxtrans::master': - ganglia => "${ganglia_host}:${ganglia_port}", - statsd => $statsd, - } - - # monitor disk statistics - include role::analytics::monitor_disks - - # FairScheduler is creating event logs in hadoop.log.dir/fairscheduler/ - # It rotates them but does not delete old ones. Set up cronjob to - # delete old files in this directory. - cron { 'hadoop-clean-fairscheduler-event-logs': - command => 'test -d /var/log/hadoop-yarn/fairscheduler && /usr/bin/find /var/log/hadoop-yarn/fairscheduler -type f -mtime +14 -exec rm {} >/dev/null \;', - minute => 5, - hour => 0, - require => Class['cdh::hadoop::master'], - } - - # Include icinga alerts if production realm. - if $::realm == 'production' { - # Icinga process alerts for NameNode, ResourceManager and HistoryServer - nrpe::monitor_service { 'hadoop-hdfs-namenode': - description => 'Hadoop Namenode - Primary', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"', - require => Class['cdh::hadoop::master'], - critical => true, - } - nrpe::monitor_service { 'hadoop-yarn-resourcemanager': - description => 'Hadoop ResourceManager', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.yarn.server.resourcemanager.ResourceManager"', - require => Class['cdh::hadoop::master'], - critical => true, - } - nrpe::monitor_service { 'hadoop-mapreduce-historyserver': - description => 'Hadoop HistoryServer', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer"', - require => Class['cdh::hadoop::master'], - } - - # Allow nagios to run the check_hdfs_active_namenode as hdfs user. - sudo::user { 'nagios-check_hdfs_active_namenode': - user => 'nagios', - privileges => ['ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode'], - } - # Alert if there is no active NameNode - nrpe::monitor_service { 'hadoop-hdfs-active-namenode': - description => 'At least one Hadoop HDFS NameNode is active', - nrpe_command => '/usr/bin/sudo /usr/local/bin/check_hdfs_active_namenode', - require => [ - Class['cdh::hadoop::master'], - Sudo::User['nagios-check_hdfs_active_namenode'], - ], - } - } - - # This will create HDFS user home directories - # for all users in the provided groups. - # This only needs to be run on the NameNode - # where all users that want to use Hadoop - # must have shell accounts anyway. - class { 'cdh::hadoop::users': - groups => $hadoop_users_posix_groups, - require => Class['cdh::hadoop::master'], - } - - - # Firewall - include role::analytics::hadoop::ferm::namenode - include role::analytics::hadoop::ferm::resourcemanager -} - -# == Class role::analytics::hadoop::worker -# Includes cdh::hadoop::worker classes -class role::analytics::hadoop::worker inherits role::analytics::hadoop::client { - system::role { 'role::analytics::hadoop::worker': - description => 'Hadoop Worker (DataNode & NodeManager)', - } - - class { 'cdh::hadoop::worker': } - - # Hadoop nodes are spread across multiple rows - # and need to be able to send multicast packets - # multiple network hops. Hadoop GangliaContext - # does not support this. See: - # https://issues.apache.org/jira/browse/HADOOP-10181 - # We use jmxtrans instead. 
- - # Use jmxtrans for sending metrics to ganglia - class { 'cdh::hadoop::jmxtrans::worker': - ganglia => "${ganglia_host}:${ganglia_port}", - statsd => $statsd, - } - - # monitor disk statistics - include role::analytics::monitor_disks - - # Include icinga alerts if production realm. - if $::realm == 'production' { - # Icinga process alerts for DataNode and NodeManager - nrpe::monitor_service { 'hadoop-hdfs-datanode': - description => 'Hadoop DataNode', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.datanode.DataNode"', - require => Class['cdh::hadoop::worker'], - } - nrpe::monitor_service { 'hadoop-yarn-nodemanager': - description => 'Hadoop NodeManager', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.yarn.server.nodemanager.NodeManager"', - require => Class['cdh::hadoop::worker'], - } - - # Alert on datanode mount disk space. These mounts are ignored by the - # base module's check_disk via the base::monitoring::host::nrpe_check_disk_options - # override in worker.yaml hieradata. - nrpe::monitor_service { 'disk_space_hadoop_worker': - description => 'Disk space on Hadoop worker', - nrpe_command => '/usr/lib/nagios/plugins/check_disk --units GB -w 32 -c 16 -e -l -r "/var/lib/hadoop/data"', - } - - # Make sure that this worker node has NodeManager running in a RUNNING state. - # Install a custom check command for NodeManager Node-State: - file { '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state': - source => 'puppet:///files/hadoop/check_hadoop_yarn_node_state', - owner => 'root', - group => 'root', - mode => '0755', - } - nrpe::monitor_service { 'hadoop_yarn_node_state': - description => 'YARN NodeManager Node-State', - nrpe_command => '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state', - } - } - - - # Install hive client on worker nodes to get - # hive-hcatalog package. hive-catalog depends - # on hive package, so we might as well - # configure hive too. - include role::analytics::hive::client - - - # We use this to send passive checks off to icinga - # for generating alerts. We need the nsca-client package - # to do this remotely. Some oozie jobs use this, - # and it must be present on all datanodes. - include role::analytics::hadoop::monitor::nsca::client - - # Install MaxMind databases for geocoding UDFs - include geoip - - - # Firewall - ferm::service{ 'hadoop-access': - proto => 'tcp', - port => '1024:65535', - srange => '$ANALYTICS_NETWORKS', - } -} - -# == Class role::analytics::hadoop::monitor::nsca::client -# This class exists in order to override the group ownership -# and permissions of the /etc/send_nsca.cfg file. Hadoop -# processes need to be able to read this file in order to -# run send_nsca as part of Oozie submitted monitoring jobs. 
-class role::analytics::hadoop::monitor::nsca::client inherits icinga::nsca::client { - File ['/etc/send_nsca.cfg'] { - group => 'hadoop', - mode => '0440', - } -} - -# == Class role::analytics::hadoop::standby -# Include standby NameNode classes -# -class role::analytics::hadoop::standby inherits role::analytics::hadoop::client { - system::role { 'role::analytics::hadoop::standby': - description => 'Hadoop Standby NameNode', - } - - class { 'cdh::hadoop::namenode::standby': } - - - # Use jmxtrans for sending metrics to ganglia - class { 'cdh::hadoop::jmxtrans::namenode': - ganglia => "${ganglia_host}:${ganglia_port}", - } - - # monitor disk statistics - include role::analytics::monitor_disks - - - # Include icinga alerts if production realm. - if $::realm == 'production' { - # Icinga process alert for Stand By NameNode - nrpe::monitor_service { 'hadoop-hdfs-namenode': - description => 'Hadoop Namenode - Stand By', - nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"', - require => Class['cdh::hadoop::namenode::standby'], - critical => true, - } - } - - # If this is a resourcemanager host, then go ahead - # and include a resourcemanager on all standby nodes as well - # as the master node. - if $::fqdn in $resourcemanager_hosts { - include cdh::hadoop::resourcemanager - # Firewall - include role::analytics::hadoop::ferm::resourcemanager - } - - - # Firewall - include role::analytics::hadoop::ferm::namenode -} - - -# == Class role::analytics::hadoop::balancer -# Runs hdfs balancer periodically to keep data balanced across all DataNodes -class role::analytics::hadoop::balancer { - Class['role::analytics::hadoop::client'] -> Class['role::analytics::hadoop::balancer'] - - file { '/usr/local/bin/hdfs-balancer': - source => 'puppet:///files/hadoop/hdfs-balancer', - mode => '0754', - owner => 'hdfs', - group => 'hdfs', - } - - cron { 'hdfs-balancer': - command => '/usr/local/bin/hdfs-balancer >> /var/log/hadoop-hdfs/balancer.log 2>&1', - user => 'hdfs', - # Every day at 6am UTC. - minute => 0, - hour => 6, - require => File['/usr/local/bin/hdfs-balancer'], - } -} diff --git a/manifests/role/analytics/hive.pp b/manifests/role/analytics/hive.pp deleted file mode 100644 index 1cb4c61..0000000 --- a/manifests/role/analytics/hive.pp +++ /dev/null @@ -1,137 +0,0 @@ -# role/analytics/hive.pp -# -# Role classes for Analytics Hive client and server nodes. -# These role classes will configure Hive properly in either -# Labs or Production environments. -# -# If you are using these in Labs, you must include role::analytics::hive::server -# on your primary Hadoop NameNode. -# -# role::analytics::hive::client requires role::analytics::hadoop::client, -# and will install Hadoop client pacakges and configs. In Labs, -# you must set appropriate Hadoop client global parameters. See -# role/analytics/hadoop.pp documentation for more info. - - -# == Class role::analytics::hive::config -# -class role::analytics::hive::config { - # require zookeeper config to get zookeeper hosts array. - include role::analytics::hadoop::config - - # Set this pretty high, to avoid limiting the number - # of substitution variables a Hive script can use. - $variable_substitute_depth = 10000 - - # The WMF webrequest table uses HCatalog's JSON Serde. - # Automatically include this in Hive client classpaths. 
- $hcatalog_jar = 'file:///usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar' - - # If refinery is included on this node, then add - # refinery-hive.jar to the auxpath as well - if defined(Class['role::analytics::refinery']) { - $auxpath = "${hcatalog_jar},file://${::role::analytics::refinery::path}/artifacts/refinery-hive.jar" - } - else { - $auxpath = $hcatalog_jar - } - - # Hive uses Zookeeper for table locking. - $zookeeper_hosts = keys(hiera('zookeeper_hosts')) - - # We set support concurrency to false by default. - # if someone needs to use it in their hive job, they - # may manually set it to true via - # set hive.support.concurrency = true; - $support_concurrency = false - - if $::realm == 'production' { - include passwords::analytics - - $jdbc_password = $passwords::analytics::hive_jdbc_password - # Must set hive_server_host and hive_metastore_host in hiera - # in production. - $default_hive_host = undef - } - elsif $::realm == 'labs' { - $jdbc_password = 'hive' - # Default to hosting hive-server and hive-metastore on - # primary namenode in labs. - $default_hive_host = $role::analytics::hadoop::config::namenode_hosts[0] - } - - $server_host = hiera('hive_server_host', $default_hive_host) - $metastore_host = hiera('hive_metastore_host', $default_hive_host) -} - - -# == Class role::analytics::hive -# Installs base configs for hive client nodes -# -class role::analytics::hive::client inherits role::analytics::hive::config { - require role::analytics::hadoop::client - - class { '::cdh::hive': - metastore_host => $metastore_host, - jdbc_password => $jdbc_password, - zookeeper_hosts => $zookeeper_hosts, - support_concurrency => $support_concurrency, - variable_substitute_depth => $variable_substitute_depth, - auxpath => $auxpath, - # default to using Snappy for parquet formatted tables - parquet_compression => 'SNAPPY', - } -} - - -# == Class role::analytics::hive::server -# Sets up Hive Server2 and MySQL backed Hive Metastore. -# -class role::analytics::hive::server inherits role::analytics::hive::client { - if (!defined(Package['mysql-server'])) { - package { 'mysql-server': - ensure => 'installed', - } - } - - # Make sure mysql-server is installed before - # MySQL Hive Metastore database class is applied. - # Package['mysql-server'] -> Class['cdh::hive::metastore::mysql'] - - # TODO: Set these better once hive is on its own server. - # See: https://phabricator.wikimedia.org/T110090 - # http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/cdh_ig_hive_install.html#concept_alp_4kl_3q_unique_1 - # TODO: Use hiera. - $server_heapsize = $::realm ? { - 'production' => 1024, - default => undef, - } - $metastore_heapsize = $::realm ? { - 'production' => 256, - default => undef, - } - # # Setup Hive server and Metastore - # class { 'cdh::hive::master': - # server_heapsize => $server_heapsize, - # metastore_heapsize => $metastore_heapsize, - # } - - class { 'cdh::hive::server': - heapsize => $server_heapsize, - } - class { 'cdh::hive::metastore': - heapsize => $metastore_heapsize, - } - - ferm::service{ 'hive_server': - proto => 'tcp', - port => '10000', - srange => '$INTERNAL', - } - - ferm::service{ 'hive_metastore': - proto => 'tcp', - port => '9083', - srange => '$INTERNAL', - } -} diff --git a/manifests/role/analytics/hue.pp b/manifests/role/analytics/hue.pp deleted file mode 100644 index c352d63..0000000 --- a/manifests/role/analytics/hue.pp +++ /dev/null @@ -1,66 +0,0 @@ -# == Class role::analytics::hue -# Installs Hue server. 
-# -class role::analytics::hue { - # Require that all Hue applications - # have their corresponding clients - # and configs installed. - require role::analytics::hadoop::client - require role::analytics::hive::client - require role::analytics::oozie::client - require role::analytics::pig - require role::analytics::sqoop - - # LDAP Labs config is the same as LDAP in production. - include ldap::role::config::labs - - # Disable hue's SSL. SSL terminiation is handled by an upstream proxy. - $ssl_private_key = false - $ssl_certificate = false - $secure_proxy_ssl_header = true - - if ($::realm == 'production') { - include passwords::analytics - - $secret_key = $passwords::analytics::hue_secret_key - $hive_server_host = 'analytics1027.eqiad.wmnet' - # Disable automatic Hue user creation in production. - $ldap_create_users_on_login = false - } - elsif ($::realm == 'labs') { - $secret_key = 'oVEAAG5dp02MAuIScIetX3NZlmBkhOpagK92wY0GhBbq6ooc0B3rosmcxDg2fJBM' - # Assume that in Labs, Hue should run on the main master Hadoop NameNode. - $hive_server_host = $role::analytics::hadoop::config::namenode_hosts[0] - $ldap_create_users_on_login = true - } - - class { 'cdh::hue': - hive_server_host => $hive_server_host, - secret_key => $secret_key, - smtp_host => $::mail_smarthost[0], - smtp_from_email => "hue@${::fqdn}", - ldap_url => inline_template('<%= scope.lookupvar("ldap::role::config::labs::servernames").collect { |host| "ldaps://#{host}" }.join(" ") %>'), - ldap_bind_dn => $ldap::role::config::labs::ldapconfig['proxyagent'], - ldap_bind_password => $ldap::role::config::labs::ldapconfig['proxypass'], - ldap_base_dn => $ldap::role::config::labs::basedn, - ldap_username_pattern => 'uid=<username>,ou=people,dc=wikimedia,dc=org', - ldap_user_filter => 'objectclass=person', - ldap_user_name_attr => 'uid', - ldap_group_filter => 'objectclass=posixgroup', - ldap_group_member_attr => 'member', - ldap_create_users_on_login => $ldap_create_users_on_login, - # Disable ssl in labs. Labs proxy handles SSL termination. - ssl_private_key => $ssl_private_key, - ssl_certificate => $ssl_certificate, - secure_proxy_ssl_header => $secure_proxy_ssl_header, - } - - ferm::service{ 'hue_server': - proto => 'tcp', - port => '8888', - srange => '$INTERNAL', - } -} - -# TODO: Hue database backup. -# TODO: Make Hue use MySQL database. Maybe? diff --git a/manifests/role/analytics/impala.pp b/manifests/role/analytics/impala.pp deleted file mode 100644 index 2781b68..0000000 --- a/manifests/role/analytics/impala.pp +++ /dev/null @@ -1,66 +0,0 @@ -# Impala role classes. -# -# NOTE: Be sure that $analytics::impala::master_host is set in hiera! -# In production this is set in hieradata/eqiad/analytics/impala.yaml. - -# == Class role::analytics::impala -# Installs base impala packages and the impala-shell client. -# -class role::analytics::impala { - class { 'cdh::impala': - master_host => hiera('analytics::impala::master_host') - } -} - -# == Class role::analytics::impala::worker -# Installs and configures the impalad server. 
-# -class role::analytics::impala::worker { - include role::analytics::impala - include cdh::impala::worker - - ferm::service { 'impalad': - proto => 'tcp', - port => '(21000 21050 22000 23000 25000 28000)', - srange => '$ANALYTICS_NETWORKS', - } -} - -# == Class role::analytics::impala::master -# Installs and configures llama, impala-state-store and impala-catalog -# -class role::analytics::impala::master { - include role::analytics::impala - include base::firewall - - # The llama-master package stupidly creates the llama user - # with a non system uid. This causes our admin module to - # attempt to remove the user. Manage the user manually - # here in puppet before installing that package. - user { 'llama': - ensure => 'present', - comment => 'Llama', - home => '/var/lib/llama', - shell => '/bin/bash', - system => true, - before => Class['cdh::impala::master'], - } - - include cdh::impala::master - - ferm::service { 'impala-state-store': - proto => 'tcp', - port => '(24000 25010)', - srange => '$ANALYTICS_NETWORKS', - } - ferm::service { 'impala-catalog': - proto => 'tcp', - port => '(23020 25020 26000)', - srange => '$ANALYTICS_NETWORKS', - } - ferm::service { 'impala-llama': - proto => 'tcp', - port => '(15000 15001 15002)', - srange => '$ANALYTICS_NETWORKS', - } -} diff --git a/manifests/role/analytics/mahout.pp b/manifests/role/analytics/mahout.pp deleted file mode 100644 index d9108ea..0000000 --- a/manifests/role/analytics/mahout.pp +++ /dev/null @@ -1,3 +0,0 @@ -class role::analytics::mahout { - include cdh::mahout -} diff --git a/manifests/role/analytics/oozie.pp b/manifests/role/analytics/oozie.pp deleted file mode 100644 index 8723a80..0000000 --- a/manifests/role/analytics/oozie.pp +++ /dev/null @@ -1,89 +0,0 @@ -# == Class role::analytics::oozie::client -# Installs oozie client, which sets up the OOZIE_URL -# environment variable. If you are using this class in -# Labs, you must include oozie::server on your primary -# Hadoop NameNode for this to work and set appropriate -# Labs Hadoop global parameters. -# See role/analytics/hadoop.pp documentation for more info. - - -# == Class role::analytics::oozie::config -# -class role::analytics::oozie::config { - include role::analytics::hadoop::config - - if $::realm == 'production' { - include passwords::analytics - - $jdbc_password = $passwords::analytics::oozie_jdbc_password - # Must set oozie_host in hiera in production. - $default_oozie_host = undef - - } - elsif $::realm == 'labs' { - $jdbc_password = 'oozie' - # Default to running oozie server on primary namenode in labs. - $default_oozie_host = $role::analytics::hadoop::config::namenode_hosts[0] - } - - $oozie_host = hiera('oozie_host', $default_oozie_host) -} - - -# == Class role::analytics::oozie::client -# Installs Oozie client. -# -class role::analytics::oozie::client inherits role::analytics::oozie::config { - require role::analytics::hadoop::client - - class { 'cdh::oozie': - oozie_host => $oozie_host, - } -} - -# == Class role::analytics::oozie::server -# Installs Oozie server backed by a MySQL database. -# -class role::analytics::oozie::server inherits role::analytics::oozie::client { - if (!defined(Package['mysql-server'])) { - package { 'mysql-server': - ensure => 'installed', - } - } - # Make sure mysql-server is installed before - # MySQL Oozie database class is applied. 
- # Package['mysql-server'] -> Class['cdh::oozie::database::mysql'] - - class { 'cdh::oozie::server': - jdbc_password => $jdbc_password, - smtp_host => $::mail_smarthost[0], - smtp_from_email => "oozie@${::fqdn}", - # This is not currently working. Disabling - # this allows any user to manage any Oozie - # job. Since access to our cluster is limited, - # this isn't a big deal. But, we should still - # figure out why this isn't working and - # turn it back on. - # I was not able to kill any oozie jobs - # with this on, even though the - # oozie.service.ProxyUserService.proxyuser.* - # settings look like they are properly configured. - authorization_service_authorization_enabled => false, - } - - # Oozie is creating event logs in /var/log/oozie. - # It rotates them but does not delete old ones. Set up cronjob to - # delete old files in this directory. - cron { 'oozie-clean-logs': - command => 'test -d /var/log/oozie && /usr/bin/find /var/log/oozie -type f -mtime +62 -exec rm {} >/dev/null \;', - minute => 5, - hour => 0, - require => Class['cdh::oozie::server'], - } - - ferm::service{ 'oozie_server': - proto => 'tcp', - port => '11000', - srange => '$INTERNAL', - } -} diff --git a/manifests/role/analytics/pig.pp b/manifests/role/analytics/pig.pp deleted file mode 100644 index 150dc1e..0000000 --- a/manifests/role/analytics/pig.pp +++ /dev/null @@ -1,3 +0,0 @@ -class role::analytics::pig { - include cdh::pig -} \ No newline at end of file diff --git a/manifests/role/analytics/refinery.pp b/manifests/role/analytics/refinery.pp deleted file mode 100644 index 74ff5cd..0000000 --- a/manifests/role/analytics/refinery.pp +++ /dev/null @@ -1,286 +0,0 @@ -# == Class role::analytics::refinery -# Includes configuration and resources needed for deploying -# and using the analytics/refinery repository. -# -class role::analytics::refinery { - # Make this class depend on hadoop::client. Refinery - # is intended to work with Hadoop, and many of the - # role classes here use the hdfs user, which is created - # by the CDH packages. - Class['role::analytics::hadoop::client'] -> Class['role::analytics::refinery'] - - # Some refinery python scripts use docopt for CLI parsing. - if !defined(Package['python-docopt']) { - package { 'python-docopt': - ensure => 'installed', - } - } - # refinery python module uses dateutil - if !defined(Package['python-dateutil']) { - package { 'python-dateutil': - ensure => 'installed', - } - } - - # analytics/refinery will deployed to this node. - package { 'analytics/refinery': - provider => 'trebuchet', - } - - # analytics/refinery repository is deployed via git-deploy at this path. - # You must deploy this yourself; puppet will not do it for you. - $path = '/srv/deployment/analytics/refinery' - - # Put refinery python module in user PYTHONPATH - file { '/etc/profile.d/refinery.sh': - content => "export PYTHONPATH=\${PYTHONPATH}:${path}/python" - } - - # Create directory in /var/log for general purpose Refinery job logging. - $log_dir = '/var/log/refinery' - file { $log_dir: - ensure => 'directory', - owner => 'hdfs', - group => 'analytics-admins', - # setgid bit here to make refinery log files writeable - # by users in the analytics-admins group. - mode => '2775', - } -} - - -# == Class role::analytics::refinery::camus -# Uses camus::job to set up cron jobs to -# import data from Kafka into Hadoop. 
-# -class role::analytics::refinery::camus { - require role::analytics::refinery - include role::kafka::analytics::config - - # Make all uses of camus::job set default kafka_brokers and camus_jar. - # If you build a new camus or refinery, and you want to use it, you'll - # need to change these. You can also override these defaults - # for a particular camus::job instance by setting the parameter on - # the camus::job declaration. - Camus::Job { - kafka_brokers => suffix($role::kafka::analytics::config::brokers_array, ':9092'), - camus_jar => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/camus-wmf/camus-wmf-0.1.0-wmf6.jar", - check_jar => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/refinery/refinery-job-0.0.26.jar", - } - - # Import webrequest_* topics into /wmf/data/raw/webrequest - # every 10 minutes, check runs and flag fully imported hours. - camus::job { 'webrequest': - check => true, - minute => '*/10', - } - - # Import eventlogging_* topics into /wmf/data/raw/eventlogging - # once every hour. - camus::job { 'eventlogging': - minute => '5', - } - - # Import mediawiki_* topics into /wmf/data/raw/mediawiki - # once every hour. This data is expected to be Avro binary. - camus::job { 'mediawiki': - check => true, - minute => '15', - # refinery-camus contains some custom decoder classes which - # are needed to import Avro binary data. - libjars => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/refinery/refinery-camus-0.0.23.jar", - } -} - -# == Class role::analytics::refinery::data::drop -# Installs cron job to drop old hive partitions -# and delete old data from HDFS. -# -class role::analytics::refinery::data::drop { - require role::analytics::refinery - - $webrequest_log_file = "${role::analytics::refinery::log_dir}/drop-webrequest-partitions.log" - $eventlogging_log_file = "${role::analytics::refinery::log_dir}/drop-eventlogging-partitions.log" - - # keep this many days of raw webrequest data - $raw_retention_days = 31 - cron { 'refinery-drop-webrequest-raw-partitions': - command => "export PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python && ${role::analytics::refinery::path}/bin/refinery-drop-webrequest-partitions -d ${raw_retention_days} -D wmf_raw -l /wmf/data/raw/webrequest -w raw >> ${webrequest_log_file} 2>&1", - user => 'hdfs', - minute => '15', - hour => '*/4', - } - - # keep this many days of refined webrequest data - $refined_retention_days = 62 - cron { 'refinery-drop-webrequest-refined-partitions': - command => "export PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python && ${role::analytics::refinery::path}/bin/refinery-drop-webrequest-partitions -d ${refined_retention_days} -D wmf -l /wmf/data/wmf/webrequest -w refined >> ${webrequest_log_file} 2>&1", - user => 'hdfs', - minute => '45', - hour => '*/4', - } - - # keep this many days of eventlogging data - $eventlogging_retention_days = 90 - cron {'refinery-drop-eventlogging-partitions': - command => "export PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python && ${role::analytics::refinery::path}/bin/refinery-drop-eventlogging-partitions -d ${eventlogging_retention_days} -l /wmf/data/raw/eventlogging >> ${eventlogging_log_file} 2>&1", - user => 'hdfs', - minute => '15', - hour => '*/4', - } -} - -# == Class role::analytics::refinery::data::check::icinga -# Configures passive/freshness icinga checks or data imports -# in HDFS. 
-# -# For webrequest imports, the Oozie job that is responsible -# for adding Hive partitions and checking data integrity -# is responsible for triggering these passive checks. -# -# NOTE: These are disasbled due to nsca not working -# properly between versions provided in Precise and Trusty. -# we may reenable these if the icinga server gets upgraded -# to Trusty. -# See: https://phabricator.wikimedia.org/T76414 -# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=670373 -# -class role::analytics::refinery::data::check::icinga { - # We are monitoring hourly datasets. - # Give Oozie a little time to finish running - # the monitor_done_flag workflow for each hour. - # 5400 seconds == 1.5 hours. - $freshness_threshold = 5400 - - # 1 == warning, 2 == critical. - # Use warning for now while we make sure this works. - $alert_return_code = 1 - - # Monitor that each webrequest source is succesfully imported. - # This is a passive check that is triggered by the Oozie - # webrequest add partition jobs. - monitoring::service { 'hive_partition_webrequest-bits': - ensure => 'absent', - description => 'hive_partition_webrequest-bits', - check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest bits!${alert_return_code}", - passive => true, - freshness => $freshness_threshold, - retries => 1, - } - monitoring::service { 'hive_partition_webrequest-mobile': - ensure => 'absent', - description => 'hive_partition_webrequest-mobile', - check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest mobile!${alert_return_code}", - passive => true, - freshness => $freshness_threshold, - retries => 1, - } - monitoring::service { 'hive_partition_webrequest-text': - ensure => 'absent', - description => 'hive_partition_webrequest-text', - check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest text!${alert_return_code}", - passive => true, - freshness => $freshness_threshold, - retries => 1, - } - monitoring::service { 'hive_partition_webrequest-upload': - ensure => 'absent', - description => 'hive_partition_webrequest-upload', - check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest upload!${alert_return_code}", - passive => true, - freshness => $freshness_threshold, - retries => 1, - } -} - -# == Class role::analytics::refinery::data::check::email -# Configures cron jobs that send email about the faultyness of webrequest data -# -# These checks walk HDFS through the plain file system. -# -class role::analytics::refinery::data::check::email { - require role::analytics::refinery - - # This should not be hardcoded. Instead, one should be able to use - # $::cdh::hadoop::mount::mount_point to reference the user supplied - # parameter when the cdh::hadoop::mount class is evaluated. - # I am not sure why this is not working. - $hdfs_mount_point = '/mnt/hdfs' - - $mail_to = 'analytics-ale...@wikimedia.org' - - # Since the 'stats' user is not in ldap, it is unnecessarily hard - # to grant it access to the private data in hdfs. As discussed in - # https://gerrit.wikimedia.org/r/#/c/186254 - # the cron runs as hdfs instead. 
- cron { 'refinery data check hdfs_mount': - command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets webrequest,raw_webrequest --quiet --percent-lost", - environment => "MAILTO=${$mail_to}", - user => 'hdfs', - hour => 10, - minute => 0, - } - - cron { 'refinery data check pagecounts': - command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets pagecounts_all_sites,pagecounts_raw --quiet", - environment => "MAILTO=${$mail_to}", - user => 'hdfs', # See comment in above cron - hour => 10, - minute => 5, - } - - cron { 'refinery data check pageviews': - command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets pageview,projectview --quiet", - environment => "MAILTO=${$mail_to}", - user => 'hdfs', # See comment in first cron above - hour => 10, - minute => 10, - } -} - -# == Class role::analytics::refinery::source -# Clones analytics/refinery/source repo and keeps it up-to-date -# -class role::analytics::refinery::source { - require statistics - - $path = "${::statistics::working_path}/refinery-source" - - $user = $::statistics::user::username - $group = $user - - file { $path: - ensure => 'directory', - owner => $user, - group => $group, - mode => '0755', - } - - git::clone { 'refinery_source': - ensure => 'latest', - directory => $path, - origin => 'https://gerrit.wikimedia.org/r/p/analytics/refinery/source.git', - owner => $user, - group => $group, - mode => '0755', - require => File[$path], - } -} - -# == Class role::analytics::refinery::guard -# Configures a cron job that runs analytics/refinery/source guards daily and -# sends out an email upon issues -# -class role::analytics::refinery::guard { - require role::analytics::refinery::source - - include ::maven - - cron { 'refinery source guard': - command => "${role::analytics::refinery::source::path}/guard/run_all_guards.sh --rebuild-jar --quiet", - environment => 'MAILTO=o...@wikimedia.org', - user => $role::analytics::refinery::source::user, - hour => 15, - minute => 35, - } -} diff --git a/manifests/role/analytics/spark.pp b/manifests/role/analytics/spark.pp deleted file mode 100644 index 1ffa076..0000000 --- a/manifests/role/analytics/spark.pp +++ /dev/null @@ -1,59 +0,0 @@ -# == Class role::analytics::spark -# -class role::analytics::spark { - include cdh::spark -} - -# == Class role::analytics::spark::standalone -# Configures a spark standalone cluster. -# This runs spark daemons outside of YARN. -# do not include role::analytics::spark -# and role::analytics::spark::standalone on the same node. 
-class role::analytics::spark::standalone {
-    class { 'cdh::spark':
-        master_host      => hiera('spark_master_host', $::fqdn),
-        worker_instances => hiera('spark_worker_instances', undef),
-        worker_cores     => hiera('spark_worker_cores', floor($::processorcount / hiera('spark_worker_instances', 1))),
-        worker_memory    => hiera('spark_worker_memory', undef)
-    }
-}
-
-class role::analytics::spark::standalone::master {
-    require role::analytics::spark::standalone
-    include cdh::spark::master
-
-    ferm::service{ 'spark-master-web-ui':
-        proto  => 'tcp',
-        port   => '18080',
-        srange => '$ANALYTICS_NETWORKS',
-    }
-
-    ferm::service{ 'spark-master-rpc':
-        proto  => 'tcp',
-        port   => '7077',
-        srange => '$ANALYTICS_NETWORKS',
-    }
-
-    ferm::service{ 'spark-rest-server':
-        proto  => 'tcp',
-        port   => '6066',
-        srange => '$ANALYTICS_NETWORKS',
-    }
-}
-
-class role::analytics::spark::standalone::worker {
-    require role::analytics::spark::standalone
-    include cdh::spark::worker
-
-    ferm::service{ 'spark-worker-web-ui':
-        proto  => 'tcp',
-        port   => '18081',
-        srange => '$ANALYTICS_NETWORKS',
-    }
-
-    ferm::service{ 'spark-worker-rpc':
-        proto  => 'tcp',
-        port   => '7078',
-        srange => '$ANALYTICS_NETWORKS',
-    }
-}
diff --git a/manifests/role/analytics/sqoop.pp b/manifests/role/analytics/sqoop.pp
deleted file mode 100644
index 7d35a00..0000000
--- a/manifests/role/analytics/sqoop.pp
+++ /dev/null
@@ -1,3 +0,0 @@
-class role::analytics::sqoop {
-    include cdh::sqoop
-}
\ No newline at end of file
diff --git a/modules/camus/manifests/job.pp b/modules/camus/manifests/job.pp
index 0a0a6f4..223e66d 100644
--- a/modules/camus/manifests/job.pp
+++ b/modules/camus/manifests/job.pp
@@ -11,8 +11,8 @@
 #
 # [*script*]
 #   Path to camus wrapper script. This is currently deployed with the refinery
-#   source. You must include role::analytics::refinery if you don't override
-#   this to a custom path.
+#   source. You must include role::analytics_cluster::refinery if you don't
+#   override this to a custom path.
 #   See: https://github.com/wikimedia/analytics-refinery/blob/master/bin/camus
 #
 # [*user*]

--
To view, visit https://gerrit.wikimedia.org/r/270851
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I48f7d76255ca0bf04322cd4e6e306e1a60ebf374
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>
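The only documentation updated (rather than removed) by this change is the camus::job comment above, which now points users at role::analytics_cluster::refinery instead of the removed role::analytics::refinery. The sketch below is illustrative only and is not part of the change: the node name is hypothetical, and only role::analytics_cluster::refinery is confirmed by this diff; any other replacement role names under the analytics_cluster namespace are assumptions.

    # Hypothetical node manifest (example host name, not from this change).
    # A client node that previously declared:
    #   include role::analytics::refinery        # class removed by this change
    # would, per the updated camus::job documentation, instead declare:
    node 'analytics-example.eqiad.wmnet' {
        include role::analytics_cluster::refinery
    }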