Repository: spark Updated Branches: refs/heads/branch-2.1 5634fadb0 -> 444cca14d
[SPARK-18535][SPARK-19720][CORE][BACKPORT-2.1] Redact sensitive information ## What changes were proposed in this pull request? Backporting SPARK-18535 and SPARK-19720 to spark 2.1 It's a backport PR that redacts senstive information by configuration to Spark UI and Spark Submit console logs. Using reference from Mark Grover markapache.org PRs ## How was this patch tested? Same tests from PR applied Author: Mark Grover <m...@apache.org> Closes #18802 from dmvieira/feature-redact. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/444cca14 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/444cca14 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/444cca14 Branch: refs/heads/branch-2.1 Commit: 444cca14d7ac8c5ab5d7e9d080b11f4d6babe3bf Parents: 5634fad Author: Mark Grover <m...@apache.org> Authored: Mon Aug 7 14:23:05 2017 -0700 Committer: Marcelo Vanzin <van...@cloudera.com> Committed: Mon Aug 7 14:23:05 2017 -0700 ---------------------------------------------------------------------- .../org/apache/spark/deploy/SparkSubmit.scala | 3 +- .../spark/deploy/SparkSubmitArguments.scala | 12 +++++-- .../apache/spark/internal/config/package.scala | 9 ++++++ .../spark/scheduler/EventLoggingListener.scala | 13 +++++++- .../apache/spark/ui/env/EnvironmentPage.scala | 12 +++---- .../apache/spark/ui/env/EnvironmentTab.scala | 1 + .../scala/org/apache/spark/util/Utils.scala | 33 +++++++++++++++++++- .../scheduler/EventLoggingListenerSuite.scala | 12 +++++++ .../org/apache/spark/util/UtilsSuite.scala | 20 ++++++++++++ docs/configuration.md | 9 ++++++ .../spark/deploy/yarn/ExecutorRunnable.scala | 3 +- 11 files changed, 111 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 443f1f5..653830e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -670,7 +670,8 @@ object SparkSubmit { if (verbose) { printStream.println(s"Main class:\n$childMainClass") printStream.println(s"Arguments:\n${childArgs.mkString("\n")}") - printStream.println(s"System properties:\n${sysProps.mkString("\n")}") + // sysProps may contain sensitive information, so redact before printing + printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}") printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}") printStream.println("\n") } http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index f1761e7..883842c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -84,9 +84,15 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S // scalastyle:off println if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile") Option(propertiesFile).foreach { filename => - Utils.getPropertiesFromFile(filename).foreach { case (k, v) => + val properties = Utils.getPropertiesFromFile(filename) + properties.foreach { case (k, v) => defaultProperties(k) = v - if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } + // Property files may contain sensitive information, so redact before printing + if (verbose) { + Utils.redact(properties).foreach { case (k, v) => + SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } } } // scalastyle:on println @@ -318,7 +324,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | |Spark properties used, including those specified through | --conf and those from the properties file $propertiesFile: - |${sparkProperties.mkString(" ", "\n ", "\n")} + |${Utils.redact(sparkProperties).mkString(" ", "\n ", "\n")} """.stripMargin } http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/internal/config/package.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index f4844de..33396a5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -220,4 +220,13 @@ package object config { " bigger files.") .longConf .createWithDefault(4 * 1024 * 1024) + + private[spark] val SECRET_REDACTION_PATTERN = + ConfigBuilder("spark.redaction.regex") + .doc("Regex to decide which Spark configuration properties and environment variables in " + + "driver and executor environments contain sensitive information. When this regex matches " + + "a property, its value is redacted from the environment UI and various logs like YARN " + + "and event logs.") + .stringConf + .createWithDefault("(?i)secret|password") } http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index ce78774..f39565e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -153,7 +153,9 @@ private[spark] class EventLoggingListener( override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event) - override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event) + override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = { + logEvent(redactEvent(event)) + } // Events that trigger a flush override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { @@ -231,6 +233,15 @@ private[spark] class EventLoggingListener( } } + private[spark] def redactEvent( + event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { + // "Spark Properties" entry will always exist because the map is always populated with it. + val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties")) + val redactedEnvironmentDetails = event.environmentDetails + + ("Spark Properties" -> redactedProps) + SparkListenerEnvironmentUpdate(redactedEnvironmentDetails) + } + } private[spark] object EventLoggingListener extends Logging { http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala index 9f6e9a6..b11f8f1 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala @@ -22,21 +22,17 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.util.Utils private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") { private val listener = parent.listener - private def removePass(kv: (String, String)): (String, String) = { - if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) { - (kv._1, "******") - } else kv - } - def render(request: HttpServletRequest): Seq[Node] = { val runtimeInformationTable = UIUtils.listingTable( propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true) - val sparkPropertiesTable = UIUtils.listingTable( - propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true) + val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true) + val systemPropertiesTable = UIUtils.listingTable( propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true) val classpathEntriesTable = UIUtils.listingTable( http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala index f62260c..70b3ffd 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala @@ -23,6 +23,7 @@ import org.apache.spark.ui._ private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") { val listener = parent.environmentListener + val conf = parent.conf attachPage(new EnvironmentPage(this)) } http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/util/Utils.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 56de802..aebc257 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -38,6 +38,7 @@ import scala.io.Source import scala.reflect.ClassTag import scala.util.Try import scala.util.control.{ControlThrowable, NonFatal} +import scala.util.matching.Regex import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} @@ -55,7 +56,7 @@ import org.slf4j.Logger import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES} +import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} @@ -2571,6 +2572,36 @@ private[spark] object Utils extends Logging { sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten } } + + private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + + def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = { + val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r + redact(redactionPattern, kvs) + } + + private def redact(redactionPattern: Regex, kvs: Seq[(String, String)]): Seq[(String, String)] = { + kvs.map { kv => + redactionPattern.findFirstIn(kv._1) + .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) } + .getOrElse(kv) + } + } + + /** + * Looks up the redaction regex from within the key value pairs and uses it to redact the rest + * of the key value pairs. No care is taken to make sure the redaction property itself is not + * redacted. So theoretically, the property itself could be configured to redact its own value + * when printing. + */ + def redact(kvs: Map[String, String]): Seq[(String, String)] = { + val redactionPattern = kvs.getOrElse( + SECRET_REDACTION_PATTERN.key, + SECRET_REDACTION_PATTERN.defaultValueString + ).r + redact(redactionPattern, kvs.toArray) + } + } private[util] object CallerContext extends Logging { http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala ---------------------------------------------------------------------- diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 7f48592..b5a9d17 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -95,6 +95,18 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } } + test("Event logging with password redaction") { + val key = "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD" + val secretPassword = "secret_password" + val conf = getLoggingConf(testDirPath, None) + .set(key, secretPassword) + val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf) + val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty) + val event = SparkListenerEnvironmentUpdate(envDetails) + val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap + assert(redactedProps(key) == "*********(redacted)") + } + test("Log overwriting") { val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None) val logPath = new URI(logUri).getPath http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala ---------------------------------------------------------------------- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8706d72..15339eb 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -975,4 +975,24 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(pValue > threshold) } + + test("redact sensitive information") { + val sparkConf = new SparkConf + + // Set some secret keys + val secretKeys = Seq( + "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", + "spark.my.password", + "spark.my.sECreT") + secretKeys.foreach { key => sparkConf.set(key, "secret_password") } + // Set a non-secret key + sparkConf.set("spark.regular.property", "not_a_secret") + + // Redact sensitive information + val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap + + // Assert that secret information got redacted while the regular property remained the same + secretKeys.foreach { key => assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT) } + assert(redactedConf("spark.regular.property") === "not_a_secret") + } } http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/docs/configuration.md ---------------------------------------------------------------------- diff --git a/docs/configuration.md b/docs/configuration.md index 7c51e13..5f15d5b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -351,6 +351,15 @@ Apart from these, the following properties are also available, and may be useful </td> </tr> <tr> + <td><code>spark.redaction.regex</code></td> + <td>(?i)secret|password</td> + <td> + Regex to decide which Spark configuration properties and environment variables in driver and + executor environments contain sensitive information. When this regex matches a property, its + value is redacted from the environment UI and various logs like YARN and event logs. + </td> +</tr> +<tr> <td><code>spark.python.profile</code></td> <td>false</td> <td> http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala ---------------------------------------------------------------------- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 8e0533f..868c2ed 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark.{SecurityManager, SparkConf, SparkException} -import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils @@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable( |=============================================================================== |YARN executor launch context: | env: - |${env.map { case (k, v) => s" $k -> $v\n" }.mkString} + |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString} | command: | ${commands.mkString(" \\ \n ")} | --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org