Repository: spark
Updated Branches:
  refs/heads/branch-2.1 5634fadb0 -> 444cca14d


[SPARK-18535][SPARK-19720][CORE][BACKPORT-2.1] Redact sensitive information

## What changes were proposed in this pull request?

Backport of SPARK-18535 and SPARK-19720 to Spark 2.1.

This backport PR redacts sensitive information, keyed off a configurable regex, from the Spark
UI and from spark-submit console logs.
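
For example, with the default `spark.redaction.regex` of `(?i)secret|password`, any property whose
key contains "secret" or "password" has its value shown as `*********(redacted)`. A minimal sketch
of the behavior (the properties below are illustrative):

```scala
import org.apache.spark.SparkConf
import org.apache.spark.util.Utils

val conf = new SparkConf()
  .set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", "secret_password")
  .set("spark.regular.property", "not_a_secret")

// Keys matching spark.redaction.regex get their values masked:
//   spark.executorEnv.HADOOP_CREDSTORE_PASSWORD -> *********(redacted)
//   spark.regular.property -> not_a_secret
Utils.redact(conf, conf.getAll).foreach { case (k, v) => println(s"$k -> $v") }
```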

It is based on Mark Grover's (m...@apache.org) original PRs.

## How was this patch tested?

The same tests from the original PRs were applied.

Author: Mark Grover <m...@apache.org>

Closes #18802 from dmvieira/feature-redact.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/444cca14
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/444cca14
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/444cca14

Branch: refs/heads/branch-2.1
Commit: 444cca14d7ac8c5ab5d7e9d080b11f4d6babe3bf
Parents: 5634fad
Author: Mark Grover <m...@apache.org>
Authored: Mon Aug 7 14:23:05 2017 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Mon Aug 7 14:23:05 2017 -0700

----------------------------------------------------------------------
 .../org/apache/spark/deploy/SparkSubmit.scala   |  3 +-
 .../spark/deploy/SparkSubmitArguments.scala     | 12 +++++--
 .../apache/spark/internal/config/package.scala  |  9 ++++++
 .../spark/scheduler/EventLoggingListener.scala  | 13 +++++++-
 .../apache/spark/ui/env/EnvironmentPage.scala   | 12 +++----
 .../apache/spark/ui/env/EnvironmentTab.scala    |  1 +
 .../scala/org/apache/spark/util/Utils.scala     | 33 +++++++++++++++++++-
 .../scheduler/EventLoggingListenerSuite.scala   | 12 +++++++
 .../org/apache/spark/util/UtilsSuite.scala      | 20 ++++++++++++
 docs/configuration.md                           |  9 ++++++
 .../spark/deploy/yarn/ExecutorRunnable.scala    |  3 +-
 11 files changed, 111 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 443f1f5..653830e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -670,7 +670,8 @@ object SparkSubmit {
     if (verbose) {
       printStream.println(s"Main class:\n$childMainClass")
       printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
-      printStream.println(s"System properties:\n${sysProps.mkString("\n")}")
+      // sysProps may contain sensitive information, so redact before printing
+      printStream.println(s"System 
properties:\n${Utils.redact(sysProps).mkString("\n")}")
       printStream.println(s"Classpath 
elements:\n${childClasspath.mkString("\n")}")
       printStream.println("\n")
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index f1761e7..883842c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -84,9 +84,15 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     // scalastyle:off println
     if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile")
     Option(propertiesFile).foreach { filename =>
-      Utils.getPropertiesFromFile(filename).foreach { case (k, v) =>
+      val properties = Utils.getPropertiesFromFile(filename)
+      properties.foreach { case (k, v) =>
         defaultProperties(k) = v
-        if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v")
+      }
+      // Property files may contain sensitive information, so redact before printing
+      if (verbose) {
+        Utils.redact(properties).foreach { case (k, v) =>
+          SparkSubmit.printStream.println(s"Adding default property: $k=$v")
+        }
       }
     }
     // scalastyle:on println
@@ -318,7 +324,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     |
     |Spark properties used, including those specified through
     | --conf and those from the properties file $propertiesFile:
-    |${sparkProperties.mkString("  ", "\n  ", "\n")}
+    |${Utils.redact(sparkProperties).mkString("  ", "\n  ", "\n")}
     """.stripMargin
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/internal/config/package.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index f4844de..33396a5 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -220,4 +220,13 @@ package object config {
       " bigger files.")
     .longConf
     .createWithDefault(4 * 1024 * 1024)
+
+  private[spark] val SECRET_REDACTION_PATTERN =
+    ConfigBuilder("spark.redaction.regex")
+      .doc("Regex to decide which Spark configuration properties and 
environment variables in " +
+        "driver and executor environments contain sensitive information. When 
this regex matches " +
+        "a property, its value is redacted from the environment UI and various 
logs like YARN " +
+        "and event logs.")
+      .stringConf
+      .createWithDefault("(?i)secret|password")
 }
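
The default pattern above only matches keys containing "secret" or "password" (case-insensitive).
A hedged sketch of widening it per job, assuming you also want keys containing "token" masked (the
extra alternative is an example, not part of this patch):

```scala
import org.apache.spark.SparkConf

// Extend the default regex so "token"-bearing keys are redacted as well.
val conf = new SparkConf()
  .set("spark.redaction.regex", "(?i)secret|password|token")
```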

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
index ce78774..f39565e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -153,7 +153,9 @@ private[spark] class EventLoggingListener(
 
   override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event)
 
-  override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event)
+  override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = {
+    logEvent(redactEvent(event))
+  }
 
   // Events that trigger a flush
   override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
@@ -231,6 +233,15 @@ private[spark] class EventLoggingListener(
     }
   }
 
+  private[spark] def redactEvent(
+      event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = {
+    // "Spark Properties" entry will always exist because the map is always populated with it.
+    val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties"))
+    val redactedEnvironmentDetails = event.environmentDetails +
+      ("Spark Properties" -> redactedProps)
+    SparkListenerEnvironmentUpdate(redactedEnvironmentDetails)
+  }
+
 }
 
 private[spark] object EventLoggingListener extends Logging {

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala
index 9f6e9a6..b11f8f1 100644
--- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala
@@ -22,21 +22,17 @@ import javax.servlet.http.HttpServletRequest
 import scala.xml.Node
 
 import org.apache.spark.ui.{UIUtils, WebUIPage}
+import org.apache.spark.util.Utils
 
 private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") {
   private val listener = parent.listener
 
-  private def removePass(kv: (String, String)): (String, String) = {
-    if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) {
-      (kv._1, "******")
-    } else kv
-  }
-
   def render(request: HttpServletRequest): Seq[Node] = {
     val runtimeInformationTable = UIUtils.listingTable(
       propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true)
-    val sparkPropertiesTable = UIUtils.listingTable(
-      propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true)
+    val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow,
+      Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true)
+
     val systemPropertiesTable = UIUtils.listingTable(
       propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true)
     val classpathEntriesTable = UIUtils.listingTable(

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala
index f62260c..70b3ffd 100644
--- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala
@@ -23,6 +23,7 @@ import org.apache.spark.ui._
 
 private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
   val listener = parent.environmentListener
+  val conf = parent.conf
   attachPage(new EnvironmentPage(this))
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 56de802..aebc257 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -38,6 +38,7 @@ import scala.io.Source
 import scala.reflect.ClassTag
 import scala.util.Try
 import scala.util.control.{ControlThrowable, NonFatal}
+import scala.util.matching.Regex
 
 import _root_.io.netty.channel.unix.Errors.NativeIoException
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
@@ -55,7 +56,7 @@ import org.slf4j.Logger
 import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES}
+import org.apache.spark.internal.config._
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance}
 
@@ -2571,6 +2572,36 @@ private[spark] object Utils extends Logging {
       sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
     }
   }
+
+  private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+  def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = {
+    val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r
+    redact(redactionPattern, kvs)
+  }
+
+  private def redact(redactionPattern: Regex, kvs: Seq[(String, String)]): Seq[(String, String)] = {
+    kvs.map { kv =>
+      redactionPattern.findFirstIn(kv._1)
+        .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) }
+        .getOrElse(kv)
+    }
+  }
+
+  /**
+   * Looks up the redaction regex from within the key value pairs and uses it to redact the rest
+   * of the key value pairs. No care is taken to make sure the redaction property itself is not
+   * redacted. So theoretically, the property itself could be configured to redact its own value
+   * when printing.
+   */
+  def redact(kvs: Map[String, String]): Seq[(String, String)] = {
+    val redactionPattern = kvs.getOrElse(
+      SECRET_REDACTION_PATTERN.key,
+      SECRET_REDACTION_PATTERN.defaultValueString
+    ).r
+    redact(redactionPattern, kvs.toArray)
+  }
+
 }
 
 private[util] object CallerContext extends Logging {
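
The `Map`-based `redact` overload above is the one spark-submit uses before any `SparkConf` exists:
the regex is looked up from the very properties being redacted, falling back to the default. A small
sketch of that behavior with illustrative values (note `Utils` is `private[spark]`, so this only
runs inside the spark package):

```scala
import org.apache.spark.util.Utils

val props = Map(
  "spark.redaction.regex" -> "(?i)secret|password", // pattern read from the map itself
  "spark.my.password" -> "hunter2",
  "spark.app.name" -> "demo")

// Prints spark.my.password as *********(redacted); the other keys pass through unchanged.
Utils.redact(props).foreach { case (k, v) => println(s"$k=$v") }
```

As the scaladoc notes, the redaction property is not special-cased: with the default pattern its own
value survives, but a regex matching "redaction" would mask it too.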

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
index 7f48592..b5a9d17 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
@@ -95,6 +95,18 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
     }
   }
 
+  test("Event logging with password redaction") {
+    val key = "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD"
+    val secretPassword = "secret_password"
+    val conf = getLoggingConf(testDirPath, None)
+      .set(key, secretPassword)
+    val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf)
+    val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty)
+    val event = SparkListenerEnvironmentUpdate(envDetails)
+    val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap
+    assert(redactedProps(key) == "*********(redacted)")
+  }
+
   test("Log overwriting") {
     val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None)
     val logPath = new URI(logUri).getPath

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 8706d72..15339eb 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -975,4 +975,24 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
 
     assert(pValue > threshold)
   }
+
+  test("redact sensitive information") {
+    val sparkConf = new SparkConf
+
+    // Set some secret keys
+    val secretKeys = Seq(
+      "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
+      "spark.my.password",
+      "spark.my.sECreT")
+    secretKeys.foreach { key => sparkConf.set(key, "secret_password") }
+    // Set a non-secret key
+    sparkConf.set("spark.regular.property", "not_a_secret")
+
+    // Redact sensitive information
+    val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap
+
+    // Assert that secret information got redacted while the regular property remained the same
+    secretKeys.foreach { key => assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT) }
+    assert(redactedConf("spark.regular.property") === "not_a_secret")
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 7c51e13..5f15d5b 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -351,6 +351,15 @@ Apart from these, the following properties are also available, and may be useful
   </td>
 </tr>
 <tr>
+  <td><code>spark.redaction.regex</code></td>
+  <td>(?i)secret|password</td>
+  <td>
+    Regex to decide which Spark configuration properties and environment variables in driver and
+    executor environments contain sensitive information. When this regex matches a property, its
+    value is redacted from the environment UI and various logs like YARN and event logs.
+  </td>
+</tr>
+<tr>
   <td><code>spark.python.profile</code></td>
   <td>false</td>
   <td>

http://git-wip-us.apache.org/repos/asf/spark/blob/444cca14/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
index 8e0533f..868c2ed 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
@@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC
 import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
-import org.apache.spark.deploy.yarn.config._
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.launcher.YarnCommandBuilderUtils
@@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable(
     |===============================================================================
     |YARN executor launch context:
     |  env:
-    |${env.map { case (k, v) => s"    $k -> $v\n" }.mkString}
+    |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s"    $k -> $v\n" }.mkString}
     |  command:
     |    ${commands.mkString(" \\ \n      ")}
     |

