[spark] branch branch-3.0 updated: [SPARK-33504][CORE] The application log in the Spark history server contains sensitive attributes should be redacted

tgraves Wed, 02 Dec 2020 07:28:23 -0800

This is an automated email from the ASF dual-hosted git repository.

tgraves pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new e59179b  [SPARK-33504][CORE] The application log in the Spark history 
server contains sensitive attributes should be redacted
e59179b is described below

commit e59179b7326112f526e4c000e21146df283d861c
Author: neko <echoh...@gmail.com>
AuthorDate: Wed Dec 2 09:24:19 2020 -0600

    [SPARK-33504][CORE] The application log in the Spark history server 
contains sensitive attributes should be redacted
    
    ### What changes were proposed in this pull request?
    To make sure the sensitive attributes to be redacted in the history server 
log.
    
    ### Why are the changes needed?
    We found the secure attributes like password  in SparkListenerJobStart and 
SparkListenerStageSubmitted events would not been redated, resulting in 
sensitive attributes can be viewd directly.
    The screenshot can be viewed in the attachment of JIRA spark-33504
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    muntual test works well, I have also added unit testcase.
    
    Closes #30446 from akiyamaneko/eventlog_unredact.
    
    Authored-by: neko <echoh...@gmail.com>
    Signed-off-by: Thomas Graves <tgra...@apache.org>
    (cherry picked from commit 28dad1ba770e5b7f7cf542da1ae3f05975a969c6)
    Signed-off-by: Thomas Graves <tgra...@apache.org>
---
 .../spark/scheduler/EventLoggingListener.scala     | 24 +++++++-
 .../scheduler/EventLoggingListenerSuite.scala      | 64 +++++++++++++++++++++-
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala 
b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
index 24e2a5e..5673c02 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -18,7 +18,9 @@
 package org.apache.spark.scheduler
 
 import java.net.URI
+import java.util.Properties
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable
 
 import org.apache.hadoop.conf.Configuration
@@ -103,7 +105,7 @@ private[spark] class EventLoggingListener(
 
   // Events that do not trigger a flush
   override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = {
-    logEvent(event)
+    logEvent(event.copy(properties = redactProperties(event.properties)))
     if (shouldLogStageExecutorMetrics) {
       // record the peak metrics for the new stage
       liveStageExecutorMetrics.put((event.stageInfo.stageId, 
event.stageInfo.attemptNumber()),
@@ -156,7 +158,9 @@ private[spark] class EventLoggingListener(
     logEvent(event, flushLogger = true)
   }
 
-  override def onJobStart(event: SparkListenerJobStart): Unit = 
logEvent(event, flushLogger = true)
+  override def onJobStart(event: SparkListenerJobStart): Unit = {
+    logEvent(event.copy(properties = redactProperties(event.properties)), 
flushLogger = true)
+  }
 
   override def onJobEnd(event: SparkListenerJobEnd): Unit = logEvent(event, 
flushLogger = true)
 
@@ -246,6 +250,22 @@ private[spark] class EventLoggingListener(
     logWriter.stop()
   }
 
+  private def redactProperties(properties: Properties): Properties = {
+    if (properties == null) {
+      return properties
+    }
+    val redactedProperties = new Properties
+    // properties may contain some custom local properties such as stage/job 
description
+    // only properties in sparkConf need to be redacted.
+    val (globalProperties, localProperties) = 
properties.asScala.toSeq.partition {
+      case (key, _) => sparkConf.contains(key)
+    }
+    (Utils.redact(sparkConf, globalProperties) ++ localProperties).foreach {
+      case (key, value) => redactedProperties.setProperty(key, value)
+    }
+    redactedProperties
+  }
+
   private[spark] def redactEvent(
       event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = 
{
     // environmentDetails maps a string descriptor to a set of properties
diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
 
b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
index 046564d..e0e6406 100644
--- 
a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.scheduler
 
 import java.io.{File, InputStream}
-import java.util.Arrays
+import java.util.{Arrays, Properties}
 
 import scala.collection.immutable.Map
 import scala.collection.mutable
@@ -96,6 +96,68 @@ class EventLoggingListenerSuite extends SparkFunSuite with 
LocalSparkContext wit
     assert(redactedProps(key) == "*********(redacted)")
   }
 
+  test("Spark-33504 sensitive attributes redaction in properties") {
+    val (secretKey, secretPassword) = 
("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
+      "secret_password")
+    val (customKey, customValue) = ("parse_token", "secret_password")
+
+    val conf = getLoggingConf(testDirPath, None).set(secretKey, secretPassword)
+
+    val properties = new Properties()
+    properties.setProperty(secretKey, secretPassword)
+    properties.setProperty(customKey, customValue)
+
+    val logName = "properties-reaction-test"
+    val eventLogger = new EventLoggingListener(logName, None, 
testDirPath.toUri(), conf)
+    val listenerBus = new LiveListenerBus(conf)
+
+    val stageId = 1
+    val jobId = 1
+    val stageInfo = new StageInfo(stageId, 0, stageId.toString, 0,
+      Seq.empty, Seq.empty, "details",
+      resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)
+
+    val events = Array(SparkListenerStageSubmitted(stageInfo, properties),
+      SparkListenerJobStart(jobId, 0, Seq(stageInfo), properties))
+
+    eventLogger.start()
+    listenerBus.start(Mockito.mock(classOf[SparkContext]), 
Mockito.mock(classOf[MetricsSystem]))
+    listenerBus.addToEventLogQueue(eventLogger)
+    events.foreach(event => listenerBus.post(event))
+    listenerBus.stop()
+    eventLogger.stop()
+
+    val logData = EventLogFileReader.openEventLog(new 
Path(eventLogger.logWriter.logPath),
+      fileSystem)
+    try {
+      val lines = readLines(logData)
+      val logStart = SparkListenerLogStart(SPARK_VERSION)
+      assert(lines.size === 3)
+      assert(lines(0).contains("SparkListenerLogStart"))
+      assert(lines(1).contains("SparkListenerStageSubmitted"))
+      assert(lines(2).contains("SparkListenerJobStart"))
+
+      lines.foreach{
+        line => JsonProtocol.sparkEventFromJson(parse(line)) match {
+          case logStartEvent: SparkListenerLogStart =>
+            assert(logStartEvent == logStart)
+
+          case stageSubmittedEvent: SparkListenerStageSubmitted =>
+            assert(stageSubmittedEvent.properties.getProperty(secretKey) == 
"*********(redacted)")
+            assert(stageSubmittedEvent.properties.getProperty(customKey) ==  
customValue)
+
+          case jobStartEvent : SparkListenerJobStart =>
+            assert(jobStartEvent.properties.getProperty(secretKey) == 
"*********(redacted)")
+            assert(jobStartEvent.properties.getProperty(customKey) ==  
customValue)
+
+          case _ => assert(false)
+        }
+      }
+    } finally {
+      logData.close()
+    }
+  }
+
   test("Executor metrics update") {
     testStageExecutorMetricsEventLogging()
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.0 updated: [SPARK-33504][CORE] The application log in the Spark history server contains sensitive attributes should be redacted

Reply via email to