This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new f04ebccd4 [CELEBORN-1368] Log celeborn config for debugging purposes
f04ebccd4 is described below
commit f04ebccd4d34561335d7b54be189264e58d3f7e7
Author: Aravind Patnam <[email protected]>
AuthorDate: Mon Apr 8 15:11:35 2024 +0800
[CELEBORN-1368] Log celeborn config for debugging purposes
### What changes were proposed in this pull request?
Log celeborn config for debugging purposes.
### Why are the changes needed?
Help with debugging
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
tested the patch internally.
Closes #2442 from akpatnam25/CELEBORN-1368.
Authored-by: Aravind Patnam <[email protected]>
Signed-off-by: mingji <[email protected]>
---
.../org/apache/celeborn/common/CelebornConf.scala | 22 ++++++++++++
.../org/apache/celeborn/common/util/Utils.scala | 41 ++++++++++++++++++++++
docs/configuration/master.md | 2 ++
docs/configuration/worker.md | 2 ++
.../celeborn/service/deploy/master/Master.scala | 4 +++
.../celeborn/server/common/HttpService.scala | 6 ++--
.../celeborn/service/deploy/worker/Worker.scala | 5 +++
7 files changed, 80 insertions(+), 2 deletions(-)
diff --git
a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
index d8b482877..6ab67fb89 100644
--- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
@@ -1281,6 +1281,10 @@ class CelebornConf(loadDefaults: Boolean) extends
Cloneable with Logging with Se
// Rack Resolver //
// //////////////////////////////////////////////////////
def rackResolverRefreshInterval = get(RACKRESOLVER_REFRESH_INTERVAL)
+
+ def logCelebornConfEnabled = get(LOG_CELEBORN_CONF_ENABLED)
+
+ def secretRedactionPattern = get(SECRET_REDACTION_PATTERN)
}
object CelebornConf extends Logging {
@@ -4924,4 +4928,22 @@ object CelebornConf extends Logging {
s"Invalid maxEncryptedBlockSize, must be a position number upto
${Int.MaxValue}")
.createWithDefaultString("64k")
+ val SECRET_REDACTION_PATTERN =
+ buildConf("celeborn.redaction.regex")
+ .categories("master", "worker")
+ .doc("Regex to decide which Celeborn configuration properties and
environment variables in " +
+ "master and worker environments contain sensitive information. When
this regex matches " +
+ "a property key or value, the value is redacted from the logging.")
+ .version("0.5.0")
+ .regexConf
+ .createWithDefault("(?i)secret|password|token|access[.]key".r)
+
+ val LOG_CELEBORN_CONF_ENABLED: ConfigEntry[Boolean] =
+ buildConf("celeborn.logConf.enabled")
+ .categories("master", "worker")
+ .version("0.5.0")
+ .doc("When `true`, log the CelebornConf for debugging purposes.")
+ .booleanConf
+ .createWithDefault(false)
+
}
diff --git a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
index bab5b5d8b..07e9fe30b 100644
--- a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
@@ -34,6 +34,7 @@ import scala.io.Source
import scala.reflect.ClassTag
import scala.util.{Random => ScalaRandom, Try}
import scala.util.control.{ControlThrowable, NonFatal}
+import scala.util.matching.Regex
import com.google.protobuf.{ByteString, GeneratedMessageV3}
import io.netty.channel.unix.Errors.NativeIoException
@@ -1101,4 +1102,44 @@ object Utils extends Logging {
val host = components.dropRight(portsNum).mkString(":")
Array(host) ++ portsArr
}
+
+ private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+ /**
+ * Redact the sensitive values in the given map. If a map key matches the
redaction pattern then
+ * its value is replaced with a dummy text.
+ */
+ def redact(conf: CelebornConf, kvs: Seq[(String, String)]): Seq[(String,
String)] = {
+ val redactionPattern = conf.secretRedactionPattern
+ redact(redactionPattern, kvs)
+ }
+
+ private def redact[K, V](redactionPattern: Regex, kvs: Seq[(K, V)]): Seq[(K,
V)] = {
+ // If the sensitive information regex matches with either the key or the
value, redact the value
+ // While the original intent was to only redact the value if the key
matched with the regex,
+ // we've found that especially in verbose mode, the value of the property
may contain sensitive
+ // information like so:
+ //
+ // celeborn.dynamicConfig.store.db.hikari.password=secret_password ...
+ //
+ // And, in such cases, simply searching for the sensitive information
regex in the key name is
+ // not sufficient. The values themselves have to be searched as well and
redacted if matched.
+ // This does mean we may be accounting more false positives - for example,
if the value of an
+ // arbitrary property contained the term 'password', we may redact the
value from the UI and
+ // logs. In order to work around it, user would have to make the
celeborn.redaction.regex property
+ // more specific.
+ kvs.map {
+ case (key: String, value: String) =>
+ redactionPattern.findFirstIn(key)
+ .orElse(redactionPattern.findFirstIn(value))
+ .map { _ => (key, REDACTION_REPLACEMENT_TEXT) }
+ .getOrElse((key, value))
+ case (key, value: String) =>
+ redactionPattern.findFirstIn(value)
+ .map { _ => (key, REDACTION_REPLACEMENT_TEXT) }
+ .getOrElse((key, value))
+ case (key, value) =>
+ (key, value)
+ }.asInstanceOf[Seq[(K, V)]]
+ }
}
diff --git a/docs/configuration/master.md b/docs/configuration/master.md
index e2217868b..582f3f8b3 100644
--- a/docs/configuration/master.md
+++ b/docs/configuration/master.md
@@ -33,6 +33,7 @@ license: |
| celeborn.dynamicConfig.store.db.hikari.username | | false | The username of
db store backend. | 0.5.0 | |
| celeborn.dynamicConfig.store.fs.path | <undefined> | false | The path
of dynamic config file for fs store backend. The file format should be yaml.
The default path is `${CELEBORN_CONF_DIR}/dynamicConfig.yaml`. | 0.5.0 | |
| celeborn.internal.port.enabled | false | false | Whether to create a
internal port on Masters/Workers for inter-Masters/Workers communication. This
is beneficial when SASL authentication is enforced for all interactions between
clients and Celeborn Services, but the services can exchange messages without
being subject to SASL authentication. | 0.5.0 | |
+| celeborn.logConf.enabled | false | false | When `true`, log the CelebornConf
for debugging purposes. | 0.5.0 | |
| celeborn.master.estimatedPartitionSize.initialSize | 64mb | false | Initial
partition size for estimation, it will change according to runtime stats. |
0.3.0 | celeborn.shuffle.initialEstimatedPartitionSize |
| celeborn.master.estimatedPartitionSize.maxSize | <undefined> | false |
Max partition size for estimation. Default value should be
celeborn.worker.shuffle.partitionSplit.max * 2. | 0.4.1 | |
| celeborn.master.estimatedPartitionSize.minSize | 8mb | false | Ignore
partition size smaller than this configuration of partition size for
estimation. | 0.3.0 | celeborn.shuffle.minPartitionSizeToEstimate |
@@ -60,6 +61,7 @@ license: |
| celeborn.master.userResourceConsumption.update.interval | 30s | false | Time
length for a window about compute user resource consumption. | 0.3.0 | |
| celeborn.master.workerUnavailableInfo.expireTimeout | 1800s | false | Worker
unavailable info would be cleared when the retention period is expired | 0.3.1
| |
| celeborn.quota.enabled | true | false | When Master side sets to true, the
master will enable to check the quota via QuotaManager. When Client side sets
to true, LifecycleManager will request Master side to check whether the current
user has enough quota before registration of shuffle. Fallback to the default
shuffle service of Spark when Master side checks that there is no enough quota
for current user. | 0.2.0 | |
+| celeborn.redaction.regex | (?i)secret|password|token|access[.]key | false |
Regex to decide which Celeborn configuration properties and environment
variables in master and worker environments contain sensitive information. When
this regex matches a property key or value, the value is redacted from the
logging. | 0.5.0 | |
| celeborn.storage.availableTypes | HDD | false | Enabled storages. Available
options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical.
| 0.3.0 | celeborn.storage.activeTypes |
| celeborn.storage.hdfs.dir | <undefined> | false | HDFS base directory
for Celeborn to store shuffle data. | 0.2.0 | |
| celeborn.storage.hdfs.kerberos.keytab | <undefined> | false | Kerberos
keytab file path for HDFS storage connection. | 0.3.2 | |
diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md
index 675ec7064..63bd69b7a 100644
--- a/docs/configuration/worker.md
+++ b/docs/configuration/worker.md
@@ -33,9 +33,11 @@ license: |
| celeborn.dynamicConfig.store.db.hikari.username | | false | The username of
db store backend. | 0.5.0 | |
| celeborn.dynamicConfig.store.fs.path | <undefined> | false | The path
of dynamic config file for fs store backend. The file format should be yaml.
The default path is `${CELEBORN_CONF_DIR}/dynamicConfig.yaml`. | 0.5.0 | |
| celeborn.internal.port.enabled | false | false | Whether to create a
internal port on Masters/Workers for inter-Masters/Workers communication. This
is beneficial when SASL authentication is enforced for all interactions between
clients and Celeborn Services, but the services can exchange messages without
being subject to SASL authentication. | 0.5.0 | |
+| celeborn.logConf.enabled | false | false | When `true`, log the CelebornConf
for debugging purposes. | 0.5.0 | |
| celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of
master nodes for celeborn client to connect, allowed pattern is:
`<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If
the port is omitted, 9097 will be used. | 0.2.0 | |
| celeborn.master.estimatedPartitionSize.minSize | 8mb | false | Ignore
partition size smaller than this configuration of partition size for
estimation. | 0.3.0 | celeborn.shuffle.minPartitionSizeToEstimate |
| celeborn.master.internal.endpoints | <localhost>:8097 | false |
Endpoints of master nodes just for celeborn workers to connect, allowed pattern
is: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:8097,clb2:8097,clb3:8097`.
If the port is omitted, 8097 will be used. | 0.5.0 | |
+| celeborn.redaction.regex | (?i)secret|password|token|access[.]key | false |
Regex to decide which Celeborn configuration properties and environment
variables in master and worker environments contain sensitive information. When
this regex matches a property key or value, the value is redacted from the
logging. | 0.5.0 | |
| celeborn.shuffle.chunk.size | 8m | false | Max chunk size of reducer's
merged shuffle data. For example, if a reducer's shuffle data is 128M and the
data will need 16 fetch chunk requests to fetch. | 0.2.0 | |
| celeborn.storage.availableTypes | HDD | false | Enabled storages. Available
options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical.
| 0.3.0 | celeborn.storage.activeTypes |
| celeborn.storage.hdfs.dir | <undefined> | false | HDFS base directory
for Celeborn to store shuffle data. | 0.2.0 | |
diff --git
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 736cc7e45..835ba96ab 100644
---
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -82,6 +82,10 @@ private[celeborn] class Master(
// Send ApplicationMeta to workers
private var sendApplicationMetaExecutor: ExecutorService = _
+ if (conf.logCelebornConfEnabled) {
+ logInfo(getConf)
+ }
+
override val rpcEnv: RpcEnv =
if (!authEnabled) {
RpcEnv.create(
diff --git
a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
index 18e016187..540b3eadd 100644
--- a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
+++ b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
@@ -23,6 +23,7 @@ import scala.collection.JavaConverters._
import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.internal.Logging
+import org.apache.celeborn.common.util.Utils
import org.apache.celeborn.server.common.http.HttpServer
import org.apache.celeborn.server.common.http.api.ApiRootResource
import org.apache.celeborn.server.common.service.config.ConfigLevel
@@ -35,8 +36,9 @@ abstract class HttpService extends Service with Logging {
val sb = new StringBuilder
sb.append("=========================== Configuration
============================\n")
if (conf.getAll.nonEmpty) {
- val maxKeyLength = conf.getAll.toMap.keys.map(_.length).max
- conf.getAll.sortBy(_._1).foreach { case (key, value) =>
+ val redactedConf = Utils.redact(conf, conf.getAll)
+ val maxKeyLength = redactedConf.toMap.keys.map(_.length).max
+ redactedConf.sortBy(_._1).foreach { case (key, value) =>
sb.append(config(key, value, maxKeyLength))
}
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 0da4b5513..e1d4bf812 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -86,6 +86,11 @@ private[celeborn] class Worker(
val workerStatusManager = new WorkerStatusManager(conf)
private val authEnabled = conf.authEnabled
private val secretRegistry = new
WorkerSecretRegistryImpl(conf.workerApplicationRegistryCacheSize)
+
+ if (conf.logCelebornConfEnabled) {
+ logInfo(getConf)
+ }
+
val rpcEnv: RpcEnv =
if (!authEnabled) {
RpcEnv.create(