aryangupta1998 commented on code in PR #8386:
URL: https://github.com/apache/ozone/pull/8386#discussion_r2072876551
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java:
##########
@@ -122,21 +131,65 @@ protected synchronized boolean validate() {
@Override
protected synchronized void process(Pipeline pipeline) {
+ Preconditions.checkNotNull(pipeline);
// When SCM is in safe mode for long time, already registered
// datanode can send pipeline report again, or SCMPipelineManager will
// create new pipelines.
- Preconditions.checkNotNull(pipeline);
- if (pipeline.getType() == HddsProtos.ReplicationType.RATIS &&
- ((RatisReplicationConfig) pipeline.getReplicationConfig())
- .getReplicationFactor() == HddsProtos.ReplicationFactor.THREE &&
- !processedPipelineIDs.contains(pipeline.getId())) {
- getSafeModeMetrics().incCurrentHealthyPipelinesCount();
- currentHealthyPipelineCount++;
- processedPipelineIDs.add(pipeline.getId());
- unProcessedPipelineSet.remove(pipeline.getId());
+
+ // Only handle RATIS + 3-replica pipelines.
+ if (pipeline.getType() != HddsProtos.ReplicationType.RATIS ||
+ ((RatisReplicationConfig) pipeline.getReplicationConfig()).getReplicationFactor() !=
+ HddsProtos.ReplicationFactor.THREE) {
+ SCMSafeModeManager.getLogger().warn(
+ "Skipping pipeline safemode report processing as Replication type isn't RATIS " +
+ "or replication factor isn't 3.");
+ return;
+ }
+
+ // Skip already processed ones.
+ if (processedPipelineIDs.contains(pipeline.getId())) {
+ LOG.info("Skipping pipeline safemode report processing check as pipeline: {} is already recorded.",
+ pipeline.getId());
+ return;
+ }
+
+ List<DatanodeDetails> pipelineDns = pipeline.getNodes();
+ if (pipelineDns.size() != 3) {
+ LOG.warn("Only {} DNs reported this pipeline: {}, all 3 DNs should report the pipeline", pipelineDns.size(),
+ pipeline.getId());
+ return;
}
+ Map<DatanodeDetails, String> badDnsWithReasons = new LinkedHashMap<>();
+
+ for (DatanodeDetails dn : pipelineDns) {
+ try {
+ NodeStatus status = nodeManager.getNodeStatus(dn);
+ if (!status.equals(NodeStatus.inServiceHealthy())) {
+ String reason = String.format("Health: %s, Operational State: %s",
+ status.getHealth(), status.getOperationalState());
+ badDnsWithReasons.put(dn, reason);
+ }
+ } catch (NodeNotFoundException e) {
+ badDnsWithReasons.put(dn, "DN not registered with SCM");
+ }
+ }
+
+ if (!badDnsWithReasons.isEmpty()) {
+ LOG.warn("Below DNs reported by Pipeline: {} are either in bad health or un-registered with SCMs",
+ pipeline.getId());
+ for (Map.Entry<DatanodeDetails, String> entry : badDnsWithReasons.entrySet()) {
+ LOG.warn("DN {}: {}", entry.getKey().getID(), entry.getValue());
+ }
+ return;
+ }
Review Comment:
Wouldn't it be better to display the information for each DN one by one?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]