peterxcli commented on code in PR #8386:
URL: https://github.com/apache/ozone/pull/8386#discussion_r2077479063


##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java:
##########
@@ -122,21 +131,65 @@ protected synchronized boolean validate() {
 
   @Override
   protected synchronized void process(Pipeline pipeline) {
+    Preconditions.checkNotNull(pipeline);
 
     // When SCM is in safe mode for long time, already registered
     // datanode can send pipeline report again, or SCMPipelineManager will
     // create new pipelines.
-    Preconditions.checkNotNull(pipeline);
-    if (pipeline.getType() == HddsProtos.ReplicationType.RATIS &&
-        ((RatisReplicationConfig) pipeline.getReplicationConfig())
-            .getReplicationFactor() == HddsProtos.ReplicationFactor.THREE &&
-        !processedPipelineIDs.contains(pipeline.getId())) {
-      getSafeModeMetrics().incCurrentHealthyPipelinesCount();
-      currentHealthyPipelineCount++;
-      processedPipelineIDs.add(pipeline.getId());
-      unProcessedPipelineSet.remove(pipeline.getId());
+
+    // Only handle RATIS + 3-replica pipelines.
+    if (pipeline.getType() != HddsProtos.ReplicationType.RATIS ||
+        ((RatisReplicationConfig) 
pipeline.getReplicationConfig()).getReplicationFactor() !=
+            HddsProtos.ReplicationFactor.THREE) {
+      SCMSafeModeManager.getLogger().warn(
+          "Skipping pipeline safemode report processing as Replication type 
isn't RATIS " +
+              "or replication factor isn't 3.");
+      return;
+    }
+
+    // Skip already processed ones.
+    if (processedPipelineIDs.contains(pipeline.getId())) {
+      LOG.info("Skipping pipeline safemode report processing check as 
pipeline: {} is already recorded.",
+          pipeline.getId());
+      return;
+    }
+
+    List<DatanodeDetails> pipelineDns = pipeline.getNodes();
+    if (pipelineDns.size() != 3) {
+      LOG.warn("Only {} DNs reported this pipeline: {}, all 3 DNs should 
report the pipeline", pipelineDns.size(),
+          pipeline.getId());
+      return;
     }
 
+    Map<DatanodeDetails, String> badDnsWithReasons = new LinkedHashMap<>();
+
+    for (DatanodeDetails dn : pipelineDns) {
+      try {
+        NodeStatus status = nodeManager.getNodeStatus(dn);
+        if (!status.equals(NodeStatus.inServiceHealthy())) {
+          String reason = String.format("Health: %s, Operational State: %s",
+              status.getHealth(), status.getOperationalState());
+          badDnsWithReasons.put(dn, reason);
+        }
+      } catch (NodeNotFoundException e) {
+        badDnsWithReasons.put(dn, "DN not registered with SCM");
+      }
+    }
+
+    if (!badDnsWithReasons.isEmpty()) {
+      LOG.warn("Below DNs reported by Pipeline: {} are either in bad health or 
un-registered with SCMs",
+          pipeline.getId());
+      for (Map.Entry<DatanodeDetails, String> entry : 
badDnsWithReasons.entrySet()) {
+        LOG.warn("DN {}: {}", entry.getKey().getID(), entry.getValue());
+      }
+      return;
+    }

Review Comment:
   I’m just afraid that other log entries may be interleaved between the 
header warning and the per-DN LOG.warn lines, which could reduce the 
readability of the log.
   cc @nandakumar131, would you like to take a look?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to