This is an automated email from the ASF dual-hosted git repository.

lhotari pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 497aa4e53b Count the connection failure as the condition of quarantine 
(#4727)
497aa4e53b is described below

commit 497aa4e53b9b10dad3c1e8800bcf417c9ef8014a
Author: Yong Zhang <[email protected]>
AuthorDate: Thu Mar 19 17:09:49 2026 +0800

    Count the connection failure as the condition of quarantine (#4727)
    
    * Count the connection failure as the condition of quarantine
    ---
    
    ### Motivation
    
    Currently, the BookieClient quarantine mechanism is triggered primarily 
by read and write error responses from Bookies. However, in multi-region 
deployments, a common failure mode is a network partition or DNS resolution 
failure at the region level.
    
    In such scenarios:
    
    A Bookie remains registered in ZooKeeper (it can still heartbeat to its 
local ZK observer).
    
    The Client (Broker) cannot resolve the Bookie's IP or establish a TCP 
connection.
    
    The EnsemblePlacementPolicy (especially RegionAwareEnsemblePlacementPolicy) 
sees the Bookie as "Available" and repeatedly selects it to satisfy minRack or 
E/Qw constraints.
    
    The LedgerHandle fails to write because it cannot initialize a connection 
handle, triggering an Ensemble Change.
    
    Because the connection failure didn't trigger a quarantine, the placement 
policy picks the same problematic Bookie again in the next iteration.
    
    This creates an infinite Ensemble Change loop, causing the Ledger write to 
hang indefinitely and bloating the Ledger metadata in ZooKeeper with thousands 
of segments.
    
    * Add configuration to control the behavior
---
 .../bookkeeper/conf/ClientConfiguration.java       | 23 ++++++++++++++++++++++
 .../bookkeeper/proto/PerChannelBookieClient.java   |  3 +++
 2 files changed, 26 insertions(+)

diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
index dde3f7e8f6..95cb43f06e 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
@@ -140,6 +140,7 @@ public class ClientConfiguration extends 
AbstractConfiguration<ClientConfigurati
     protected static final String BOOKIE_ERROR_THRESHOLD_PER_INTERVAL = 
"bookieErrorThresholdPerInterval";
     protected static final String BOOKIE_QUARANTINE_TIME_SECONDS = 
"bookieQuarantineTimeSeconds";
     protected static final String BOOKIE_QUARANTINE_RATIO = 
"bookieQuarantineRatio";
+    protected static final String BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED = 
"bookieConnectionErrorQuarantineEnabled";
 
     // Bookie info poll interval
     protected static final String DISK_WEIGHT_BASED_PLACEMENT_ENABLED = 
"diskWeightBasedPlacementEnabled";
@@ -1456,6 +1457,28 @@ public class ClientConfiguration extends 
AbstractConfiguration<ClientConfigurati
         return this;
     }
 
+
+    /**
+     * Set whether bookie connection errors count toward the quarantine 
condition. If enabled, each connection
+     * error is counted against BookieErrorThresholdPerInterval, so choose the 
quarantine time carefully.
+     *
+     * @param enabled true to count connection errors toward quarantine
+     * @return this configuration instance
+     */
+    public ClientConfiguration 
setBookieConnectionErrorQuarantineEnabled(boolean enabled) {
+        setProperty(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, enabled);
+        return this;
+    }
+
+    /**
+     * Get whether bookie connection errors count toward quarantine.
+     *
+     * @return true if connection errors are counted; defaults to false
+     */
+    public boolean getBookieConnectionErrorQuarantineEnabled() {
+        return getBoolean(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, false);
+    }
+
     /**
      * Get the time for which a bookie will be quarantined.
      *
diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
index 655e206bbc..892306797e 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
@@ -1818,6 +1818,9 @@ public class PerChannelBookieClient extends 
ChannelInboundHandlerAdapter {
                     if (state != ConnectionState.CLOSED) {
                         state = ConnectionState.DISCONNECTED;
                     }
+                    if (conf.getBookieConnectionErrorQuarantineEnabled()) {
+                        recordError();
+                    }
                     failedConnectionCounter.inc();
                 }
 

Reply via email to