This is an automated email from the ASF dual-hosted git repository. yong pushed a commit to branch branch-4.17 in repository https://gitbox.apache.org/repos/asf/bookkeeper.git
commit accaa3dfcab9d50528adbed9222a9f3348b1d5e4 Author: Yong Zhang <[email protected]> AuthorDate: Thu Mar 19 17:09:49 2026 +0800 Count the connection failure as the condition of quarantine (#4727) * Count the connection failure as the condition of quarantine --- ### Motivation Currently, the BookieClient quarantine mechanism primarily triggers based on read and write error responses from Bookies. However, in multi-region deployments, a common failure mode is the Network Partition or DNS Resolution Failure at the Region level. In such scenarios: A Bookie remains registered in ZooKeeper (it can still heartbeat to its local ZK observer). The Client (Broker) cannot resolve the Bookie's IP or establish a TCP connection. The EnsemblePlacementPolicy (especially RegionAwareEnsemblePlacementPolicy) sees the Bookie as "Available" and repeatedly selects it to satisfy minRack or E/Qw constraints. The LedgerHandle fails to write because it cannot initialize a connection handle, triggering an Ensemble Change. Because the connection failure didn't trigger a quarantine, the placement policy picks the same problematic Bookie again in the next iteration. This creates an infinite Ensemble Change loop, causing the Ledger write to hang indefinitely and bloating the Ledger metadata in ZooKeeper with thousands of segments. 
* Add configuration to control the behavior (cherry picked from commit 497aa4e53b9b10dad3c1e8800bcf417c9ef8014a) --- .../bookkeeper/conf/ClientConfiguration.java | 23 ++++++++++++++++++++++ .../bookkeeper/proto/PerChannelBookieClient.java | 3 +++ 2 files changed, 26 insertions(+) diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java index f3dc75f553..099f20da4f 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java @@ -140,6 +140,7 @@ public class ClientConfiguration extends AbstractConfiguration<ClientConfigurati protected static final String BOOKIE_ERROR_THRESHOLD_PER_INTERVAL = "bookieErrorThresholdPerInterval"; protected static final String BOOKIE_QUARANTINE_TIME_SECONDS = "bookieQuarantineTimeSeconds"; protected static final String BOOKIE_QUARANTINE_RATIO = "bookieQuarantineRatio"; + protected static final String BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED = "bookieConnectionErrorQuarantineEnabled"; // Bookie info poll interval protected static final String DISK_WEIGHT_BASED_PLACEMENT_ENABLED = "diskWeightBasedPlacementEnabled"; @@ -1479,6 +1480,28 @@ public class ClientConfiguration extends AbstractConfiguration<ClientConfigurati return this; } + + /** + * Set whether bookie connection errors count toward the quarantine condition. If this is enabled, connection + * errors are counted toward the BookieErrorThresholdPerInterval, so choose the quarantine time with care. + * + * @param enabled true to count bookie connection errors toward the quarantine condition + * @return this client configuration + */ + public ClientConfiguration setBookieConnectionErrorQuarantineEnabled(boolean enabled) { + setProperty(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, enabled); + return this; + } + + /** + * Get whether bookie connection errors count toward the quarantine condition.
+ * + * @return true if connection errors are counted; false (the default) when the property is not set + */ + public boolean getBookieConnectionErrorQuarantineEnabled() { + return getBoolean(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, false); + } + /** * Get the time for which a bookie will be quarantined. * diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java index a8c95ca5ec..39e3311e1f 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java @@ -1818,6 +1818,9 @@ public class PerChannelBookieClient extends ChannelInboundHandlerAdapter { if (state != ConnectionState.CLOSED) { state = ConnectionState.DISCONNECTED; } + if (conf.getBookieConnectionErrorQuarantineEnabled()) { + recordError(); + } failedConnectionCounter.inc(); }
