This is an automated email from the ASF dual-hosted git repository.
sammichen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 515869dccc6 HDDS-14702. Make lazy source container replica deletion
interval configurable (#9837)
515869dccc6 is described below
commit 515869dccc6ebe8ac2eb625bdefc043cdb17816f
Author: Gargi Jaiswal <[email protected]>
AuthorDate: Fri Mar 6 12:20:56 2026 +0530
HDDS-14702. Make lazy source container replica deletion interval
configurable (#9837)
---
.../diskbalancer/DiskBalancerConfiguration.java | 18 ++++++++++++++++
.../diskbalancer/DiskBalancerService.java | 12 ++++++-----
.../diskbalancer/TestDiskBalancerTask.java | 4 ++--
hadoop-hdds/docs/content/design/diskbalancer.md | 12 +++++++++++
hadoop-hdds/docs/content/feature/DiskBalancer.md | 25 +++++++++++-----------
.../docs/content/feature/DiskBalancer.zh.md | 25 +++++++++++-----------
6 files changed, 65 insertions(+), 31 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java
index ba777aa2171..447691c0a5b 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java
@@ -116,6 +116,15 @@ public final class DiskBalancerConfiguration {
description = "If true, the DiskBalancer will automatically stop once
disks are balanced.")
private boolean stopAfterDiskEven = true;
+ @Config(key = "hdds.datanode.disk.balancer.replica.deletion.delay",
+ defaultValue = "5m",
+ type = ConfigType.TIME,
+ tags = { DATANODE, ConfigTag.DISKBALANCER },
+ description = "The delay after a container is successfully moved from
source volume to " +
+ "destination volume before the source container replica is deleted.
" +
+ "Unit could be defined with postfix (ns,ms,s,m,h,d).")
+ private long replicaDeletionDelay = Duration.ofMinutes(5).toMillis();
+
public DiskBalancerConfiguration(Double threshold,
Long bandwidthInMB,
Integer parallelThread,
@@ -181,6 +190,15 @@ public void setStopAfterDiskEven(boolean
stopAfterDiskEven) {
this.stopAfterDiskEven = stopAfterDiskEven;
}
+ /**
+ * Gets the replica deletion delay in milliseconds.
+ *
+ * @return delay in milliseconds before source replica is deleted after move
+ */
+ public long getReplicaDeletionDelay() {
+ return replicaDeletionDelay;
+ }
+
/**
* Gets the threshold value for DiskBalancer.
*
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
index aaa14321011..9503c2e3c1f 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
@@ -91,7 +91,7 @@ public class DiskBalancerService extends BackgroundService {
LoggerFactory.getLogger(DiskBalancerService.class);
public static final String DISK_BALANCER_DIR = "diskBalancer";
- private static long replicaDeletionDelayMills = 60 * 60 * 1000L; // 60
minutes
+ private long replicaDeletionDelay;
private OzoneContainer ozoneContainer;
private final ConfigurationSource conf;
@@ -162,6 +162,8 @@ public DiskBalancerService(OzoneContainer ozoneContainer,
throw new IOException(e);
}
+ replicaDeletionDelay = conf.getObject(DiskBalancerConfiguration.class)
+ .getReplicaDeletionDelay();
metrics = DiskBalancerServiceMetrics.create();
loadDiskBalancerInfo();
@@ -617,7 +619,7 @@ public BackgroundTaskResult call() {
}
if (moveSucceeded) {
// Add current old container to pendingDeletionContainers.
- pendingDeletionContainers.put(System.currentTimeMillis() +
replicaDeletionDelayMills, container);
+ pendingDeletionContainers.put(System.currentTimeMillis() +
replicaDeletionDelay, container);
ContainerLogger.logMoveSuccess(containerId, sourceVolume,
destVolume, containerSize, Time.monotonicNow() - startTime);
}
@@ -657,7 +659,7 @@ private void deleteContainer(Container container) {
container.delete();
container.getContainerData().getVolume().decrementUsedSpace(containerData.getBytesUsed());
LOG.info("Deleted expired container {} after delay {} ms.",
- containerData.getContainerID(), replicaDeletionDelayMills);
+ containerData.getContainerID(), replicaDeletionDelay);
} catch (IOException ex) {
LOG.warn("Failed to delete old container {} after it's marked as
DELETED. " +
"It will be handled by background scanners.",
container.getContainerData().getContainerID(), ex);
@@ -824,7 +826,7 @@ public static void setInjector(FaultInjector instance) {
}
@VisibleForTesting
- public static void setReplicaDeletionDelayMills(long durationMills) {
- replicaDeletionDelayMills = durationMills;
+ public void setReplicaDeletionDelay(long durationMills) {
+ this.replicaDeletionDelay = durationMills;
}
}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
index af9efdf7504..cd53404a670 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
@@ -246,7 +246,7 @@ public void setup() throws Exception {
conf.setFromObject(diskBalancerConfiguration);
diskBalancerService = new DiskBalancerServiceTestImpl(ozoneContainer,
100, conf, 1);
- DiskBalancerService.setReplicaDeletionDelayMills(0);
+ diskBalancerService.setReplicaDeletionDelay(0);
KeyValueContainer.setInjector(kvFaultInjector);
}
@@ -592,7 +592,7 @@ public void
testOldReplicaDelayedDeletion(ContainerTestVersionInfo versionInfo)
throws IOException, InterruptedException {
setLayoutAndSchemaForTest(versionInfo);
long delay = 2000L; // 2 second delay
- DiskBalancerService.setReplicaDeletionDelayMills(delay);
+ diskBalancerService.setReplicaDeletionDelay(delay);
Container container = createContainer(CONTAINER_ID, sourceVolume,
State.CLOSED);
KeyValueContainerData keyValueContainerData = (KeyValueContainerData)
container.getContainerData();
diff --git a/hadoop-hdds/docs/content/design/diskbalancer.md
b/hadoop-hdds/docs/content/design/diskbalancer.md
index f546b5253d7..aab8af2ff34 100644
--- a/hadoop-hdds/docs/content/design/diskbalancer.md
+++ b/hadoop-hdds/docs/content/design/diskbalancer.md
@@ -103,6 +103,18 @@ D1 ----> C1-CLOSED --- (5) ---> C1-DELETED
|
D2 ----> Temp C1-CLOSED --- (2) ---> Temp C1-RECOVERING --- (3) --->
C1-RECOVERING --- (4) ---> C1-CLOSED
```
+
+### Lazy Deletion of Source Container Replica
+
+The source container on D1 is **not** deleted immediately after the move
completes. Instead, it is scheduled for deletion after a configurable delay
using config `hdds.datanode.disk.balancer.replica.deletion.delay`, **default: 5
minutes**.
+
+**Rationale:** When a container has only one replica and that replica has an
in-flight read operation, the read thread may still hold a reference to the old
container at the source path.
+If the DiskBalancer deletes the old container immediately after the move, the
in-flight read would fail because the container data is now at the new path.
The lazy deletion provides a
+grace period for in-flight reads to complete before the old container is
removed, avoiding immediate read failures.
+
+**Note:** Because of this lazy deletion, the disk utilization of the source
volume will not decrease immediately after a container move or after the
DiskBalancer is stopped. The freed space
+and balanced state will be visible only after the configured delay, when the
source container replicas are actually deleted.
+
## DiskBalancing Policies
By default, the DiskBalancer uses specific policies to decide which disks to
balance and which containers to move. These
diff --git a/hadoop-hdds/docs/content/feature/DiskBalancer.md
b/hadoop-hdds/docs/content/feature/DiskBalancer.md
index d8e7b501ae3..317d3f02372 100644
--- a/hadoop-hdds/docs/content/feature/DiskBalancer.md
+++ b/hadoop-hdds/docs/content/feature/DiskBalancer.md
@@ -238,16 +238,17 @@ ozone admin datanode diskbalancer report
--in-service-datanodes --json
The DiskBalancer's behavior can be controlled using the following
configuration properties in `ozone-site.xml`.
-| Property | Default Value
|
Description
|
-|-------------------------------------------------------------|----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `hdds.datanode.disk.balancer.enabled` | `false`
| If
false, the DiskBalancer service on the Datanode is disabled. Configure it to
true for diskBalancer to be enabled.
|
-| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0`
| A
percentage (0-100). A datanode is considered balanced if for each volume, its
utilization differs from the average datanode utilization by no more than this
threshold. |
-| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10`
| The
maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid
impacting client I/O.
|
-| `hdds.datanode.disk.balancer.parallel.thread` | `5`
| The
number of worker threads to use for moving containers in parallel.
|
-| `hdds.datanode.disk.balancer.service.interval` | `60s`
| The
time interval at which the Datanode DiskBalancer service checks for imbalance
and updates its configuration.
|
-| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true`
| If
true, the DiskBalancer will automatically stop its balancing activity once
disks are considered balanced (i.e., all volume densities are within the
threshold). |
-| `hdds.datanode.disk.balancer.volume.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy`
| The policy class for selecting source and destination volumes for
balancing.
|
-| `hdds.datanode.disk.balancer.container.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy`
| The policy class for selecting which containers to move from a source volume
to destination volume.
|
-| `hdds.datanode.disk.balancer.service.timeout` | `300s`
|
Timeout for the Datanode DiskBalancer service operations.
|
-| `hdds.datanode.disk.balancer.should.run.default` | `false`
| If
the balancer fails to read its persisted configuration, this value determines
if the service should run by default.
|
+| Property | Default Value
|
Description
|
+|-------------------------------------------------------------|----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `hdds.datanode.disk.balancer.enabled` | `false`
| If
false, the DiskBalancer service on the Datanode is disabled. Configure it to
true for diskBalancer to be enabled.
|
+| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0`
| A
percentage (0-100). A datanode is considered balanced if for each volume, its
utilization differs from the average datanode utilization by no more than this
threshold.
|
+| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10`
| The
maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid
impacting client I/O.
|
+| `hdds.datanode.disk.balancer.parallel.thread` | `5`
| The
number of worker threads to use for moving containers in parallel.
|
+| `hdds.datanode.disk.balancer.service.interval` | `60s`
| The
time interval at which the Datanode DiskBalancer service checks for imbalance
and updates its configuration.
|
+| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true`
| If
true, the DiskBalancer will automatically stop its balancing activity once
disks are considered balanced (i.e., all volume densities are within the
threshold).
|
+| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m`
| The
delay after a container is successfully moved from source volume to destination
volume before the source container replica is deleted. This lazy deletion
provides a grace period for in-flight reads still holding the old container
replica to complete. Unit: ns, ms, s, m, h, d. |
+| `hdds.datanode.disk.balancer.volume.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy`
| The policy class for selecting source and destination volumes for
balancing.
|
+| `hdds.datanode.disk.balancer.container.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy`
| The policy class for selecting which containers to move from a source volume
to destination volume.
|
+| `hdds.datanode.disk.balancer.service.timeout` | `300s`
|
Timeout for the Datanode DiskBalancer service operations.
|
+| `hdds.datanode.disk.balancer.should.run.default` | `false`
| If
the balancer fails to read its persisted configuration, this value determines
if the service should run by default.
|
diff --git a/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md
b/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md
index 65ba7ca3fa1..e892ee47e7c 100644
--- a/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md
+++ b/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md
@@ -230,16 +230,17 @@ ozone admin datanode diskbalancer report
--in-service-datanodes --json
The DiskBalancer's behavior can be controlled using the following
configuration properties in `ozone-site.xml`.
-| Property | Default Value
| Description
|
-|-------------------------------------------------------------|----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `hdds.datanode.disk.balancer.enabled` | `false`
| 如果为 false,则 Datanode 上的 DiskBalancer 服务将被禁用。将其配置为
true 可启用 DiskBalancer。 |
|
|
[...]
-| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0`
|
百分比(0-100)。如果对于每个卷,其利用率与平均数据节点利用率之差不超过此阈值,则认为数据节点处于平衡状态。 |
-| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10`
| 平衡器可用于移动数据的最大带宽(以 MB/s 为单位),以避免影响客户端 I/O。
|
-| `hdds.datanode.disk.balancer.parallel.thread` | `5`
| 用于并行移动容器的工作线程数。
|
-| `hdds.datanode.disk.balancer.service.interval` | `60s`
| Datanode DiskBalancer 服务检查不平衡并更新其配置的时间间隔。
|
-| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true`
| 如果为真,则一旦磁盘被视为平衡(即所有卷密度都在阈值内),DiskBalancer 将自动停止其平衡活动。
|
-| `hdds.datanode.disk.balancer.volume.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy`
| 用于选择平衡的源卷和目标卷的策略类。
|
-| `hdds.datanode.disk.balancer.container.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy`
| 用于选择将哪些容器从源卷移动到目标卷的策略类。
|
-| `hdds.datanode.disk.balancer.service.timeout` | `300s`
| Datanode DiskBalancer 服务操作超时。
|
-| `hdds.datanode.disk.balancer.should.run.default` | `false`
| 如果平衡器无法读取其持久配置,则该值决定服务是否应默认运行。
|
+| Property | Default Value
| Description
|
+|-------------------------------------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `hdds.datanode.disk.balancer.enabled` | `false`
| 如果为 false,则 Datanode 上的 DiskBalancer 服务将被禁用。将其配置为
true 可启用 DiskBalancer。 |
|
|
[...]
+| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0`
|
百分比(0-100)。如果对于每个卷,其利用率与平均数据节点利用率之差不超过此阈值,则认为数据节点处于平衡状态。 |
+| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10`
| 平衡器可用于移动数据的最大带宽(以 MB/s 为单位),以避免影响客户端 I/O。
|
+| `hdds.datanode.disk.balancer.parallel.thread` | `5`
| 用于并行移动容器的工作线程数。
|
+| `hdds.datanode.disk.balancer.service.interval` | `60s`
| Datanode DiskBalancer 服务检查不平衡并更新其配置的时间间隔。
|
+| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true`
| 如果为真,则一旦磁盘被视为平衡(即所有卷密度都在阈值内),DiskBalancer 将自动停止其平衡活动。
|
+| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m`
|
容器成功从源卷移动到目标卷后,源容器副本被删除前的延迟时间。这种延迟删除机制旨在避免旧副本的即时删除导致持有旧容器副本的线程数据读取失败。单位:ns、ms、s、m、h、d。|
+| `hdds.datanode.disk.balancer.volume.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy`
| 用于选择平衡的源卷和目标卷的策略类。
|
+| `hdds.datanode.disk.balancer.container.choosing.policy` |
`org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy`
| 用于选择将哪些容器从源卷移动到目标卷的策略类。
|
+| `hdds.datanode.disk.balancer.service.timeout` | `300s`
| Datanode DiskBalancer 服务操作超时。
|
+| `hdds.datanode.disk.balancer.should.run.default` | `false`
| 如果平衡器无法读取其持久配置,则该值决定服务是否应默认运行。
|
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]