szetszwo commented on code in PR #10073: URL: https://github.com/apache/ozone/pull/10073#discussion_r3074519282
########## hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/PendingContainerTracker.java: ########## @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.node; + +import com.google.common.annotations.VisibleForTesting; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.DatanodeID; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.StorageReportProto; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; +import org.apache.hadoop.ozone.container.common.volume.VolumeUsage; +import org.apache.hadoop.util.Time; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Tracks per-datanode pending container allocations at SCM using a Two Window Tumbling Bucket + * pattern (similar to HDFS HADOOP-3707). + * + * Two Window Tumbling Bucket for automatic aging and cleanup. 
+ * + * How It Works: + * <li>Each DataNode has two sets: <b>currentWindow</b> and <b>previousWindow</b></li> + * <li>New allocations go into <b>currentWindow</b></li> + * <li>Every <b>ROLL_INTERVAL</b> (default 5 minutes): + * <ul> + * <li>previousWindow = currentWindow (shift)</li> + * <li>currentWindow = new empty set (reset)</li> + * <li>Old previousWindow is discarded (automatic aging)</li> + * </ul> + * </li> + * <li>When checking pending: return <b>union</b> of currentWindow + previousWindow</li> + * + * + * Example Timeline: + * <pre> + * Time | Action | CurrentWindow | PreviousWindow | Total Pending + * ------+---------------------------+---------------+----------------+-------------- + * 00:00 | Allocate Container-1 | {C1} | {} | {C1} + * 00:02 | Allocate Container-2 | {C1, C2} | {} | {C1, C2} + * 00:05 | [ROLL] Window tumbles | {} | {C1, C2} | {C1, C2} + * 00:07 | Allocate Container-3 | {C3} | {C1, C2} | {C1, C2, C3} + * 00:08 | Report confirms C1 | {C3} | {C2} | {C2, C3} + * 00:10 | [ROLL] Window tumbles | {} | {C3} | {C3} + * | (C2 aged out if not reported) + * </pre> + * + */ +public class PendingContainerTracker { + + private static final Logger LOG = LoggerFactory.getLogger(PendingContainerTracker.class); + + /** + * Roll interval in milliseconds. + * Configurable via hdds.scm.container.pending.allocation.roll.interval. + * Default: 5 minutes. + * Containers automatically age out after 2 × rollIntervalMs. + */ + private final long rollIntervalMs; + + /** + * Map of DataNode ID to TwoWindowBucket. + */ + private final ConcurrentHashMap<DatanodeID, TwoWindowBucket> datanodeBuckets; + + /** + * Maximum container size in bytes. + */ + private final long maxContainerSize; + + /** + * Metrics for tracking pending containers (same instance as {@link SCMNodeManager}'s node metrics). + */ + private final SCMNodeMetrics metrics; + + /** + * Two-window bucket for a single DataNode. + * Contains current and previous window sets, plus last roll timestamp. 
+ */ + private static class TwoWindowBucket { + private Set<ContainerID> currentWindow = new HashSet<>(); + private Set<ContainerID> previousWindow = new HashSet<>(); + private long lastRollTime = Time.monotonicNow(); + private final long rollIntervalMs; + + TwoWindowBucket(long rollIntervalMs) { + this.rollIntervalMs = rollIntervalMs; + } + + /** + * Roll one or both windows based on elapsed time. + */ + synchronized void rollIfNeeded() { + long now = Time.monotonicNow(); + long elapsed = now - lastRollTime; + + if (elapsed >= 2 * rollIntervalMs) { + previousWindow.clear(); + currentWindow.clear(); + lastRollTime = now; + } else if (elapsed >= rollIntervalMs) { + previousWindow = currentWindow; + currentWindow = new HashSet<>(); + lastRollTime = now; + LOG.debug("Rolled window. Previous window size: {}, Current window reset to empty", previousWindow.size()); + } + } + + /** + * Get union of both windows (all pending containers). + */ + synchronized Set<ContainerID> getAllPending() { + Set<ContainerID> all = new HashSet<>(); + all.addAll(currentWindow); + all.addAll(previousWindow); + return all; + } + + /** + * Add container to current window. + */ + synchronized boolean add(ContainerID containerID) { + return currentWindow.add(containerID); + } + + /** + * Remove container from both windows. + */ + synchronized boolean remove(ContainerID containerID) { + boolean removedFromCurrent = currentWindow.remove(containerID); + boolean removedFromPrevious = previousWindow.remove(containerID); + return removedFromCurrent || removedFromPrevious; + } + + /** + * Check if either window is non-empty. + */ + synchronized boolean isEmpty() { + return currentWindow.isEmpty() && previousWindow.isEmpty(); + } + + /** + * Count of pending containers in both windows. 
+ */ + synchronized int getCount() { + return currentWindow.size() + previousWindow.size(); + } + } + + public PendingContainerTracker(long maxContainerSize) { + this(maxContainerSize, 10 * 60 * 1000, null); // Default 10 minutes + } + + public PendingContainerTracker(long maxContainerSize, long rollIntervalMs, + SCMNodeMetrics metrics) { + this.datanodeBuckets = new ConcurrentHashMap<>(); + this.maxContainerSize = maxContainerSize; + this.rollIntervalMs = rollIntervalMs; + this.metrics = metrics; + LOG.info("PendingContainerTracker initialized with maxContainerSize={}B, rollInterval={}ms", + maxContainerSize, rollIntervalMs); + } + + /** + * Advances the two-window tumbling bucket for this datanode when the roll interval has elapsed. + * Call on periodic paths (e.g. node report) so windows age even when there are no new + * allocations or container reports touching this tracker. + */ + public void rollWindowsIfNeeded(DatanodeDetails node) { + if (node == null) { + return; + } + DatanodeID dnID = node.getID(); + datanodeBuckets.computeIfPresent(dnID, (k, bucket) -> { + synchronized (bucket) { + bucket.rollIfNeeded(); + // Remove bucket if empty after roll + return bucket.isEmpty() ? null : bucket; + } + }); + } + + /** + * Whether the datanode can fit another container of {@code containerSize} after accounting for + * SCM pending allocations for {@code node} (this tracker) and usable space on {@code datanodeInfo}. + * Combines {@link #getPendingAllocationSize} with the per-disk slot check in one call. 
+ * + * @param node identity used to look up pending allocations (same DN as {@code datanodeInfo}) + * @param datanodeInfo storage reports for the datanode + * @param containerSize required container size in bytes (typically SCM max container size) + */ + public boolean hasEffectiveAllocatableSpaceForNewContainer( + DatanodeDetails node, DatanodeInfo datanodeInfo, long containerSize) { + if (node == null || datanodeInfo == null || containerSize <= 0) { + return false; + } + long pendingBytes = getPendingAllocationSize(node); + return hasAllocatableSpaceAfterPending(datanodeInfo, containerSize, pendingBytes); + } + + private boolean hasAllocatableSpaceAfterPending( + DatanodeInfo datanodeInfo, long containerSize, long pendingAllocationBytes) { + List<StorageReportProto> storageReports = datanodeInfo.getStorageReports(); + if (storageReports == null || storageReports.isEmpty()) { + return false; + } + long effectiveAllocatableSpace = 0L; + for (StorageReportProto report : storageReports) { + long usableSpace = VolumeUsage.getUsableSpace(report); + long containersOnThisDisk = usableSpace / containerSize; + effectiveAllocatableSpace += containersOnThisDisk * containerSize; + if (effectiveAllocatableSpace - pendingAllocationBytes >= containerSize) { + return true; + } + } + if (metrics != null) { + metrics.incNumSkippedFullNodeContainerAllocation(); + } + return false; + } + + /** + * Drops all pending allocation state for a datanode (e.g. stale/dead cleanup). + */ + public void clearPendingForDatanode(DatanodeDetails node) { + if (node == null) { + return; + } + DatanodeID dnID = node.getID(); + datanodeBuckets.remove(dnID); + LOG.debug("Cleared pending container allocations for datanode {}", dnID); + } + + /** + * Record a pending container allocation for all DataNodes in the pipeline. + * Container is added to the current window. 
+ * + * @param pipeline The pipeline where container is allocated + * @param containerID The container being allocated + */ + public void recordPendingAllocation(Pipeline pipeline, ContainerID containerID) { + if (pipeline == null || containerID == null) { + LOG.warn("Ignoring null pipeline or containerID"); + return; + } + + for (DatanodeDetails node : pipeline.getNodes()) { + recordPendingAllocationForDatanode(node, containerID); + } + } + + /** + * Record a pending container allocation for a single DataNode. + * Container is added to the current window. + * + * @param node The DataNode where container is being allocated/replicated + * @param containerID The container being allocated/replicated + */ + public void recordPendingAllocationForDatanode(DatanodeDetails node, ContainerID containerID) { + if (node == null || containerID == null) { + LOG.warn("Ignoring null node or containerID"); + return; + } + + DatanodeID dnID = node.getID(); + AtomicBoolean added = addContainerToBucket(containerID, dnID); + + if (added.get() && metrics != null) { + metrics.incNumPendingContainersAdded(); + } + } + + private AtomicBoolean addContainerToBucket(ContainerID containerID, DatanodeID dnID) { + AtomicBoolean added = new AtomicBoolean(false); + + datanodeBuckets.compute(dnID, (k, existing) -> { + TwoWindowBucket bucket = (existing != null) ? existing : new TwoWindowBucket(rollIntervalMs); + synchronized (bucket) { + bucket.rollIfNeeded(); + added.set(bucket.add(containerID)); + LOG.debug("Recorded pending container {} on DataNode {}. Added={}, Total pending={}", + containerID, dnID, added.get(), bucket.getCount()); + return bucket; + } + }); + return added; + } + + /** + * Remove a pending container allocation from a specific DataNode. + * Removes from both current and previous windows. + * Called when container is confirmed. 
+ * + * @param node The DataNode + * @param containerID The container to remove from pending + */ + public void removePendingAllocation(DatanodeDetails node, ContainerID containerID) { + if (node == null || containerID == null) { + return; + } + + DatanodeID dnID = node.getID(); + AtomicBoolean removed = removeContainerFromBucket(containerID, dnID); + + if (removed.get() && metrics != null) { + metrics.incNumPendingContainersRemoved(); + } + } + + private AtomicBoolean removeContainerFromBucket(ContainerID containerID, DatanodeID dnID) { + AtomicBoolean removed = new AtomicBoolean(false); + + datanodeBuckets.computeIfPresent(dnID, (k, bucket) -> { + synchronized (bucket) { + bucket.rollIfNeeded(); + removed.set(bucket.remove(containerID)); + LOG.debug("Removed pending container {} from DataNode {}. Removed={}, Remaining={}", + containerID, dnID, removed.get(), bucket.getCount()); + return bucket.isEmpty() ? null : bucket; + } + }); + return removed; + } + + /** + * Bytes of SCM-side pending container allocations for this datanode (count × configured max + * container size). For whether a new container can be placed, prefer + * {@link #hasEffectiveAllocatableSpaceForNewContainer}. + * <p>Note: this call may advance the internal tumbling window if the roll interval has elapsed, + * ensuring the returned value reflects the most up-to-date pending state.</p> + * + * @param node The DataNode + * @return Total bytes of pending container allocations + */ + public long getPendingAllocationSize(DatanodeDetails node) { + if (node == null) { + return 0; + } + + TwoWindowBucket bucket = datanodeBuckets.get(node.getID()); + if (bucket == null) { + return 0; + } + + synchronized (bucket) { + bucket.rollIfNeeded(); + return (long) bucket.getCount() * maxContainerSize; + } + } + + /** + * Get the set of pending container IDs for a DataNode. + * Returns union of current and previous windows. + * Useful for debugging and monitoring. 
+ * <p>Note: this call may advance the internal tumbling window if the roll interval has elapsed, + * ensuring the returned set reflects the most up-to-date pending state.</p> + * + * @param node The DataNode + * @return Set of pending container IDs + */ + public Set<ContainerID> getPendingContainers(DatanodeDetails node) { Review Comment: Remove this method, since it is very easy to misuse. E.g. the test calls it just for the size. Why copy the whole set just to get the size? ```java //TestPendingContainerTracker tracker.getPendingContainers(dn1).size() ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
