This is an automated email from the ASF dual-hosted git repository.
xianjingfeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 03f42c542 [#2525] improvement(server): add some metrics for LAB (#2553)
03f42c542 is described below
commit 03f42c542ba5245d9a70283c5adc0e99eb40d298
Author: xianjingfeng <[email protected]>
AuthorDate: Fri Jul 25 14:53:04 2025 +0800
[#2525] improvement(server): add some metrics for LAB (#2553)
### What changes were proposed in this pull request?
Add some metrics for LAB (local allocation buffer).
### Why are the changes needed?
These metrics can help us tune LAB-related parameters.
Fix: #2525
### Does this PR introduce any user-facing change?
No.
### How was this patch tested?
Verified in a production environment.
---
.../uniffle/server/ShuffleServerMetrics.java | 24 ++++++++++++++++++++--
.../server/buffer/ShuffleBufferWithLinkedList.java | 2 ++
.../server/buffer/ShuffleBufferWithSkipList.java | 2 ++
.../uniffle/server/buffer/lab/ChunkCreator.java | 20 ++++++++++++++----
.../org/apache/uniffle/server/buffer/lab/LAB.java | 3 +++
5 files changed, 45 insertions(+), 6 deletions(-)
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 9b6c548e1..f8689553c 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -149,7 +149,13 @@ public class ShuffleServerMetrics {
private static final String TOTAL_REMOVE_RESOURCE_TIME =
"total_remove_resource_time";
private static final String TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME =
"total_remove_resource_by_shuffle_ids_time";
-
+ private static final String LAB_CREATED_CHUNK_COUNT =
"lab_created_chunk_count";
+ private static final String LAB_REUSED_CHUNK_COUNT =
"lab_reused_chunk_count";
+ private static final String LAB_RECLAIMED_CHUNK_COUNT =
"lab_reclaimed_chunk_count";
+ private static final String LAB_CHUNK_POOL_REMAIN_PERCENT =
"lab_chunk_pool_remain_percent";
+ private static final String NOT_ON_LAB_BLOCK_COUNT =
"not_on_lab_block_count";
+ private static final String ON_LAB_BLOCK_COUNT = "on_lab_block_count";
+ private static final String BUFFER_BLOCK_SIZE = "buffer_block_size";
public static final String TOPN_OF_TOTAL_DATA_SIZE_FOR_APP =
"topN_of_total_data_size_for_app";
public static final String TOPN_OF_IN_MEMORY_DATA_SIZE_FOR_APP =
"topN_of_in_memory_data_size_for_app";
@@ -268,6 +274,13 @@ public class ShuffleServerMetrics {
public static Counter counterHadoopEventFlush;
public static Counter counterPreAllocatedBufferExpired;
public static Counter counterAppNotFound;
+ public static Counter counterLABChunkCreated;
+ public static Counter counterLABChunkReused;
+ public static Gauge gaugeLABChunkReclaimed;
+ public static Gauge gaugeLABChunkPoolRemainPercent;
+ public static Counter counterBlockNotOnLAB;
+ public static Counter counterBlockOnLAB;
+ public static Summary summaryBufferBlockSize;
private static MetricsManager metricsManager;
private static boolean isRegister = false;
@@ -504,7 +517,14 @@ public class ShuffleServerMetrics {
summaryTotalRemoveResourceTime =
metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME);
summaryTotalRemoveResourceByShuffleIdsTime =
metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME);
-
+ counterLABChunkCreated =
metricsManager.addCounter(LAB_CREATED_CHUNK_COUNT);
+ counterLABChunkReused = metricsManager.addCounter(LAB_REUSED_CHUNK_COUNT);
+ gaugeLABChunkReclaimed =
metricsManager.addGauge(LAB_RECLAIMED_CHUNK_COUNT);
+ gaugeLABChunkPoolRemainPercent =
metricsManager.addGauge(LAB_CHUNK_POOL_REMAIN_PERCENT);
+
+ counterBlockNotOnLAB = metricsManager.addCounter(NOT_ON_LAB_BLOCK_COUNT);
+ counterBlockOnLAB = metricsManager.addCounter(ON_LAB_BLOCK_COUNT);
+ summaryBufferBlockSize = metricsManager.addSummary(BUFFER_BLOCK_SIZE);
gaugeTotalDataSizeUsage =
Gauge.build()
.name(TOPN_OF_TOTAL_DATA_SIZE_FOR_APP)
diff --git
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
index 2216e2907..9d4a5a57f 100644
---
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
+++
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
@@ -37,6 +37,7 @@ import org.apache.uniffle.common.util.Constants;
import org.apache.uniffle.common.util.JavaUtils;
import org.apache.uniffle.server.ShuffleDataFlushEvent;
import org.apache.uniffle.server.ShuffleFlushManager;
+import org.apache.uniffle.server.ShuffleServerMetrics;
public class ShuffleBufferWithLinkedList extends AbstractShuffleBuffer {
// blocks will be added to inFlushBlockMap as <eventId, blocks> pair
@@ -59,6 +60,7 @@ public class ShuffleBufferWithLinkedList extends
AbstractShuffleBuffer {
long currentDataLength = 0;
for (ShufflePartitionedBlock block : data.getBlockList()) {
+
ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength());
// If sendShuffleData retried, we may receive duplicate block. The
duplicate
// block would gc without release. Here we must release the duplicated
block.
if (addBlock(block)) {
diff --git
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
index 33ba313ce..83f526a5c 100644
---
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
+++
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
@@ -39,6 +39,7 @@ import org.apache.uniffle.common.util.Constants;
import org.apache.uniffle.common.util.JavaUtils;
import org.apache.uniffle.server.ShuffleDataFlushEvent;
import org.apache.uniffle.server.ShuffleFlushManager;
+import org.apache.uniffle.server.ShuffleServerMetrics;
public class ShuffleBufferWithSkipList extends AbstractShuffleBuffer {
private ConcurrentSkipListMap<Long, ShufflePartitionedBlock> blocksMap;
@@ -66,6 +67,7 @@ public class ShuffleBufferWithSkipList extends
AbstractShuffleBuffer {
long currentDataLength = 0;
for (ShufflePartitionedBlock block : data.getBlockList()) {
+
ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength());
// If sendShuffleData retried, we may receive duplicate block. The
duplicate
// block would gc without release. Here we must release the duplicated
block.
if (!blocksMap.containsKey(block.getBlockId())) {
diff --git
a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
index 36aa2b115..f20d90521 100644
---
a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
+++
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
@@ -35,6 +35,8 @@ import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.uniffle.server.ShuffleServerMetrics;
+
/**
* Does the management of LAB chunk creations. A monotonically incrementing id
is associated with
* every chunk
@@ -102,10 +104,12 @@ public class ChunkCreator {
if (pool != null) {
chunk = pool.getChunk();
if (chunk == null) {
- LOG.warn(
- "The chunk pool is full. Reached maxCount= "
- + pool.getMaxCount()
- + ". Creating chunk outside of the pool.");
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(
+ "The chunk pool is full. Reached maxCount= "
+ + pool.getMaxCount()
+ + ". Creating chunk outside of the pool.");
+ }
}
}
@@ -134,6 +138,7 @@ public class ChunkCreator {
Preconditions.checkArgument(id > 0, "chunkId should be positive.");
chunk = new OffheapChunk(size, id, pool);
this.chunkIdMap.put(chunk.getId(), chunk);
+ ShuffleServerMetrics.counterLABChunkCreated.inc();
return chunk;
}
@@ -207,6 +212,10 @@ public class ChunkCreator {
if (chunk != null) {
chunk.reset();
reusedChunkCount.increment();
+
ShuffleServerMetrics.gaugeLABChunkReclaimed.set(reclaimedChunks.size());
+ ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set(
+ reclaimedChunks.size() * 100d / chunkCount.get());
+ ShuffleServerMetrics.counterLABChunkReused.inc();
} else {
// Make a chunk if we have not yet created the maxCount chunks
while (true) {
@@ -314,5 +323,8 @@ public class ChunkCreator {
LOG.warn("Chunk {} can not be found in chunkIdMap, ignore it",
chunkID);
}
}
+
ShuffleServerMetrics.gaugeLABChunkReclaimed.set(chunksPool.reclaimedChunks.size());
+ ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set(
+ chunksPool.reclaimedChunks.size() * 100d /
chunksPool.chunkCount.get());
}
}
diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
index 5d3b7000a..4da4ce4af 100644
--- a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
+++ b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
@@ -21,6 +21,7 @@ import java.util.LinkedList;
import java.util.List;
import org.apache.uniffle.common.ShufflePartitionedBlock;
+import org.apache.uniffle.server.ShuffleServerMetrics;
/**
* Local allocation buffer.
@@ -61,6 +62,7 @@ public class LAB {
public ShufflePartitionedBlock tryCopyBlockToChunk(ShufflePartitionedBlock
block) {
int size = block.getDataLength();
if (size > maxAlloc) {
+ ShuffleServerMetrics.counterBlockNotOnLAB.inc();
return block;
}
Chunk c;
@@ -77,6 +79,7 @@ public class LAB {
currChunk = null;
}
c.getData().writeBytes(block.getData());
+ ShuffleServerMetrics.counterBlockOnLAB.inc();
return new LABShufflePartitionedBlock(
block.getDataLength(),
block.getUncompressLength(),