This is an automated email from the ASF dual-hosted git repository.
angerszhuuuu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 0bf841006 [CELEBORN-663][REFACTOR] Refine RssInputStream logs to help
admin debug
0bf841006 is described below
commit 0bf841006aa7d97dd14cdc1ffba04f371a6b31fe
Author: Angerszhuuuu <[email protected]>
AuthorDate: Mon Jun 12 18:49:41 2023 +0800
[CELEBORN-663][REFACTOR] Refine RssInputStream logs to help admin debug
### What changes were proposed in this pull request?
Refine the RssInputStream warning logs and CelebornIOException messages to
include the partition location involved in the failure, to help administrators debug.
### Why are the changes needed?
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Closes #1576 from AngersZhuuuu/CELEBORN-663.
Authored-by: Angerszhuuuu <[email protected]>
Signed-off-by: Angerszhuuuu <[email protected]>
---
.../celeborn/client/read/RssInputStream.java | 25 ++++++++++++++++------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git
a/client/src/main/java/org/apache/celeborn/client/read/RssInputStream.java
b/client/src/main/java/org/apache/celeborn/client/read/RssInputStream.java
index 1e0b26b52..874445083 100644
--- a/client/src/main/java/org/apache/celeborn/client/read/RssInputStream.java
+++ b/client/src/main/java/org/apache/celeborn/client/read/RssInputStream.java
@@ -240,21 +240,23 @@ public abstract class RssInputStream extends InputStream {
}
location = location.getPeer();
logger.warn(
- "CreatePartitionReader failed {}/{} times, change to peer",
+ "CreatePartitionReader failed {}/{} times for location {},
change to peer",
fetchChunkRetryCnt,
fetchChunkMaxRetry,
+ location,
e);
} else {
logger.warn(
- "CreatePartitionReader failed {}/{} times, retry the same
location",
+ "CreatePartitionReader failed {}/{} times for location {},
retry the same location",
fetchChunkRetryCnt,
fetchChunkMaxRetry,
+ location,
e);
Uninterruptibles.sleepUninterruptibly(retryWaitMs,
TimeUnit.MILLISECONDS);
}
}
}
- throw new CelebornIOException("createPartitionReader failed!");
+ throw new CelebornIOException("createPartitionReader failed! " +
location);
}
private ByteBuf getNextChunk() throws IOException {
@@ -267,13 +269,18 @@ public abstract class RssInputStream extends InputStream {
if (fetchChunkRetryCnt == fetchChunkMaxRetry) {
logger.warn("Fetch chunk fail exceeds max retry {}",
fetchChunkRetryCnt, e);
throw new CelebornIOException(
- "Fetch chunk failed for " + fetchChunkRetryCnt + " times", e);
+ "Fetch chunk failed for "
+ + fetchChunkRetryCnt
+ + " times for location "
+ + currentReader.getLocation(),
+ e);
} else {
if (currentReader.getLocation().getPeer() != null) {
logger.warn(
- "Fetch chunk failed {}/{} times, change to peer",
+ "Fetch chunk failed {}/{} times for location {}, change to
peer",
fetchChunkRetryCnt,
fetchChunkMaxRetry,
+ currentReader.getLocation(),
e);
// fetchChunkRetryCnt % 2 == 0 means both replicas have been
tried,
// so sleep before next try.
@@ -283,14 +290,18 @@ public abstract class RssInputStream extends InputStream {
currentReader =
createReaderWithRetry(currentReader.getLocation().getPeer());
} else {
logger.warn(
- "Fetch chunk failed {}/{} times", fetchChunkRetryCnt,
fetchChunkMaxRetry, e);
+ "Fetch chunk failed {}/{} times for location {}",
+ fetchChunkRetryCnt,
+ fetchChunkMaxRetry,
+ currentReader.getLocation(),
+ e);
Uninterruptibles.sleepUninterruptibly(retryWaitMs,
TimeUnit.MILLISECONDS);
currentReader =
createReaderWithRetry(currentReader.getLocation());
}
}
}
}
- throw new CelebornIOException("Fetch chunk failed!");
+ throw new CelebornIOException("Fetch chunk failed! " +
currentReader.getLocation());
}
private PartitionReader createReader(