This is an automated email from the ASF dual-hosted git repository. CRZbulabula pushed a commit to branch fix/npe-stale-fragment-instance-context in repository https://gitbox.apache.org/repos/asf/iotdb.git
commit 5805534097103ebe890348c4701919515ccf40ea
Author: Yongzao <[email protected]>
AuthorDate: Mon May 25 11:56:11 2026 +0800

    Fix NPE when retried query reuses stale FragmentInstanceContext

    When QueryExecution.retry() re-plans a query, doDistributionPlan() creates
    fresh PlanFragmentId objects with nextFragmentInstanceId reset to 0. Because
    the queryId is unchanged, retry generates fragment instance IDs identical to
    the first execution (e.g. queryId_11.0).

    FragmentInstanceManager.instanceContext retains completed contexts for
    5 minutes for statistics caching. When a retry dispatches the same FI ID,
    instanceContext.computeIfAbsent() returns the stale old context whose
    releaseResource() has already been called, setting dataRegion to null.
    New drivers then hit an NPE at dataRegion.tryReadLock() inside
    FragmentInstanceContext.initQueryDataSource().

    Fix: replace computeIfAbsent() with compute() in
    execDataQueryFragmentInstance() so that a released context
    (dataRegion == null) is atomically replaced with a fresh one carrying the
    new dataRegion reference.

    Defensive fix: add a null guard for dataRegion in
    getSharedQueryDataSource() that returns null (treated by DataDriver as an
    aborted FI) instead of propagating NPE.
--- .../fragment/FragmentInstanceContext.java | 4 +++ .../fragment/FragmentInstanceManager.java | 30 ++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java index 2a0373cf6fd..ee177dde02e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java @@ -781,6 +781,10 @@ public class FragmentInstanceContext extends QueryContext { public synchronized IQueryDataSource getSharedQueryDataSource() throws QueryProcessException { if (sharedQueryDataSource == null) { + if (dataRegion == null) { + // Context was released (releaseResource() already ran). Signal aborted to the driver. 
+ return null; + } switch (queryDataSourceType) { case SERIES_SCAN: if (initQueryDataSource(sourcePaths)) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java index 37e5c6c0858..520d98c3f81 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java @@ -151,19 +151,27 @@ public class FragmentInstanceManager { DataNodeQueryContext dataNodeQueryContext = getOrCreateDataNodeQueryContext(instanceId.getQueryId(), dataNodeFINum); + // Use compute() instead of computeIfAbsent() to handle the retry scenario: + // QueryExecution.retry() re-creates the distribution plan with PlanFragmentId + // counters reset to 0, generating fragment instance IDs identical to the first + // execution. instanceContext retains released contexts (dataRegion == null) for + // statistics caching. Without this check, a retried FI reuses the stale context + // and NPEs at dataRegion.tryReadLock(). FragmentInstanceContext context = - instanceContext.computeIfAbsent( + instanceContext.compute( instanceId, - fragmentInstanceId -> - createFragmentInstanceContext( - fragmentInstanceId, - stateMachine, - instance.getSessionInfo(), - dataRegion, - instance.getGlobalTimePredicate(), - dataNodeQueryContextMap, - instance.isDebug(), - instance.isVerbose())); + (fiId, existingContext) -> + (existingContext == null || existingContext.getDataRegion() == null) + ? 
createFragmentInstanceContext( + fiId, + stateMachine, + instance.getSessionInfo(), + dataRegion, + instance.getGlobalTimePredicate(), + dataNodeQueryContextMap, + instance.isDebug(), + instance.isVerbose()) + : existingContext); context.setHighestPriority(instance.isHighestPriority()); try {
