ableegoldman commented on a change in pull request #10407: URL: https://github.com/apache/kafka/pull/10407#discussion_r602940025
########## File path: streams/src/test/java/org/apache/kafka/streams/processor/internals/TaskManagerTest.java ########## @@ -816,6 +817,106 @@ public void shouldCleanAndReviveCorruptedStandbyTasksBeforeCommittingNonCorrupte verify(consumer); } + @Test + public void shouldCloseAndReviveUncorruptedTasksWhenTimeoutExceptionThrownFromCommit() { + final ProcessorStateManager stateManager = EasyMock.createStrictMock(ProcessorStateManager.class); + stateManager.markChangelogAsCorrupted(taskId00Partitions); + replay(stateManager); + + final StateMachineTask corruptedActive = new StateMachineTask(taskId00, taskId00Partitions, true, stateManager); + final StateMachineTask unCorruptedActive = new StateMachineTask(taskId01, taskId01Partitions, true, stateManager) { + @Override + public void markChangelogAsCorrupted(final Collection<TopicPartition> partitions) { + fail("Should not try to mark changelogs as corrupted for uncorrupted task"); + } + + @Override + public void maybeInitTaskTimeoutOrThrow(final long currentWallClockMs, + final Exception cause) { + throw new TimeoutException(); + } + }; + final Map<TopicPartition, OffsetAndMetadata> offsets = singletonMap(t1p1, new OffsetAndMetadata(0L, null)); + unCorruptedActive.setCommittableOffsetsAndMetadata(offsets); + + // handleAssignment + final Map<TaskId, Set<TopicPartition>> assignment = new HashMap<>(); + assignment.putAll(taskId00Assignment); + assignment.putAll(taskId01Assignment); + expect(activeTaskCreator.createTasks(anyObject(), eq(assignment))).andStubReturn(asList(corruptedActive, unCorruptedActive)); + topologyBuilder.addSubscribedTopicsFromAssignment(anyObject(), anyString()); + expectLastCall().anyTimes(); + + expectRestoreToBeCompleted(consumer, changeLogReader); + + consumer.commitSync(offsets); + expectLastCall().andThrow(new TimeoutException()); + + expect(consumer.assignment()).andStubReturn(union(HashSet::new, taskId00Partitions, taskId01Partitions)); + + replay(activeTaskCreator, standbyTaskCreator, 
topologyBuilder, consumer, changeLogReader); + + taskManager.handleAssignment(assignment, emptyMap()); + assertThat(taskManager.tryToCompleteRestoration(time.milliseconds(), null), is(true)); + + // make sure this will be committed and throw + assertThat(unCorruptedActive.state(), is(Task.State.RUNNING)); + assertThat(corruptedActive.state(), is(Task.State.RUNNING)); + + unCorruptedActive.setCommitNeeded(); + + corruptedActive.setChangelogOffsets(singletonMap(t1p0, 0L)); + taskManager.handleCorruption(singleton(taskId00)); + + assertThat(corruptedActive.commitPrepared, is(true)); Review comment: Side note: it seems weird that StateMachineTask has its own `commitNeeded` field rather than making the one in StreamTask `protected` and using that. Looks like we use `commitNeeded` in kind of a risky way in the tests, e.g. to indirectly indicate that it was closed clean, or to infer that we successfully committed, etc. It would be cleaner/safer to not reuse this variable to mean so many different things and just introduce a `closedClean`, `commitSuccessful`, etc. wherever needed... But I don't want to mess with it in this PR, so I'll just file a ticket to clean this up later, if that makes sense. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org