Github user kl0u commented on a diff in the pull request: https://github.com/apache/flink/pull/2707#discussion_r86561357 --- Diff: flink-streaming-java/src/main/java/org/apache/flink/streaming/runtime/operators/GenericWriteAheadSink.java --- @@ -77,111 +84,144 @@ public GenericWriteAheadSink(CheckpointCommitter committer, TypeSerializer<IN> s public void open() throws Exception { super.open(); committer.setOperatorId(id); - committer.setOperatorSubtaskId(getRuntimeContext().getIndexOfThisSubtask()); committer.open(); - cleanState(); - checkpointStreamFactory = - getContainingTask().createCheckpointStreamFactory(this); + + checkpointStreamFactory = getContainingTask() + .createCheckpointStreamFactory(this); + + cleanRestoredHandles(); } public void close() throws Exception { committer.close(); } /** - * Saves a handle in the state. + * Called when a checkpoint barrier arrives. It closes any open streams to the backend + * and marks them as pending for committing to the external, third-party storage system. * - * @param checkpointId - * @throws IOException + * @param checkpointId the id of the latest received checkpoint. + * @throws IOException in case something went wrong when handling the stream to the backend. */ private void saveHandleInState(final long checkpointId, final long timestamp) throws Exception { + Preconditions.checkNotNull(this.pendingCheckpoints, "The operator has not been properly initialized."); + //only add handle if a new OperatorState was created since the last snapshot if (out != null) { + int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); StreamStateHandle handle = out.closeAndGetHandle(); - if (state.pendingHandles.containsKey(checkpointId)) { + + PendingCheckpoint pendingCheckpoint = new PendingCheckpoint( + checkpointId, subtaskIdx, timestamp, handle); + + if (pendingCheckpoints.contains(pendingCheckpoint)) { //we already have a checkpoint stored for that ID that may have been partially written, //so we discard this "alternate version" and use the stored checkpoint handle.discardState(); } else { - state.pendingHandles.put(checkpointId, new Tuple2<>(timestamp, handle)); + pendingCheckpoints.add(pendingCheckpoint); } out = null; } } @Override - public void snapshotState(FSDataOutputStream out, - long checkpointId, - long timestamp) throws Exception { + public void snapshotState(FSDataOutputStream out, long checkpointId, long timestamp) throws Exception { saveHandleInState(checkpointId, timestamp); - InstantiationUtil.serializeObject(out, state); + DataOutputViewStreamWrapper outStream = new DataOutputViewStreamWrapper(out); + outStream.writeInt(pendingCheckpoints.size()); + for (PendingCheckpoint pendingCheckpoint : pendingCheckpoints) { + pendingCheckpoint.serialize(outStream); + } } @Override public void restoreState(FSDataInputStream in) throws Exception { - this.state = InstantiationUtil.deserializeObject(in, getUserCodeClassloader()); + final DataInputViewStreamWrapper inStream = new DataInputViewStreamWrapper(in); + int numPendingHandles = inStream.readInt(); + for (int i = 0; i < numPendingHandles; i++) { + pendingCheckpoints.add(PendingCheckpoint.restore(inStream, getUserCodeClassloader())); + } } - private void cleanState() throws Exception { - synchronized (this.state.pendingHandles) { //remove all handles that were already committed - Set<Long> pastCheckpointIds = this.state.pendingHandles.keySet(); - Set<Long> checkpointsToRemove = new HashSet<>(); - for (Long pastCheckpointId : pastCheckpointIds) { - if (committer.isCheckpointCommitted(pastCheckpointId)) { - checkpointsToRemove.add(pastCheckpointId); + /** + * Called at {@link #open()} to clean-up the pending handle list. + * It iterates over all restored pending handles, checks which ones are already + * committed to the outside storage system and removes them from the list. + */ + private void cleanRestoredHandles() throws Exception { + synchronized (pendingCheckpoints) { + + // for each of the pending handles... + Iterator<PendingCheckpoint> pendingCheckpointIt = pendingCheckpoints.iterator(); + while (pendingCheckpointIt.hasNext()) { + + PendingCheckpoint pendingCheckpoint = pendingCheckpointIt.next(); + long checkpointId = pendingCheckpoint.checkpointId; + int subtaskId = pendingCheckpoint.subtaskId; + + //...check if the temporary buffer is already committed and if yes, + // remove it from the list of pending checkpoints. + if (committer.isCheckpointCommitted(subtaskId, checkpointId)) { + pendingCheckpoint.stateHandle.discardState(); + pendingCheckpointIt.remove(); } } - for (Long toRemove : checkpointsToRemove) { - this.state.pendingHandles.remove(toRemove); - } } } @Override public void notifyOfCompletedCheckpoint(long checkpointId) throws Exception { super.notifyOfCompletedCheckpoint(checkpointId); - synchronized (state.pendingHandles) { - Set<Long> pastCheckpointIds = state.pendingHandles.keySet(); - Set<Long> checkpointsToRemove = new HashSet<>(); - for (Long pastCheckpointId : pastCheckpointIds) { + synchronized (pendingCheckpoints) { + Iterator<PendingCheckpoint> pendingCheckpointIt = pendingCheckpoints.iterator(); + while (pendingCheckpointIt.hasNext()) { + + PendingCheckpoint pendingCheckpoint = pendingCheckpointIt.next(); + long pastCheckpointId = pendingCheckpoint.checkpointId; + int subtaskId = pendingCheckpoint.subtaskId; + long timestamp = pendingCheckpoint.timestamp; + StreamStateHandle streamHandle = pendingCheckpoint.stateHandle; + if (pastCheckpointId <= checkpointId) { try { - if (!committer.isCheckpointCommitted(pastCheckpointId)) { - Tuple2<Long, StreamStateHandle> handle = state.pendingHandles.get(pastCheckpointId); - try (FSDataInputStream in = handle.f1.openInputStream()) { + if (!committer.isCheckpointCommitted(subtaskId, pastCheckpointId)) { + try (FSDataInputStream in = streamHandle.openInputStream()) { boolean success = sendValues( new ReusingMutableToRegularIteratorWrapper<>( new InputViewIterator<>( new DataInputViewStreamWrapper( in), serializer), serializer), - handle.f0); - if (success) { //if the sending has failed we will retry on the next notify - committer.commitCheckpoint(pastCheckpointId); - checkpointsToRemove.add(pastCheckpointId); + timestamp); + if (success) { + + // in case the checkpoint was successfully committed, + // discard its state from the backend and mark it for removal + // in case it failed, we retry on the next checkpoint + committer.commitCheckpoint(subtaskId, pastCheckpointId); + streamHandle.discardState(); + pendingCheckpointIt.remove(); --- End diff -- No "The iterators returned by this class's iterator method are fail-fast: if the set is modified at any time after the iterator is created, in any way **except through the iterator's own remove method**" from the same javadocs.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---