Hi team,

I am running a simple Flink 1.12.1 job in the IDE with
TolerableCheckpointFailureNumber set, and I intentionally throw an exception
in the source function's snapshotState to verify how Flink behaves. What I
find is that the first checkpoint throws the exception and eventually times
out, while the main flow continues to work. This is expected; however, all
subsequent checkpoints no longer reach the exception at all and are simply
reported as timed out once the checkpoint timeout elapses. I want to know
whether it is expected behavior that no later checkpoint can finish once a
single checkpoint has thrown an exception.

Below is the code to reproduce the behavior:
main

// Obtain the execution environment (a local one when started from the IDE).
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// Checkpointing: trigger with a 20_000 interval in AT_LEAST_ONCE mode, use a
// 3_000 checkpoint timeout, and tolerate up to 1000 failed checkpoints so the
// job keeps running while snapshotState throws.
// NOTE(review): interval and timeout are presumably milliseconds — confirm
// against the CheckpointConfig documentation.
env.enableCheckpointing(20_000, CheckpointingMode.AT_LEAST_ONCE);
env.getCheckpointConfig().setTolerableCheckpointFailureNumber(1000);
env.getCheckpointConfig().setCheckpointTimeout(3_000);

// Checkpoint data is written below file:///tmp/chpk.
env.setStateBackend(new FsStateBackend("file:///tmp/chpk", true));

// Single-parallelism pipeline: failing source -> stdout.
env.addSource(new FromElementsFunctionT()).setParallelism(1)
        .print().setParallelism(1);

env.execute("Demo");


Source function

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.smartnews.dp.kafka.sample.flink;

import java.util.ArrayList;
import java.util.List;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeutils.base.IntSerializer;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Preconditions;

/**
 * A stream source function that returns a sequence of elements.
 *
 * <p>Upon construction, this source function serializes the elements
using Flink's type
 * information. That way, any object transport using Java
serialization will not be affected by the
 * serializability of the elements.
 *
 * <p><b>NOTE:</b> This source has a parallelism of 1.
 *
 */
@PublicEvolving
public class FromElementsFunctionT implements SourceFunction<Integer>,
CheckpointedFunction {

    private static final long serialVersionUID = 1L;

    /** The number of elements emitted already. */
    private volatile int numElementsEmitted;

    /** Flag to make the source cancelable. */
    private volatile boolean isRunning = true;

    private transient ListState<Integer> checkpointedState;

    @Override
    public void initializeState(FunctionInitializationContext context)
throws Exception {
        Preconditions.checkState(
                this.checkpointedState == null,
                "The " + getClass().getSimpleName() + " has already
been initialized.");

        this.checkpointedState =
                context.getOperatorStateStore()
                        .getListState(
                                new ListStateDescriptor<>(
                                        "from-elements-state",
IntSerializer.INSTANCE));

        if (context.isRestored()) {
            List<Integer> retrievedStates = new ArrayList<>();
            for (Integer entry : this.checkpointedState.get()) {
                retrievedStates.add(entry);
            }

            // given that the parallelism of the function is 1, we can
only have 1 state
            Preconditions.checkArgument(
                    retrievedStates.size() == 1,
                    getClass().getSimpleName() + " retrieved invalid state.");

            this.numElementsEmitted = retrievedStates.get(0);
        }
    }

    @Override
    public void run(SourceContext<Integer> ctx) throws Exception {
        final Object lock = ctx.getCheckpointLock();

        while (isRunning && numElementsEmitted < Integer.MAX_VALUE) {
            Thread.sleep(1000);
            synchronized (lock) {
                ctx.collect(numElementsEmitted++);
            }
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }

    // ------------------------------------------------------------------------
    //  Checkpointing
    // ------------------------------------------------------------------------

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws
Exception {
        Preconditions.checkState(
                this.checkpointedState != null,
                "The " + getClass().getSimpleName() + " has not been
properly initialized.");

        this.checkpointedState.clear();
        this.checkpointedState.add(this.numElementsEmitted);
        throw new NullPointerException("npe");
    }
}



-- 
Regards,
Tao

Reply via email to