This is an automated email from the ASF dual-hosted git repository.

gyfora pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/flink-kubernetes-operator.git
The following commit(s) were added to refs/heads/main by this push: new af00c99d [FLINK-29475] Add error checker for the operator in e2e tests af00c99d is described below commit af00c99defbe49c84dbd8a3ac4341136ca3efac9 Author: Gabor Somogyi <gabor_somog...@apple.com> AuthorDate: Mon Nov 21 10:15:15 2022 +0100 [FLINK-29475] Add error checker for the operator in e2e tests --- e2e-tests/test_application_kubernetes_ha.sh | 2 ++ e2e-tests/test_application_operations.sh | 2 ++ e2e-tests/test_multi_sessionjob.sh | 2 ++ e2e-tests/test_sessionjob_kubernetes_ha.sh | 2 ++ e2e-tests/test_sessionjob_operations.sh | 2 ++ e2e-tests/utils.sh | 29 +++++++++++++++++++++++++++++ 6 files changed, 39 insertions(+) diff --git a/e2e-tests/test_application_kubernetes_ha.sh b/e2e-tests/test_application_kubernetes_ha.sh index 1797b29a..eda15bc6 100755 --- a/e2e-tests/test_application_kubernetes_ha.sh +++ b/e2e-tests/test_application_kubernetes_ha.sh @@ -47,5 +47,7 @@ wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || e wait_for_status flinkdep/flink-example-statemachine '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1 wait_for_status flinkdep/flink-example-statemachine '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 +check_operator_log_for_errors || exit 1 + echo "Successfully run the Flink Kubernetes application HA test" diff --git a/e2e-tests/test_application_operations.sh b/e2e-tests/test_application_operations.sh index 457972c8..f6d1ace3 100755 --- a/e2e-tests/test_application_operations.sh +++ b/e2e-tests/test_application_operations.sh @@ -67,4 +67,6 @@ wait_for_status flinkdep/flink-example-statemachine '.status.jobManagerDeploymen wait_for_status flinkdep/flink-example-statemachine '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 assert_available_slots 1 $CLUSTER_ID +check_operator_log_for_errors || exit 1 + echo "Successfully run the last-state upgrade test" diff --git a/e2e-tests/test_multi_sessionjob.sh 
b/e2e-tests/test_multi_sessionjob.sh index 59990870..09862db5 100755 --- a/e2e-tests/test_multi_sessionjob.sh +++ b/e2e-tests/test_multi_sessionjob.sh @@ -38,6 +38,7 @@ jm_pod_name=$(get_jm_pod_name $CLUSTER_ID) wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 +check_operator_log_for_errors || exit 1 echo "Flink Session Job is running properly" # Current namespace: flink @@ -48,4 +49,5 @@ jm_pod_name=$(get_jm_pod_name $CLUSTER_ID) wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 +check_operator_log_for_errors || exit 1 echo "Flink Session Job is running properly" diff --git a/e2e-tests/test_sessionjob_kubernetes_ha.sh b/e2e-tests/test_sessionjob_kubernetes_ha.sh index 0ad55b12..7a0fa813 100755 --- a/e2e-tests/test_sessionjob_kubernetes_ha.sh +++ b/e2e-tests/test_sessionjob_kubernetes_ha.sh @@ -48,5 +48,7 @@ wait_for_logs $jm_pod_name "Completed checkpoint [0-9]+ for job" ${TIMEOUT} || e wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 +check_operator_log_for_errors || exit 1 + echo "Successfully run the Flink Session Job HA test" diff --git a/e2e-tests/test_sessionjob_operations.sh b/e2e-tests/test_sessionjob_operations.sh index b1c88fc2..c230af8c 100755 --- a/e2e-tests/test_sessionjob_operations.sh +++ b/e2e-tests/test_sessionjob_operations.sh @@ -79,3 +79,5 @@ wait_for_jobmanager_running $CLUSTER_ID $TIMEOUT wait_for_logs $jm_pod_name 
"Completed checkpoint [0-9]+ for job" ${TIMEOUT} || exit 1 wait_for_status $SESSION_CLUSTER_IDENTIFIER '.status.jobManagerDeploymentStatus' READY ${TIMEOUT} || exit 1 wait_for_status $SESSION_JOB_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 + +check_operator_log_for_errors || exit 1 diff --git a/e2e-tests/utils.sh b/e2e-tests/utils.sh index b8df6a42..447f038c 100755 --- a/e2e-tests/utils.sh +++ b/e2e-tests/utils.sh @@ -83,6 +83,11 @@ function wait_for_jobmanager_running() { wait_for_logs $jm_pod_name "Rest endpoint listening at" ${TIMEOUT} || exit 1 } +function get_operator_pod_name() { + operator_pod_name=$(kubectl get pods --selector="app.kubernetes.io/name=flink-kubernetes-operator" -o jsonpath='{..metadata.name}') + echo "${operator_pod_name}" +} + function get_jm_pod_name() { CLUSTER_ID=$1 jm_pod_name=$(kubectl get pods --selector="app=${CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') @@ -108,12 +113,36 @@ function retry_times() { return 1 } +function check_operator_log_for_errors { + echo "Checking for operator log errors..." + operator_pod_name=$(get_operator_pod_name) + errors=$(kubectl logs "${operator_pod_name}" \ + | grep -v "Exception while listing jobs" `#https://issues.apache.org/jira/browse/FLINK-30146` \ + | grep -v "Failed to submit a listener notification task" `#https://issues.apache.org/jira/browse/FLINK-30147` \ + | grep -v "Failed to submit job to session cluster" `#https://issues.apache.org/jira/browse/FLINK-30148` \ + | grep -v "Error during event processing" `#https://issues.apache.org/jira/browse/FLINK-30149` \ + | grep -v "REST service in session cluster is bad now" `#https://issues.apache.org/jira/browse/FLINK-30150` \ + | grep -v "AuditUtils" `#https://issues.apache.org/jira/browse/FLINK-30151` \ + | grep -i "error" || true) + if [ -z "${errors}" ]; then + echo "No errors in log files." 
+ return 0 + else + echo -e "Found error in log files.\n\n${errors}" + return 1 + fi +} + function debug_and_show_logs { echo "Debugging failed e2e test:" echo "Currently existing Kubernetes resources" kubectl get all kubectl describe all + echo "Operator logs:" + operator_pod_name=$(get_operator_pod_name) + kubectl logs "${operator_pod_name}" + echo "Flink logs:" kubectl get pods -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | while read pod;do containers=(`kubectl get pods $pod -o jsonpath='{.spec.containers[*].name}'`)