This is an automated email from the ASF dual-hosted git repository. dwysakowicz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/flink.git
The following commit(s) were added to refs/heads/master by this push: new 413a771 [FLINK-10368][e2e] Hardened kerberized yarn e2e test 413a771 is described below commit 413a77157caf25dbbfb8b0caaf2c9e12c7374d98 Author: Dawid Wysakowicz <wysakowicz.da...@gmail.com> AuthorDate: Mon Nov 5 11:07:22 2018 +0100 [FLINK-10368][e2e] Hardened kerberized yarn e2e test * wait for whole bootstrapping script to execute on master node before submitting job * retrying to start hadoop cluster. Failing test in case could not start hadoop cluster. * added check that all containers are up and running before submitting job * reduced memory requirements for the kerberized yarn test --- .../docker-hadoop-secure-cluster/bootstrap.sh | 1 + .../config/yarn-site.xml | 9 +- .../test-scripts/test_yarn_kerberos_docker.sh | 139 ++++++++++++++------- 3 files changed, 100 insertions(+), 49 deletions(-) diff --git a/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/bootstrap.sh b/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/bootstrap.sh index 7b5e50b..5b98b96 100755 --- a/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/bootstrap.sh +++ b/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/bootstrap.sh @@ -124,6 +124,7 @@ elif [ "$1" == "master" ]; then hdfs dfs -chown hadoop-user:hadoop-user /user/hadoop-user kdestroy + echo "Finished master initialization" while true; do sleep 1000; done elif [ "$1" == "worker" ]; then diff --git a/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/config/yarn-site.xml b/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/config/yarn-site.xml index 9b17acc..c7736a6 100644 --- a/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/config/yarn-site.xml +++ b/flink-end-to-end-tests/test-scripts/docker-hadoop-secure-cluster/config/yarn-site.xml @@ -21,6 +21,11 @@ under the License. 
<value>mapreduce_shuffle</value> </property> + <property> + <name>yarn.nodemanager.vmem-pmem-ratio</name> + <value>3</value> + </property> + <!-- this is ignored by the default scheduler but we have to set it because Flink would complain if we didn't have it --> <property> @@ -33,12 +38,12 @@ under the License. are scheduled on one NM, which wouldn't provoke a previously fixed Kerberos keytab bug. --> <property> <name>yarn.nodemanager.resource.memory-mb</name> - <value>4100</value> + <value>2500</value> </property> <property> <name>yarn.scheduler.minimum-allocation-mb</name> - <value>2000</value> + <value>1000</value> </property> <property> diff --git a/flink-end-to-end-tests/test-scripts/test_yarn_kerberos_docker.sh b/flink-end-to-end-tests/test-scripts/test_yarn_kerberos_docker.sh index c9ef15d..5f2dea2 100755 --- a/flink-end-to-end-tests/test-scripts/test_yarn_kerberos_docker.sh +++ b/flink-end-to-end-tests/test-scripts/test_yarn_kerberos_docker.sh @@ -24,7 +24,8 @@ FLINK_TARBALL_DIR=$TEST_DATA_DIR FLINK_TARBALL=flink.tar.gz FLINK_DIRNAME=$(basename $FLINK_DIR) -MAX_RETRY_SECONDS=800 +MAX_RETRY_SECONDS=120 +CLUSTER_SETUP_RETRIES=3 echo "Flink Tarball directory $FLINK_TARBALL_DIR" echo "Flink tarball filename $FLINK_TARBALL" @@ -33,20 +34,6 @@ echo "End-to-end directory $END_TO_END_DIR" docker --version docker-compose --version -mkdir -p $FLINK_TARBALL_DIR -tar czf $FLINK_TARBALL_DIR/$FLINK_TARBALL -C $(dirname $FLINK_DIR) . - -echo "Building Hadoop Docker container" -until docker build --build-arg HADOOP_VERSION=2.8.4 -f $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/Dockerfile -t flink/docker-hadoop-secure-cluster:latest $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/; do - # with all the downloading and ubuntu updating a lot of flakiness can happen, make sure - # we don't immediately fail - echo "Something went wrong while building the Docker image, retrying ..." 
- sleep 2 -done - -echo "Starting Hadoop cluster" -docker-compose -f $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/docker-compose.yml up -d - # make sure we stop our cluster at the end function cluster_shutdown { # don't call ourselves again for another signal interruption @@ -60,12 +47,71 @@ function cluster_shutdown { trap cluster_shutdown INT trap cluster_shutdown EXIT -until docker cp $FLINK_TARBALL_DIR/$FLINK_TARBALL master:/home/hadoop-user/; do - # we're retrying this one because we don't know yet if the container is ready - echo "Uploading Flink tarball to docker master failed, retrying ..." - sleep 5 +function start_hadoop_cluster() { + echo "Starting Hadoop cluster" + docker-compose -f $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/docker-compose.yml up -d + + # wait for kerberos to be set up + start_time=$(date +%s) + until docker logs master 2>&1 | grep -q "Finished master initialization"; do + current_time=$(date +%s) + time_diff=$((current_time - start_time)) + + if [ $time_diff -ge $MAX_RETRY_SECONDS ]; then + return 1 + else + echo "Waiting for hadoop cluster to come up. We have been trying for $time_diff seconds, retrying ..." + sleep 10 + fi + done + + # perform health checks + if ! { [ $(docker inspect -f '{{.State.Running}}' master 2>&1) = 'true' ] && + [ $(docker inspect -f '{{.State.Running}}' slave1 2>&1) = 'true' ] && + [ $(docker inspect -f '{{.State.Running}}' slave2 2>&1) = 'true' ] && + [ $(docker inspect -f '{{.State.Running}}' kdc 2>&1) = 'true' ]; }; + then + return 1 + fi + + return 0 +} + +mkdir -p $FLINK_TARBALL_DIR +tar czf $FLINK_TARBALL_DIR/$FLINK_TARBALL -C $(dirname $FLINK_DIR) . 
+ +echo "Building Hadoop Docker container" +until docker build --build-arg HADOOP_VERSION=2.8.4 \ + -f $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/Dockerfile \ + -t flink/docker-hadoop-secure-cluster:latest \ + $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/; +do + # with all the downloading and ubuntu updating a lot of flakiness can happen, make sure + # we don't immediately fail + echo "Something went wrong while building the Docker image, retrying ..." + sleep 2 +done + +CLUSTER_STARTED=1 +for (( i = 0; i < $CLUSTER_SETUP_RETRIES; i++ )) +do + if start_hadoop_cluster; then + echo "Cluster started successfully." + CLUSTER_STARTED=0 + break #continue test, cluster set up succeeded + fi + + echo "ERROR: Could not start hadoop cluster. Retrying..." + docker-compose -f $END_TO_END_DIR/test-scripts/docker-hadoop-secure-cluster/docker-compose.yml down done +if [[ ${CLUSTER_STARTED} -ne 0 ]]; then + echo "ERROR: Could not start hadoop cluster. Aborting..." + exit 1 +fi + +docker cp $FLINK_TARBALL_DIR/$FLINK_TARBALL master:/home/hadoop-user/ + # now, at least the container is ready docker exec -it master bash -c "tar xzf /home/hadoop-user/$FLINK_TARBALL --directory /home/hadoop-user/" @@ -73,6 +119,7 @@ docker exec -it master bash -c "tar xzf /home/hadoop-user/$FLINK_TARBALL --direc docker exec -it master bash -c "echo \"security.kerberos.login.keytab: /home/hadoop-user/hadoop-user.keytab\" > /home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" docker exec -it master bash -c "echo \"security.kerberos.login.principal: hadoop-user\" >> /home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" docker exec -it master bash -c "echo \"slot.request.timeout: 60000\" >> /home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" +docker exec -it master bash -c "echo \"containerized.heap-cutoff-min: 100\" >> /home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" echo "Flink config:" docker exec -it master bash -c "cat 
/home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" @@ -84,33 +131,28 @@ OUTPUT_PATH=hdfs:///user/hadoop-user/wc-out-$RANDOM start_time=$(date +%s) # it's important to run this with higher parallelism, otherwise we might risk that # JM and TM are on the same YARN node and that we therefore don't test the keytab shipping -until docker exec -it master bash -c "export HADOOP_CLASSPATH=\`hadoop classpath\` && /home/hadoop-user/$FLINK_DIRNAME/bin/flink run -m yarn-cluster -yn 3 -ys 1 -ytm 2000 -yjm 2000 -p 3 /home/hadoop-user/$FLINK_DIRNAME/examples/streaming/WordCount.jar --output $OUTPUT_PATH"; do - current_time=$(date +%s) - time_diff=$((current_time - start_time)) - - if [ $time_diff -ge $MAX_RETRY_SECONDS ]; then - echo "We tried running the job for $time_diff seconds, max is $MAX_RETRY_SECONDS seconds, aborting" - mkdir -p $TEST_DATA_DIR/logs - echo "Hadoop logs:" - docker cp master:/var/log/hadoop/* $TEST_DATA_DIR/logs/ - for f in $TEST_DATA_DIR/logs/*; do - echo "$f:" - cat $f - done - echo "Docker logs:" - docker logs master - exit 1 - else - echo "Running the Flink job failed, might be that the cluster is not ready yet. We have been trying for $time_diff seconds, retrying ..." 
- sleep 5 - fi -done - -docker exec -it master bash -c "kinit -kt /home/hadoop-user/hadoop-user.keytab hadoop-user" -docker exec -it master bash -c "hdfs dfs -ls $OUTPUT_PATH" -OUTPUT=$(docker exec -it master bash -c "hdfs dfs -cat $OUTPUT_PATH/*") -docker exec -it master bash -c "kdestroy" -echo "$OUTPUT" +if docker exec -it master bash -c "export HADOOP_CLASSPATH=\`hadoop classpath\` && \ + /home/hadoop-user/$FLINK_DIRNAME/bin/flink run -m yarn-cluster -yn 3 -ys 1 -ytm 1000 -yjm 1000 \ + -p 3 /home/hadoop-user/$FLINK_DIRNAME/examples/streaming/WordCount.jar --output $OUTPUT_PATH"; +then + docker exec -it master bash -c "kinit -kt /home/hadoop-user/hadoop-user.keytab hadoop-user" + docker exec -it master bash -c "hdfs dfs -ls $OUTPUT_PATH" + OUTPUT=$(docker exec -it master bash -c "hdfs dfs -cat $OUTPUT_PATH/*") + docker exec -it master bash -c "kdestroy" + echo "$OUTPUT" +else + echo "Running the job failed." + mkdir -p $TEST_DATA_DIR/logs + echo "Hadoop logs:" + docker cp master:/var/log/hadoop/* $TEST_DATA_DIR/logs/ + for f in $TEST_DATA_DIR/logs/*; do + echo "$f:" + cat $f + done + echo "Docker logs:" + docker logs master + exit 1 +fi if [[ ! 
"$OUTPUT" =~ "consummation,1" ]]; then echo "Output does not contain (consummation, 1) as required" @@ -139,7 +181,10 @@ fi echo "Running Job without configured keytab, the exception you see below is expected" docker exec -it master bash -c "echo \"\" > /home/hadoop-user/$FLINK_DIRNAME/conf/flink-conf.yaml" # verify that it doesn't work if we don't configure a keytab -OUTPUT=$(docker exec -it master bash -c "export HADOOP_CLASSPATH=\`hadoop classpath\` && /home/hadoop-user/$FLINK_DIRNAME/bin/flink run -m yarn-cluster -yn 3 -ys 1 -ytm 1200 -yjm 800 -p 3 /home/hadoop-user/$FLINK_DIRNAME/examples/streaming/WordCount.jar --output $OUTPUT_PATH") +OUTPUT=$(docker exec -it master bash -c "export HADOOP_CLASSPATH=\`hadoop classpath\` && \ + /home/hadoop-user/$FLINK_DIRNAME/bin/flink run \ + -m yarn-cluster -yn 3 -ys 1 -ytm 1000 -yjm 1000 -p 3 \ + /home/hadoop-user/$FLINK_DIRNAME/examples/streaming/WordCount.jar --output $OUTPUT_PATH") echo "$OUTPUT" if [[ ! "$OUTPUT" =~ "Hadoop security with Kerberos is enabled but the login user does not have Kerberos credentials" ]]; then