Repository: kudu Updated Branches: refs/heads/master 3bb5f56cf -> d55df3c6e
build-support: option to retry all failed tests Currently, users can opt to retry flaky tests as reported by the user-specified test server. The test server's flaky test list may not accurately reflect what tests are flaky in all environments. In environments where there are flaky tests that are under-represented by the test server, it would still be nice to be resilient to flakies. As such, this patch adds an option to retry all failed tests. Here's a run of a non-flaky test into which I added a FATAL log. http://dist-test.cloudera.org/job?job_id=awong.1535433877.28172 Change-Id: I24aea0b9e7a1c2c66bc5feffcb454ff01cdca6fd Reviewed-on: http://gerrit.cloudera.org:8080/11342 Tested-by: Kudu Jenkins Reviewed-by: Grant Henke <granthe...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/5d69deb3 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/5d69deb3 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/5d69deb3 Branch: refs/heads/master Commit: 5d69deb36925113796cd69f51061b8396b0174fc Parents: 3bb5f56 Author: Andrew Wong <aw...@cloudera.com> Authored: Mon Aug 27 19:03:08 2018 -0700 Committer: Andrew Wong <aw...@cloudera.com> Committed: Tue Aug 28 15:39:46 2018 +0000 ---------------------------------------------------------------------- build-support/dist_test.py | 9 ++++++++- build-support/run-test.sh | 35 ++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/5d69deb3/build-support/dist_test.py ---------------------------------------------------------------------- diff --git a/build-support/dist_test.py b/build-support/dist_test.py index ad2bcf1..c1176fe 100755 --- a/build-support/dist_test.py +++ b/build-support/dist_test.py @@ -58,6 +58,11 @@ MAX_TASKS_PER_JOB=10000 # of retries, so we have to subtract 1. 
FLAKY_TEST_RETRIES = int(os.environ.get('KUDU_FLAKY_TEST_ATTEMPTS', 1)) - 1 +# Whether to retry all failed C++ tests, rather than just known flaky tests. +# Since Java flaky tests are not reported by the test server, Java tests are +# always retried, regardless of this value. +RETRY_ALL_TESTS = int(os.environ.get('KUDU_RETRY_ALL_FAILED_TESTS', 0)) + # Flags to include when running Gradle tasks GRADLE_FLAGS = os.environ.get('EXTRA_GRADLE_FLAGS', "") @@ -473,9 +478,11 @@ def run_tests(parser, options): create_archive_input(staging, execution, collect_tmpdir=options.collect_tmpdir) run_isolate(staging) + retry_all = RETRY_ALL_TESTS > 0 create_task_json(staging, flaky_test_set=get_flakies(), - replicate_tasks=options.num_instances) + replicate_tasks=options.num_instances, + retry_all_tests=retry_all) submit_tasks(staging, options) def add_run_subparser(subparsers): http://git-wip-us.apache.org/repos/asf/kudu/blob/5d69deb3/build-support/run-test.sh ---------------------------------------------------------------------- diff --git a/build-support/run-test.sh b/build-support/run-test.sh index 541d6f7..e8cc996 100755 --- a/build-support/run-test.sh +++ b/build-support/run-test.sh @@ -23,11 +23,12 @@ # If KUDU_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be # gzip-compressed while they are written. # -# If KUDU_FLAKY_TEST_ATTEMPTS is non-zero, and the test being run matches -# one of the lines in the file KUDU_FLAKY_TEST_LIST, then the test will -# be retried on failure up to the specified number of times. This can be -# used in the gerrit workflow to prevent annoying false -1s caused by -# tests that are known to be flaky in master. +# If KUDU_FLAKY_TEST_ATTEMPTS is non-zero, and either the test being run +# matches one of the lines in the file KUDU_FLAKY_TEST_LIST or +# KUDU_RETRY_ALL_FAILED_TESTS is non-zero, then the test will be retried on +# failure up to the specified number of times. 
This can be used in the gerrit +# workflow to prevent annoying false -1s caused by tests that are known to be +# flaky in master. # # If KUDU_REPORT_TEST_RESULTS is non-zero, then tests are reported to the # central test server. @@ -70,23 +71,27 @@ else TEST_NAME=${SHORT_TEST_NAME} fi -# Determine whether the test is a known flaky by comparing against the user-specified -# list. +# Determine whether the user has chosen to retry all failed tests, or whether +# the test is a known flaky by comparing against the user-specified list. TEST_EXECUTION_ATTEMPTS=1 -if [ -n "$KUDU_FLAKY_TEST_LIST" ]; then +if [ "$KUDU_RETRY_ALL_FAILED_TESTS" -gt 0 ]; then + echo "Will retry on failure" + TEST_IS_RETRYABLE=1 +elif [ -n "$KUDU_FLAKY_TEST_LIST" ]; then if [ -f "$KUDU_FLAKY_TEST_LIST" ]; then - IS_KNOWN_FLAKY=$(grep --count --line-regexp "$SHORT_TEST_NAME" "$KUDU_FLAKY_TEST_LIST") + TEST_IS_RETRYABLE=$(grep --count --line-regexp "$SHORT_TEST_NAME" "$KUDU_FLAKY_TEST_LIST") else echo "Flaky test list file $KUDU_FLAKY_TEST_LIST missing" - IS_KNOWN_FLAKY=0 - fi - if [ "$IS_KNOWN_FLAKY" -gt 0 ]; then - TEST_EXECUTION_ATTEMPTS=${KUDU_FLAKY_TEST_ATTEMPTS:-1} - echo $TEST_NAME is a known-flaky test. Will attempt running it - echo up to $TEST_EXECUTION_ATTEMPTS times. + TEST_IS_RETRYABLE=0 fi fi +if [ "$TEST_IS_RETRYABLE" -gt 0 ]; then + TEST_EXECUTION_ATTEMPTS=${KUDU_FLAKY_TEST_ATTEMPTS:-1} + echo $TEST_NAME is a retryable test. Will attempt running it + echo up to $TEST_EXECUTION_ATTEMPTS times. +fi + # We run each test in its own subdir to avoid core file related races. TEST_WORKDIR=$BUILD_ROOT/test-work/$TEST_NAME