HADOOP-11406. xargs -P is not portable (Kengo Seki via aw)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/6c65e8ab Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/6c65e8ab Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/6c65e8ab Branch: refs/heads/YARN-2928 Commit: 6c65e8ab54c51f234dc4b269cc6decd754c1f86b Parents: de39fed Author: Allen Wittenauer <a...@apache.org> Authored: Thu May 28 10:36:40 2015 -0700 Committer: Zhijie Shen <zjs...@apache.org> Committed: Tue Jun 2 16:12:54 2015 -0700 ---------------------------------------------------------------------- hadoop-common-project/hadoop-common/CHANGES.txt | 2 + .../src/main/bin/hadoop-functions.sh | 42 +++++++++++--------- .../main/conf/hadoop-user-functions.sh.example | 29 +++++++++++++- 3 files changed, 54 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/6c65e8ab/hadoop-common-project/hadoop-common/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index eb1db29..7da02ed 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -466,6 +466,8 @@ Trunk (Unreleased) HADOOP-9891. CLIMiniCluster instructions fail with MiniYarnCluster ClassNotFoundException (Darrell Taylor via aw) + HADOOP-11406. xargs -P is not portable (Kengo Seki via aw) + OPTIMIZATIONS HADOOP-7761. Improve the performance of raw comparisons. 
(todd) http://git-wip-us.apache.org/repos/asf/hadoop/blob/6c65e8ab/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh index 67e8870..5556f2f 100644 --- a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh +++ b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh @@ -461,27 +461,33 @@ function hadoop_connect_to_hosts if [[ -z "${SLAVE_NAMES}" ]]; then SLAVE_NAMES=$(sed 's/#.*$//;/^$/d' "${SLAVE_FILE}") fi - - # quoting here gets tricky. it's easier to push it into a function - # so that we don't have to deal with it. However... - # xargs can't use a function so instead we'll export it out - # and force it into a subshell - # moral of the story: just use pdsh. - export -f hadoop_actual_ssh - export HADOOP_SSH_OPTS - - # xargs is used with option -I to replace the placeholder in arguments - # list with each hostname read from stdin/pipe. But it consider one - # line as one argument while reading from stdin/pipe. So place each - # hostname in different lines while passing via pipe. - SLAVE_NAMES=$(echo "$SLAVE_NAMES" | tr ' ' '\n' ) - echo "${SLAVE_NAMES}" | \ - xargs -n 1 -P"${HADOOP_SSH_PARALLEL}" \ - -I {} bash -c -- "hadoop_actual_ssh {} ${params}" - wait + hadoop_connect_to_hosts_without_pdsh "${params}" fi } +## @description Connect to ${SLAVE_NAMES} and execute command +## @description under the environment which does not support pdsh. +## @audience private +## @stability evolving +## @replaceable yes +## @param command +## @param [...] 
+function hadoop_connect_to_hosts_without_pdsh +{ + # shellcheck disable=SC2124 + local params="$@" + local slaves=(${SLAVE_NAMES}) + for (( i = 0; i < ${#slaves[@]}; i++ )) + do + if (( i != 0 && i % HADOOP_SSH_PARALLEL == 0 )); then + wait + fi + # shellcheck disable=SC2086 + hadoop_actual_ssh "${slaves[$i]}" ${params} & + done + wait +} + ## @description Utility routine to handle --slaves mode ## @audience private ## @stability evolving http://git-wip-us.apache.org/repos/asf/hadoop/blob/6c65e8ab/hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example b/hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example index b2f78f8..3cf5776 100644 --- a/hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example +++ b/hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example @@ -50,7 +50,7 @@ # # -# Another example: finding java +# Example: finding java # # By default, Hadoop assumes that $JAVA_HOME is always defined # outside of its configuration. Eons ago, Apple standardized @@ -85,3 +85,30 @@ # exit 1 # fi #} + +# +# Example: efficient command execution for the slaves +# +# To improve performance, you can use xargs -P +# instead of the for loop, if supported. +# +#function hadoop_connect_to_hosts_without_pdsh +#{ +# # quoting here gets tricky. it's easier to push it into a function +# # so that we don't have to deal with it. However... +# # xargs can't use a function so instead we'll export it out +# # and force it into a subshell +# # moral of the story: just use pdsh. +# export -f hadoop_actual_ssh +# export HADOOP_SSH_OPTS +# +# # xargs is used with option -I to replace the placeholder in arguments +# # list with each hostname read from stdin/pipe. 
But it considers one +# # line as one argument while reading from stdin/pipe. So place each +# # hostname in different lines while passing via pipe. +# SLAVE_NAMES=$(echo "$SLAVE_NAMES" | tr ' ' '\n' ) +# echo "${SLAVE_NAMES}" | \ +# xargs -n 1 -P"${HADOOP_SSH_PARALLEL}" \ +# -I {} bash -c -- "hadoop_actual_ssh {} ${params}" +# wait +#}