Author: yhemanth
Date: Fri Jun 12 09:09:35 2009
New Revision: 784042
URL: http://svn.apache.org/viewvc?rev=784042&view=rev
Log:
HADOOP-5420. Fix LinuxTaskController to kill tasks using the process groups
they are launched with. Contributed by Sreekanth Ramakrishnan.
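For context on the mechanism: the fix relies on each task JVM being started as a process-group leader (via setsid), so that a single signal sent to the negative group id reaches the JVM and every sub-process it spawned. A minimal illustrative sketch of that idiom (not the committed code; the "sleep" target is just a stand-in for a task):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Illustrative sketch: launch a child in its own process group,
     * then signal the whole group at once. */
    int main(void) {
      pid_t child = fork();
      if (child == 0) {
        setsid();                  /* child becomes its own process-group leader */
        execlp("sleep", "sleep", "60", (char *) NULL);
        _exit(127);                /* exec failed */
      }
      sleep(1);
      /* A negative pid addresses the whole group, so sub-processes
       * forked by the child are signalled too. */
      if (kill(-child, SIGTERM) < 0) {
        perror("kill");
        return 1;
      }
      return 0;
    }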
Added:
hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcessesWithLinuxTaskController.java
Modified:
hadoop/core/trunk/CHANGES.txt
hadoop/core/trunk/src/c++/task-controller/configuration.c
hadoop/core/trunk/src/c++/task-controller/configuration.h.in
hadoop/core/trunk/src/c++/task-controller/main.c
hadoop/core/trunk/src/c++/task-controller/task-controller.c
hadoop/core/trunk/src/c++/task-controller/task-controller.h
hadoop/core/trunk/src/core/org/apache/hadoop/util/ProcessTree.java
hadoop/core/trunk/src/docs/src/documentation/content/xdocs/cluster_setup.xml
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/DefaultTaskController.java
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/JvmManager.java
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/LinuxTaskController.java
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/TaskController.java
hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcesses.java
Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Fri Jun 12 09:09:35 2009
@@ -806,6 +806,10 @@
HADOOP-5981. Fix a bug in HADOOP-2838 in parsing mapred.child.env.
(Amar Kamat via sharad)
+ HADOOP-5420. Fix LinuxTaskController to kill tasks using the process
+ groups they are launched with.
+ (Sreekanth Ramakrishnan via yhemanth)
+
Release 0.20.1 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/core/trunk/src/c++/task-controller/configuration.c
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/c%2B%2B/task-controller/configuration.c?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/c++/task-controller/configuration.c (original)
+++ hadoop/core/trunk/src/c++/task-controller/configuration.c Fri Jun 12 09:09:35 2009
@@ -202,3 +202,36 @@
return NULL;
}
+const char ** get_values(char * key) {
+ const char ** toPass = NULL;
+ const char * value = get_value(key);
+ char *tempTok = NULL;
+ char *tempstr = NULL;
+ int size = 0;
+ int len;
+ //first allocate an array of MAX_SIZE
+ if(value != NULL) {
+ toPass = (const char **) malloc(sizeof(char *) * MAX_SIZE);
+ tempTok = strtok_r((char *)value, ",", &tempstr);
+ if (tempTok != NULL) {
+ while (1) {
+ toPass[size++] = tempTok;
+ tempTok = strtok_r(NULL, ",", &tempstr);
+ if(tempTok == NULL){
+ break;
+ }
+ if((size % MAX_SIZE) == 0) {
+ toPass = (const char **) realloc(toPass,(sizeof(char *) *
+ (MAX_SIZE * ((size/MAX_SIZE) +1))));
+ }
+ }
+ } else {
+ toPass[size] = (char *)value;
+ }
+ }
+ if(size > 0) {
+ toPass[size] = NULL;
+ }
+ return toPass;
+}
+
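A usage sketch for the new get_values() helper above, assuming a configuration entry such as mapred.local.dir=/disk1/mapred,/disk2/mapred (illustrative only; the returned array is NULL-terminated when at least one token is found, and the caller frees it, as check_tt_root() in task-controller.c does):

    #include <stdio.h>
    #include <stdlib.h>
    #include "configuration.h"       /* declares get_values() */

    /* Illustrative caller; "mapred.local.dir" is TT_SYS_DIR_KEY,
     * the key the patch reads this way in check_tt_root(). */
    void print_local_dirs(void) {
      const char **dirs = get_values("mapred.local.dir");
      int i;
      if (dirs == NULL) {
        return;                      /* key absent from the configuration */
      }
      for (i = 0; dirs[i] != NULL; i++) {
        printf("local dir %d: %s\n", i, dirs[i]);
      }
      free((void *) dirs);           /* tokens point into configuration storage */
    }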
Modified: hadoop/core/trunk/src/c++/task-controller/configuration.h.in
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/c%2B%2B/task-controller/configuration.h.in?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/c++/task-controller/configuration.h.in (original)
+++ hadoop/core/trunk/src/c++/task-controller/configuration.h.in Fri Jun 12 09:09:35 2009
@@ -57,3 +57,6 @@
//method to free allocated configuration
void free_configurations();
+//function to return the array of values for the given key. Values are
+//comma separated strings.
+const char ** get_values(char* key);
Modified: hadoop/core/trunk/src/c++/task-controller/main.c
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/c%2B%2B/task-controller/main.c?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/c++/task-controller/main.c (original)
+++ hadoop/core/trunk/src/c++/task-controller/main.c Fri Jun 12 09:09:35 2009
@@ -25,15 +25,16 @@
const char * task_id = NULL;
const char * tt_root = NULL;
int exit_code = 0;
+ const char * task_pid = NULL;
const char* const short_options = "l:";
const struct option long_options[] = { { "log", 1, NULL, 'l' }, { NULL, 0,
NULL, 0 } };
const char* log_file = NULL;
- // when we support additional commands without ttroot, this check
- // may become command specific.
- if (argc < 6) {
+ //Minimum number of arguments required to run the task-controller
+ //command-name user command tt-root
+ if (argc < 3) {
display_usage(stderr);
return INVALID_ARGUMENT_NUMBER;
}
@@ -44,7 +45,6 @@
strncpy(hadoop_conf_dir,argv[0],(strlen(argv[0]) - strlen(EXEC_PATTERN)));
hadoop_conf_dir[(strlen(argv[0]) - strlen(EXEC_PATTERN))] = '\0';
#endif
-
do {
next_option = getopt_long(argc, argv, short_options, long_options, NULL);
switch (next_option) {
@@ -88,24 +88,25 @@
}
optind = optind + 1;
command = atoi(argv[optind++]);
- job_id = argv[optind++];
- task_id = argv[optind++];
-
#ifdef DEBUG
fprintf(LOGFILE, "main : command provided %d\n",command);
fprintf(LOGFILE, "main : user is %s\n", user_detail->pw_name);
- fprintf(LOGFILE, "main : job id %s \n", job_id);
- fprintf(LOGFILE, "main : task id %s \n", task_id);
#endif
switch (command) {
- case RUN_TASK:
- tt_root = argv[optind];
+ case LAUNCH_TASK_JVM:
+ tt_root = argv[optind++];
+ job_id = argv[optind++];
+ task_id = argv[optind++];
exit_code
= run_task_as_user(user_detail->pw_name, job_id, task_id, tt_root);
break;
- case KILL_TASK:
- tt_root = argv[optind];
- exit_code = kill_user_task(user_detail->pw_name, job_id, task_id, tt_root);
+ case TERMINATE_TASK_JVM:
+ task_pid = argv[optind++];
+ exit_code = kill_user_task(user_detail->pw_name, task_pid, SIGTERM);
+ break;
+ case KILL_TASK_JVM:
+ task_pid = argv[optind++];
+ exit_code = kill_user_task(user_detail->pw_name, task_pid, SIGKILL);
break;
default:
exit_code = INVALID_COMMAND_PROVIDED;
Modified: hadoop/core/trunk/src/c++/task-controller/task-controller.c
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/c%2B%2B/task-controller/task-controller.c?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/c++/task-controller/task-controller.c (original)
+++ hadoop/core/trunk/src/c++/task-controller/task-controller.c Fri Jun 12 09:09:35 2009
@@ -23,9 +23,6 @@
//LOGFILE
FILE *LOGFILE;
-//hadoop temp dir root which is configured in secure configuration
-const char *mapred_local_dir;
-
//placeholder for global cleanup operations
void cleanup() {
free_configurations();
@@ -36,10 +33,14 @@
if (get_user_details(user) < 0) {
return -1;
}
-#ifdef DEBUG
- fprintf(LOGFILE,"change_user : setting user as %s ", user_detail->pw_name);
-#endif
+
+ if(initgroups(user_detail->pw_name, user_detail->pw_gid) != 0) {
+ cleanup();
+ return SETUID_OPER_FAILED;
+ }
+
errno = 0;
+
setgid(user_detail->pw_gid);
if (errno != 0) {
fprintf(LOGFILE, "unable to setgid : %s\n", strerror(errno));
@@ -70,90 +71,61 @@
return 0;
}
-//Function to set the hadoop.temp.dir key from configuration.
-//would return -1 if the configuration is not proper.
-
-int get_mapred_local_dir() {
-
- if (mapred_local_dir == NULL) {
- mapred_local_dir = get_value(TT_SYS_DIR_KEY);
- }
-
- //after the call it should not be null
- if (mapred_local_dir == NULL) {
- return -1;
- } else {
- return 0;
- }
-
-}
// function to check if the passed tt_root is present in hadoop.tmp.dir
int check_tt_root(const char *tt_root) {
- char *token;
+ char ** mapred_local_dir;
int found = -1;
if (tt_root == NULL) {
return -1;
}
- if (mapred_local_dir == NULL) {
- if (get_mapred_local_dir() < 0) {
- return -1;
- }
- }
+ mapred_local_dir = (char **)get_values(TT_SYS_DIR_KEY);
- token = strtok((char *) mapred_local_dir, ",");
- if (token == NULL && mapred_local_dir != NULL) {
- token = (char *)mapred_local_dir;
+ if (mapred_local_dir == NULL) {
+ return -1;
}
- while (1) {
- if (strcmp(tt_root, token) == 0) {
+ while(*mapred_local_dir != NULL) {
+ if(strcmp(*mapred_local_dir,tt_root) == 0) {
found = 0;
break;
}
- token = strtok(NULL, ",");
- if (token == NULL) {
- break;
- }
}
-
+ free(mapred_local_dir);
return found;
-
}
-/*
- *d function which would return .pid file path which is used while running
- * and killing of the tasks by the user.
- *
- * check TT_SYS_DIR for pattern
+/**
+ * Function to check if the constructed path and absolute
+ * path resolve to one and the same.
*/
-void get_pid_path(const char * jobid, const char * taskid, const char *tt_root,
- char ** pid_path) {
-
- int str_len = strlen(TT_SYS_DIR) + strlen(jobid) + strlen(taskid) + strlen(
- tt_root);
- *pid_path = NULL;
- if (mapred_local_dir == NULL) {
- if (get_mapred_local_dir() < 0) {
- return;
- }
+int check_path(char *path) {
+ char * resolved_path = (char *) canonicalize_file_name(path);
+ if(resolved_path == NULL) {
+ return ERROR_RESOLVING_FILE_PATH;
+ }
+ if(strcmp(resolved_path, path) !=0) {
+ free(resolved_path);
+ return RELATIVE_PATH_COMPONENTS_IN_FILE_PATH;
}
-
- *pid_path = (char *) malloc(sizeof(char) * (str_len + 1));
-
- if (*pid_path == NULL) {
- fprintf(LOGFILE, "unable to allocate memory for pid path\n");
- return;
+ free(resolved_path);
+ return 0;
+}
+/**
+ * Function to check if a user actually owns the file.
+ */
+int check_owner(uid_t uid, char *path) {
+ struct stat filestat;
+ if(stat(path, &filestat)!=0) {
+ return UNABLE_TO_STAT_FILE;
+ }
+ //check owner.
+ if(uid != filestat.st_uid){
+ return FILE_NOT_OWNED_BY_TASKTRACKER;
}
- memset(*pid_path,'\0',str_len+1);
- snprintf(*pid_path, str_len, TT_SYS_DIR, tt_root, jobid, taskid);
-#ifdef DEBUG
- fprintf(LOGFILE, "get_pid_path : pid path = %s\n", *pid_path);
- fflush(LOGFILE);
-#endif
-
+ return 0;
}
/*
@@ -163,19 +135,19 @@
*/
void get_task_file_path(const char * jobid, const char * taskid,
const char * tt_root, char **task_script_path) {
+ const char ** mapred_local_dir = get_values(TT_SYS_DIR_KEY);
*task_script_path = NULL;
int str_len = strlen(TT_LOCAL_TASK_SCRIPT_PATTERN) + strlen(jobid) + (strlen(
taskid)) + strlen(tt_root);
if (mapred_local_dir == NULL) {
- if (get_mapred_local_dir() < 0) {
- return;
- }
+ return;
}
*task_script_path = (char *) malloc(sizeof(char) * (str_len + 1));
if (*task_script_path == NULL) {
fprintf(LOGFILE, "Unable to allocate memory for task_script_path \n");
+ free(mapred_local_dir);
return;
}
@@ -186,13 +158,13 @@
fprintf(LOGFILE, "get_task_file_path : task script path = %s\n",
*task_script_path);
fflush(LOGFILE);
#endif
-
+ free(mapred_local_dir);
}
//end of private functions
void display_usage(FILE *stream) {
fprintf(stream,
- "Usage: task-controller [-l logile] user command command-args\n");
+ "Usage: task-controller [-l logfile] user command command-args\n");
}
//function used to populate a user_details structure.
@@ -212,28 +184,20 @@
*Function used to launch a task as the provided user.
* First the function checks if the tt_root passed is found in
* hadoop.temp.dir
- *
- *Then gets the path to which the task has to write its pid from
- *get_pid_path.
- *
- * THen writes its pid into the file.
- *
- * Then changes the permission of the pid file into 777
- *
- * Then uses get_task_file_path to fetch the task script file path.
- *
+ * Uses get_task_file_path to fetch the task script file path.
* Does an execlp on the same in order to replace the current image with
* task image.
- *
*/
int run_task_as_user(const char * user, const char *jobid, const char *taskid,
const char *tt_root) {
char *task_script_path = NULL;
- char *pid_path = NULL;
- FILE *file_handle = NULL;
int exit_code = 0;
- int i = 0;
+ uid_t uid = getuid();
+
+ if(jobid == NULL || taskid == NULL) {
+ return INVALID_ARGUMENT_NUMBER;
+ }
#ifdef DEBUG
fprintf(LOGFILE,"run_task_as_user : Job id : %s \n", jobid);
@@ -241,7 +205,8 @@
fprintf(LOGFILE,"run_task_as_user : tt_root : %s \n", tt_root);
fflush(LOGFILE);
#endif
-
+ //Check tt_root before switching the user, as reading configuration
+ //file requires privileged access.
if (check_tt_root(tt_root) < 0) {
fprintf(LOGFILE, "invalid tt root passed %s\n", tt_root);
cleanup();
@@ -257,44 +222,21 @@
return SETUID_OPER_FAILED;
}
- get_pid_path(jobid, taskid, tt_root, &pid_path);
-
- if (pid_path == NULL) {
+ get_task_file_path(jobid, taskid, tt_root, &task_script_path);
+ if (task_script_path == NULL) {
cleanup();
- return INVALID_PID_PATH;
+ return INVALID_TASK_SCRIPT_PATH;
}
-
errno = 0;
- file_handle = fopen(pid_path, "w");
-
- if (file_handle == NULL) {
- exit_code = UNABLE_TO_OPEN_PID_FILE_WRITE_MODE;
+ exit_code = check_path(task_script_path);
+ if(exit_code != 0) {
goto cleanup;
}
-
errno = 0;
- if (fprintf(file_handle, "%d\n", getpid()) < 0) {
- exit_code = UNABLE_TO_WRITE_TO_PID_FILE;
+ exit_code = check_owner(uid, task_script_path);
+ if(exit_code != 0) {
goto cleanup;
}
-
- fflush(file_handle);
- fclose(file_handle);
- //set file handle to null after closing so it would not be double closed
- //in cleanup label
- file_handle = NULL;
- //change the permissions of the file
- errno = 0;
- //free pid_t path which is allocated
- free(pid_path);
- pid_path = NULL;
-
- get_task_file_path(jobid, taskid, tt_root, &task_script_path);
-
- if (task_script_path == NULL) {
- cleanup();
- return INVALID_TASK_SCRIPT_PATH;
- }
errno = 0;
cleanup();
execlp(task_script_path, task_script_path, NULL);
@@ -306,83 +248,53 @@
return exit_code;
cleanup:
- if (pid_path != NULL) {
- free(pid_path);
- }
if (task_script_path != NULL) {
free(task_script_path);
}
- if (file_handle != NULL) {
- fclose(file_handle);
- }
// free configurations
cleanup();
return exit_code;
}
+
/**
- * Function used to terminate a task launched by the user.
- *
- * The function first checks if the passed tt-root is found in
- * configured hadoop.temp.dir (which is a list of tt_roots).
- *
- * Then gets the task-pid path using function get_pid_path.
- *
- * reads the task-pid from the file which is mentioned by get_pid_path
- *
- * kills the task by sending SIGTERM to that particular process.
- *
+ * Function used to terminate/kill a task launched by the user.
+ * The function sends the appropriate signal to the process group
+ * specified by the task_pid.
*/
-int kill_user_task(const char *user, const char *jobid, const char *taskid,
- const char *tt_root) {
+int kill_user_task(const char *user, const char *task_pid, int sig) {
int pid = 0;
- int i = 0;
- char *pid_path = NULL;
- FILE *file_handle = NULL;
-#ifdef DEBUG
- fprintf(LOGFILE,"kill_user_task : Job id : %s \n", jobid);
- fprintf(LOGFILE,"kill_user_task : task id : %s \n", taskid);
- fprintf(LOGFILE,"kill_user_task : tt_root : %s \n", tt_root);
- fflush(LOGFILE);
-#endif
- if (check_tt_root(tt_root) < 0) {
- fprintf(LOGFILE, "invalid tt root passed %s\n", tt_root);
- cleanup();
- return INVALID_TT_ROOT;
+ if(task_pid == NULL) {
+ return INVALID_ARGUMENT_NUMBER;
}
+ pid = atoi(task_pid);
+ if(pid <= 0) {
+ return INVALID_TASK_PID;
+ }
fclose(LOGFILE);
fcloseall();
-
if (change_user(user) != 0) {
cleanup();
return SETUID_OPER_FAILED;
}
- get_pid_path(jobid, taskid, tt_root, &pid_path);
- if (pid_path == NULL) {
- cleanup();
- return INVALID_PID_PATH;
- }
- file_handle = fopen(pid_path, "r");
- if (file_handle == NULL) {
- free(pid_path);
- cleanup();
- return UNABLE_TO_OPEN_PID_FILE_READ_MODE;
- }
- fscanf(file_handle, "%d", &pid);
- fclose(file_handle);
- free(pid_path);
- if (pid == 0) {
- cleanup();
- return UNABLE_TO_READ_PID;
+ //Don't continue if the process-group is not alive anymore.
+ if(kill(-pid,0) < 0) {
+ errno = 0;
+ return 0;
}
- if (kill(pid, SIGTERM) < 0) {
- fprintf(LOGFILE, "%s\n", strerror(errno));
- cleanup();
- return UNABLE_TO_KILL_TASK;
+
+ if (kill(-pid, sig) < 0) {
+ if(errno != ESRCH) {
+ fprintf(LOGFILE, "Error is %s\n", strerror(errno));
+ cleanup();
+ return UNABLE_TO_KILL_TASK;
+ }
+ errno = 0;
}
cleanup();
return 0;
}
+
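The signalling convention used by kill_user_task() above, restated standalone (a sketch, not the committed code): signal 0 probes the process group without delivering anything, and ESRCH on the real signal only means the group exited in the meantime, which is not treated as an error.

    #include <errno.h>
    #include <signal.h>
    #include <stdlib.h>

    /* Sketch of the process-group signalling done by kill_user_task(). */
    int signal_task_group(const char *task_pid, int sig) {
      int pid = atoi(task_pid);
      if (pid <= 0) {
        return -1;                      /* not a usable pid string */
      }
      if (kill(-pid, 0) < 0) {          /* probe: is the group still alive? */
        errno = 0;
        return 0;                       /* already gone, nothing to do */
      }
      if (kill(-pid, sig) < 0 && errno != ESRCH) {
        return -1;                      /* a real failure, e.g. EPERM */
      }
      errno = 0;
      return 0;
    }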
Modified: hadoop/core/trunk/src/c++/task-controller/task-controller.h
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/c%2B%2B/task-controller/task-controller.h?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/c++/task-controller/task-controller.h (original)
+++ hadoop/core/trunk/src/c++/task-controller/task-controller.h Fri Jun 12 09:09:35 2009
@@ -28,42 +28,37 @@
#include <sys/stat.h>
#include <sys/signal.h>
#include <getopt.h>
+#include <grp.h>
#include "configuration.h"
//command definitions
enum command {
- RUN_TASK,
- KILL_TASK
+ LAUNCH_TASK_JVM,
+ TERMINATE_TASK_JVM,
+ KILL_TASK_JVM
};
enum errorcodes {
INVALID_ARGUMENT_NUMBER = 1,
- INVALID_USER_NAME,
- INVALID_COMMAND_PROVIDED,
- SUPER_USER_NOT_ALLOWED_TO_RUN_TASKS,
- OUT_OF_MEMORY,
- INVALID_TT_ROOT,
- INVALID_PID_PATH,
- UNABLE_TO_OPEN_PID_FILE_WRITE_MODE,
- UNABLE_TO_OPEN_PID_FILE_READ_MODE,
- UNABLE_TO_WRITE_TO_PID_FILE,
- SETUID_OPER_FAILED,
- INVALID_TASK_SCRIPT_PATH,
- UNABLE_TO_EXECUTE_TASK_SCRIPT,
- UNABLE_TO_READ_PID,
- UNABLE_TO_KILL_TASK,
- UNABLE_TO_FIND_PARENT_PID_FILE,
- TASK_CONTROLLER_SPAWNED_BY_INVALID_PARENT_PROCESS,
- UNABLE_TO_READ_PARENT_PID
+ INVALID_USER_NAME, //2
+ INVALID_COMMAND_PROVIDED, //3
+ SUPER_USER_NOT_ALLOWED_TO_RUN_TASKS, //4
+ INVALID_TT_ROOT, //5
+ SETUID_OPER_FAILED, //6
+ INVALID_TASK_SCRIPT_PATH, //7
+ UNABLE_TO_EXECUTE_TASK_SCRIPT, //8
+ UNABLE_TO_KILL_TASK, //9
+ INVALID_PROCESS_LAUNCHING_TASKCONTROLLER, //10
+ INVALID_TASK_PID, //11
+ ERROR_RESOLVING_FILE_PATH, //12
+ RELATIVE_PATH_COMPONENTS_IN_FILE_PATH, //13
+ UNABLE_TO_STAT_FILE, //14
+ FILE_NOT_OWNED_BY_TASKTRACKER //15
};
-#define TT_PID_PATTERN "%s/hadoop-%s-tasktracker.pid"
-
#define TT_LOCAL_TASK_SCRIPT_PATTERN "%s/taskTracker/jobcache/%s/%s/taskjvm.sh"
-#define TT_SYS_DIR "%s/taskTracker/jobcache/%s/%s/.pid"
-
#define TT_SYS_DIR_KEY "mapred.local.dir"
#define MAX_ITEMS 10
@@ -81,8 +76,6 @@
int run_task_as_user(const char * user, const char *jobid, const char *taskid,
const char *tt_root);
-int verify_parent();
-
-int kill_user_task(const char *user, const char *jobid, const char *taskid,
-    const char *tt_root);
+int kill_user_task(const char *user, const char *task_pid, int sig);
int get_user_details(const char *user);
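The four new error codes 12-15 above back the pre-execution checks added in task-controller.c: the task script path must canonicalize to itself (no symlinks or relative components) and must be owned by the tasktracker's uid. A condensed sketch of that sequence (illustrative only; canonicalize_file_name is the GNU extension the patch uses, equivalent to realpath(path, NULL)):

    #define _GNU_SOURCE
    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include "task-controller.h"   /* declares the errorcodes enum above */

    /* Sketch combining check_path() and check_owner() from task-controller.c. */
    int verify_task_script(const char *path) {
      struct stat st;
      char *resolved = canonicalize_file_name(path);
      if (resolved == NULL) {
        return ERROR_RESOLVING_FILE_PATH;
      }
      if (strcmp(resolved, path) != 0) {   /* symlinks or ".." components */
        free(resolved);
        return RELATIVE_PATH_COMPONENTS_IN_FILE_PATH;
      }
      free(resolved);
      if (stat(path, &st) != 0) {
        return UNABLE_TO_STAT_FILE;
      }
      if (st.st_uid != getuid()) {         /* tasktracker must own the script */
        return FILE_NOT_OWNED_BY_TASKTRACKER;
      }
      return 0;
    }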
Modified: hadoop/core/trunk/src/core/org/apache/hadoop/util/ProcessTree.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/core/org/apache/hadoop/util/ProcessTree.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/core/org/apache/hadoop/util/ProcessTree.java (original)
+++ hadoop/core/trunk/src/core/org/apache/hadoop/util/ProcessTree.java Fri Jun 12 09:09:35 2009
@@ -54,73 +54,24 @@
}
/**
- * Kills the process(OR process group) by sending the signal SIGKILL
- * in the current thread
- * @param pid Process id(OR process group id) of to-be-deleted-process
- * @param isProcessGroup Is pid a process group id of to-be-deleted-processes
- * @param sleepTimeBeforeSigKill wait time before sending SIGKILL after
- * sending SIGTERM
- */
- private static void sigKillInCurrentThread(String pid, boolean isProcessGroup,
- long sleepTimeBeforeSigKill) {
- // Kill the subprocesses of root process(even if the root process is not
- // alive) if process group is to be killed.
- if (isProcessGroup || ProcessTree.isAlive(pid)) {
- try {
- // Sleep for some time before sending SIGKILL
- Thread.sleep(sleepTimeBeforeSigKill);
- } catch (InterruptedException i) {
- LOG.warn("Thread sleep is interrupted.");
- }
-
- ShellCommandExecutor shexec = null;
-
- try {
- String pid_pgrpid;
- if(isProcessGroup) {//kill the whole process group
- pid_pgrpid = "-" + pid;
- }
- else {//kill single process
- pid_pgrpid = pid;
- }
-
- String[] args = { "kill", "-9", pid_pgrpid };
- shexec = new ShellCommandExecutor(args);
- shexec.execute();
- } catch (IOException ioe) {
- LOG.warn("Error executing shell command " + ioe);
- } finally {
- if(isProcessGroup) {
- LOG.info("Killing process group" + pid + " with SIGKILL. Exit code "
- + shexec.getExitCode());
- }
- else {
- LOG.info("Killing process " + pid + " with SIGKILL. Exit code "
- + shexec.getExitCode());
- }
- }
- }
- }
-
- /** Kills the process(OR process group) by sending the signal SIGKILL
- * @param pid Process id(OR process group id) of to-be-deleted-process
- * @param isProcessGroup Is pid a process group id of to-be-deleted-processes
+ * Destroy the process-tree.
+ * @param pid process id of the root process of the subtree of processes
+ * to be killed
* @param sleeptimeBeforeSigkill The time to wait before sending SIGKILL
* after sending SIGTERM
+ * @param isProcessGroup pid is a process group leader or not
* @param inBackground Process is to be killed in the back ground with
* a separate thread
*/
- private static void sigKill(String pid, boolean isProcessGroup,
- long sleeptimeBeforeSigkill, boolean inBackground) {
-
- if(inBackground) { // use a separate thread for killing
- SigKillThread sigKillThread = new SigKillThread(pid, isProcessGroup,
- sleeptimeBeforeSigkill);
- sigKillThread.setDaemon(true);
- sigKillThread.start();
+ public static void destroy(String pid, long sleeptimeBeforeSigkill,
+ boolean isProcessGroup, boolean inBackground) {
+ if(isProcessGroup) {
+ destroyProcessGroup(pid, sleeptimeBeforeSigkill, inBackground);
}
else {
- sigKillInCurrentThread(pid, isProcessGroup, sleeptimeBeforeSigkill);
+ //TODO: Destroy all the processes in the subtree in this case also.
+ // For the time being, killing only the root process.
+ destroyProcess(pid, sleeptimeBeforeSigkill, inBackground);
}
}
@@ -133,6 +84,29 @@
*/
protected static void destroyProcess(String pid, long sleeptimeBeforeSigkill,
boolean inBackground) {
+ terminateProcess(pid);
+ sigKill(pid, false, sleeptimeBeforeSigkill, inBackground);
+ }
+
+ /** Destroy the process group.
+ * @param pgrpId Process group id of to-be-killed-processes
+ * @param sleeptimeBeforeSigkill The time to wait before sending SIGKILL
+ * after sending SIGTERM
+ * @param inBackground Process group is to be killed in the back ground with
+ * a separate thread
+ */
+ protected static void destroyProcessGroup(String pgrpId,
+ long sleeptimeBeforeSigkill, boolean inBackground) {
+ terminateProcessGroup(pgrpId);
+ sigKill(pgrpId, true, sleeptimeBeforeSigkill, inBackground);
+ }
+
+ /**
+ * Sends terminate signal to the process, allowing it to gracefully exit.
+ *
+ * @param pid pid of the process to be sent SIGTERM
+ */
+ public static void terminateProcess(String pid) {
ShellCommandExecutor shexec = null;
try {
String[] args = { "kill", pid };
@@ -144,19 +118,15 @@
LOG.info("Killing process " + pid +
" with SIGTERM. Exit code " + shexec.getExitCode());
}
-
- sigKill(pid, false, sleeptimeBeforeSigkill, inBackground);
}
-
- /** Destroy the process group.
- * @param pgrpId Process group id of to-be-killed-processes
- * @param sleeptimeBeforeSigkill The time to wait before sending SIGKILL
- * after sending SIGTERM
- * @param inBackground Process group is to be killed in the back ground with
- * a separate thread
+
+ /**
+   * Sends terminate signal to all the processes belonging to the passed process
+ * group, allowing the group to gracefully exit.
+ *
+ * @param pgrpId process group id
*/
- protected static void destroyProcessGroup(String pgrpId,
- long sleeptimeBeforeSigkill, boolean inBackground) {
+ public static void terminateProcessGroup(String pgrpId) {
ShellCommandExecutor shexec = null;
try {
String[] args = { "kill", "--", "-" + pgrpId };
@@ -168,37 +138,115 @@
LOG.info("Killing all processes in the process group " + pgrpId +
" with SIGTERM. Exit code " + shexec.getExitCode());
}
-
- sigKill(pgrpId, true, sleeptimeBeforeSigkill, inBackground);
}
/**
- * Destroy the process-tree.
- * @param pid process id of the root process of the subtree of processes
- * to be killed
+ * Kills the process(OR process group) by sending the signal SIGKILL
+ * in the current thread
+ * @param pid Process id(OR process group id) of to-be-deleted-process
+ * @param isProcessGroup Is pid a process group id of to-be-deleted-processes
+ * @param sleepTimeBeforeSigKill wait time before sending SIGKILL after
+ * sending SIGTERM
+ */
+ private static void sigKillInCurrentThread(String pid, boolean isProcessGroup,
+ long sleepTimeBeforeSigKill) {
+ // Kill the subprocesses of root process(even if the root process is not
+ // alive) if process group is to be killed.
+ if (isProcessGroup || ProcessTree.isAlive(pid)) {
+ try {
+ // Sleep for some time before sending SIGKILL
+ Thread.sleep(sleepTimeBeforeSigKill);
+ } catch (InterruptedException i) {
+ LOG.warn("Thread sleep is interrupted.");
+ }
+ if(isProcessGroup) {
+ killProcessGroup(pid);
+ } else {
+ killProcess(pid);
+ }
+ }
+ }
+
+
+ /** Kills the process(OR process group) by sending the signal SIGKILL
+ * @param pid Process id(OR process group id) of to-be-deleted-process
+ * @param isProcessGroup Is pid a process group id of to-be-deleted-processes
* @param sleeptimeBeforeSigkill The time to wait before sending SIGKILL
* after sending SIGTERM
- * @param isProcessGroup pid is a process group leader or not
* @param inBackground Process is to be killed in the back ground with
* a separate thread
*/
- public static void destroy(String pid, long sleeptimeBeforeSigkill,
- boolean isProcessGroup, boolean inBackground) {
- if(isProcessGroup) {
- destroyProcessGroup(pid, sleeptimeBeforeSigkill, inBackground);
+ private static void sigKill(String pid, boolean isProcessGroup,
+ long sleeptimeBeforeSigkill, boolean inBackground) {
+
+ if(inBackground) { // use a separate thread for killing
+ SigKillThread sigKillThread = new SigKillThread(pid, isProcessGroup,
+ sleeptimeBeforeSigkill);
+ sigKillThread.setDaemon(true);
+ sigKillThread.start();
}
else {
- //TODO: Destroy all the processes in the subtree in this case also.
- // For the time being, killing only the root process.
- destroyProcess(pid, sleeptimeBeforeSigkill, inBackground);
+ sigKillInCurrentThread(pid, isProcessGroup, sleeptimeBeforeSigkill);
+ }
+ }
+
+ /**
+ * Sends kill signal to process, forcefully terminating the process.
+ *
+ * @param pid process id
+ */
+ public static void killProcess(String pid) {
+
+ //If process tree is not alive then return immediately.
+ if(!ProcessTree.isAlive(pid)) {
+ return;
+ }
+ String[] args = { "kill", "-9", pid };
+ ShellCommandExecutor shexec = new ShellCommandExecutor(args);
+ try {
+ shexec.execute();
+ } catch (IOException e) {
+ LOG.warn("Error sending SIGKILL to process "+ pid + " ."+
+ StringUtils.stringifyException(e));
+ } finally {
+ LOG.info("Killing process " + pid + " with SIGKILL. Exit code "
+ + shexec.getExitCode());
}
}
+ /**
+   * Sends kill signal to all processes belonging to the same process group,
+ * forcefully terminating the process group.
+ *
+ * @param pgrpId process group id
+ */
+ public static void killProcessGroup(String pgrpId) {
+
+ //If process tree is not alive then return immediately.
+ if(!ProcessTree.isProcessGroupAlive(pgrpId)) {
+ return;
+ }
+ String[] args = { "kill", "-9", "-"+pgrpId };
+ ShellCommandExecutor shexec = new ShellCommandExecutor(args);
+ try {
+ shexec.execute();
+ } catch (IOException e) {
+ LOG.warn("Error sending SIGKILL to process group "+ pgrpId + " ."+
+ StringUtils.stringifyException(e));
+ } finally {
+ LOG.info("Killing process group" + pgrpId + " with SIGKILL. Exit code "
+ + shexec.getExitCode());
+ }
+ }
+
/**
* Is the process with PID pid still alive?
* This method assumes that isAlive is called on a pid that was alive not
* too long ago, and hence assumes no chance of pid-wrapping-around.
+ *
+ * @param pid pid of the process to check.
+ * @return true if process is alive.
*/
public static boolean isAlive(String pid) {
ShellCommandExecutor shexec = null;
@@ -215,6 +263,32 @@
}
return (shexec.getExitCode() == 0 ? true : false);
}
+
+ /**
+   * Is the process group with the given pgrpId still alive?
+ *
+ * This method assumes that isAlive is called on a pid that was alive not
+ * too long ago, and hence assumes no chance of pid-wrapping-around.
+ *
+ * @param pgrpId process group id
+ * @return true if any of process in group is alive.
+ */
+ public static boolean isProcessGroupAlive(String pgrpId) {
+ ShellCommandExecutor shexec = null;
+ try {
+ String[] args = { "kill", "-0", "-"+pgrpId };
+ shexec = new ShellCommandExecutor(args);
+ shexec.execute();
+ } catch (ExitCodeException ee) {
+ return false;
+ } catch (IOException ioe) {
+ LOG.warn("Error executing shell command "
+ + Arrays.toString(shexec.getExecString()) + ioe);
+ return false;
+ }
+ return (shexec.getExitCode() == 0 ? true : false);
+ }
+
/**
* Helper thread class that kills process-tree with SIGKILL in background
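The restructured ProcessTree above drives its graceful-then-forceful sequence through the shell kill command; the same terminate, wait, kill pattern expressed directly against kill(2) looks like the following sketch (illustrative only; the grace period stands in for sleeptimeBeforeSigkill):

    #include <errno.h>
    #include <signal.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Sketch of the sequence destroyProcessGroup() performs via the shell:
     * SIGTERM first, a grace period, then SIGKILL for anything still alive. */
    void destroy_process_group(pid_t pgrp, unsigned grace_seconds) {
      kill(-pgrp, SIGTERM);          /* ask the whole group to exit cleanly */
      sleep(grace_seconds);          /* sleeptimeBeforeSigkill equivalent */
      if (kill(-pgrp, 0) == 0) {     /* is anything in the group left? */
        kill(-pgrp, SIGKILL);        /* force it */
      }
      errno = 0;
    }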
Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/cluster_setup.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/cluster_setup.xml?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/cluster_setup.xml (original)
+++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/cluster_setup.xml Fri Jun 12 09:09:35 2009
@@ -696,7 +696,10 @@
<p>
The executable must be deployed as a setuid executable, by changing
- the ownership to <em>root</em> and giving it permissions <em>4755</em>.
+ the ownership to <em>root</em>, group ownership to that of the tasktracker,
+ and giving it permissions <em>4510</em>. Note that the group which owns the
+ task-controller should contain only the tasktracker as its member, and not
+ users who submit jobs.
</p>
<p>The executable requires a configuration file called
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/DefaultTaskController.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/DefaultTaskController.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/DefaultTaskController.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/DefaultTaskController.java Fri Jun 12 09:09:35 2009
@@ -59,43 +59,7 @@
context.shExec = shexec;
shexec.execute();
}
-
- /**
- * Kills the JVM running the task stored in the context.
- *
- * @param context the context storing the task running within the JVM
- * that needs to be killed.
- */
- void killTaskJVM(TaskController.TaskControllerContext context) {
- ShellCommandExecutor shexec = context.shExec;
-
- if (shexec != null) {
- Process process = shexec.getProcess();
- if (Shell.WINDOWS) {
- // Currently we don't use setsid on WINDOWS. So kill the process alone.
- if (process != null) {
- process.destroy();
- }
- }
- else { // In addition to the task JVM, kill its subprocesses also.
- String pid = context.pid;
- if (pid != null) {
- ProcessTree.destroy(pid, context.sleeptimeBeforeSigkill,
- ProcessTree.isSetsidAvailable, false);
- try {
- if (process != null) {
- LOG.info("Process exited with exit code:" + process.waitFor());
- }
- } catch (InterruptedException ie) {}
- }
- else if (process != null) {
- // kill the task JVM alone, if we don't have the process group id
- process.destroy();
- }
- }
- }
- }
-
+
/**
* Initialize the task environment.
*
@@ -123,5 +87,50 @@
@Override
void initializeJob(JobID jobId) {
}
+
+ @Override
+ void terminateTask(TaskControllerContext context) {
+ ShellCommandExecutor shexec = context.shExec;
+ if (shexec != null) {
+ Process process = shexec.getProcess();
+ if (Shell.WINDOWS) {
+ // Currently we don't use setsid on WINDOWS.
+ //So kill the process alone.
+ if (process != null) {
+ process.destroy();
+ }
+ }
+ else { // In addition to the task JVM, kill its subprocesses also.
+ String pid = context.pid;
+ if (pid != null) {
+ if(ProcessTree.isSetsidAvailable) {
+ ProcessTree.terminateProcessGroup(pid);
+ }else {
+ ProcessTree.terminateProcess(pid);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void killTask(TaskControllerContext context) {
+ ShellCommandExecutor shexec = context.shExec;
+ if (shexec != null) {
+ if (Shell.WINDOWS) {
+      //We don't send a kill signal on Windows, as we have
+      //already done a process.destroy() in terminateTask()
+ return;
+ }
+ String pid = context.pid;
+ if (pid != null) {
+ if(ProcessTree.isSetsidAvailable) {
+ ProcessTree.killProcessGroup(pid);
+ }else {
+ ProcessTree.killProcess(pid);
+ }
+ }
+ }
+ }
}
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/JvmManager.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/JvmManager.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/JvmManager.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/JvmManager.java Fri Jun 12 09:09:35 2009
@@ -438,7 +438,7 @@
.getLong("mapred.tasktracker.tasks.sleeptime-before-sigkill",
ProcessTree.DEFAULT_SLEEPTIME_BEFORE_SIGKILL);
- controller.killTaskJVM(initalContext);
+ controller.destroyTaskJVM(initalContext);
} else {
LOG.info(String.format("JVM Not killed %s but just removed", jvmId
.toString()));
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/LinuxTaskController.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/LinuxTaskController.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/LinuxTaskController.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/LinuxTaskController.java Fri Jun 12 09:09:35 2009
@@ -30,6 +30,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.mapred.JvmManager.JvmEnv;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
/**
@@ -107,6 +108,7 @@
*/
enum TaskCommands {
LAUNCH_TASK_JVM,
+ TERMINATE_TASK_JVM,
KILL_TASK_JVM
}
@@ -131,33 +133,48 @@
writeCommand(cmdLine, getTaskCacheDirectory(context));
// Call the taskcontroller with the right parameters.
- List<String> launchTaskJVMArgs = buildTaskCommandArgs(context);
+ List<String> launchTaskJVMArgs = buildLaunchTaskArgs(context);
ShellCommandExecutor shExec = buildTaskControllerExecutor(
TaskCommands.LAUNCH_TASK_JVM,
env.conf.getUser(),
launchTaskJVMArgs, env);
context.shExec = shExec;
- shExec.execute();
- LOG.debug("output after executing task jvm = " + shExec.getOutput());
+ try {
+ shExec.execute();
+ } catch (Exception e) {
+ LOG.warn("Exception thrown while launching task JVM : " +
+ StringUtils.stringifyException(e));
+ LOG.warn("Exit code from task is : " + shExec.getExitCode());
+ LOG.warn("Output from task-contoller is : " + shExec.getOutput());
+ throw new IOException(e);
+ }
+ if(LOG.isDebugEnabled()) {
+ LOG.debug("output after executing task jvm = " + shExec.getOutput());
+ }
}
- // convenience API for building command arguments for specific commands
- private List<String> buildTaskCommandArgs(TaskControllerContext context) {
+ /**
+ * Returns list of arguments to be passed while launching task VM.
+   * See {@code buildTaskControllerExecutor(TaskCommands,
+ * String, List<String>, JvmEnv)} documentation.
+ * @param context
+ * @return Argument to be used while launching Task VM
+ */
+ private List<String> buildLaunchTaskArgs(TaskControllerContext context) {
List<String> commandArgs = new ArrayList<String>(3);
String taskId = context.task.getTaskID().toString();
String jobId = getJobId(context);
+ LOG.debug("getting the task directory as: "
+ + getTaskCacheDirectory(context));
+ commandArgs.add(getDirectoryChosenForTask(
+ new File(getTaskCacheDirectory(context)),
+ context));
commandArgs.add(jobId);
if(!context.task.isTaskCleanupTask()) {
commandArgs.add(taskId);
}else {
commandArgs.add(taskId + TaskTracker.TASK_CLEANUP_SUFFIX);
}
-
- LOG.debug("getting the task directory as: "
- + getTaskCacheDirectory(context));
- commandArgs.add(getDirectoryChosenForTask(
- new File(getTaskCacheDirectory(context)),
- context));
return commandArgs;
}
@@ -173,7 +190,7 @@
// in mapred.local.dir chosen for storing data pertaining to
// this task.
private String getDirectoryChosenForTask(File directory,
- TaskControllerContext context) {
+ TaskControllerContext context) {
String jobId = getJobId(context);
String taskId = context.task.getTaskID().toString();
for (String dir : mapredLocalDirs) {
@@ -184,43 +201,13 @@
return dir;
}
}
-
+
LOG.error("Couldn't parse task cache directory correctly");
throw new IllegalArgumentException("invalid task cache directory "
- + directory.getAbsolutePath());
+ + directory.getAbsolutePath());
}
/**
- * Kill a launched task JVM running as the user of the job.
- *
- * This method will launch the task controller setuid executable
- * that in turn will kill the task JVM by sending a kill signal.
- * @param context the context storing the task running within the JVM
- * that needs to be killed.
- */
- void killTaskJVM(TaskControllerContext context) {
-
- if(context.task == null) {
- LOG.info("Context task null not killing the JVM");
- return;
- }
-
- JvmEnv env = context.env;
- List<String> killTaskJVMArgs = buildTaskCommandArgs(context);
- try {
- ShellCommandExecutor shExec = buildTaskControllerExecutor(
- TaskCommands.KILL_TASK_JVM,
- context.env.conf.getUser(),
- killTaskJVMArgs,
- context.env);
- shExec.execute();
- LOG.debug("Command output :" +shExec.getOutput());
- } catch (IOException ioe) {
- LOG.warn("IOException in killing task: " + ioe.getMessage());
- }
- }
-
- /**
* Setup appropriate permissions for directories and files that
* are used by the task.
*
@@ -281,9 +268,24 @@
LOG.warn("Could not change permissions for directory " + dir);
}
}
-
- // convenience API to create the executor for launching the
- // setuid script.
+ /**
+   * Builds the command line for launching/terminating/killing a task JVM.
+   * The format is as follows:
+   * <br/>
+   * For launching:
+   * <br/>
+   * {@code user-name command tt-root job_id task_id}
+   * <br/>
+   * For terminating/killing:
+   * {@code user-name command task-pid}
+ *
+ * @param command command to be executed.
+ * @param userName user name
+ * @param cmdArgs list of extra arguments
+ * @param env JVM environment variables.
+   * @return {@link ShellCommandExecutor}
+ * @throws IOException
+ */
private ShellCommandExecutor buildTaskControllerExecutor(TaskCommands command,
String userName,
List<String> cmdArgs, JvmEnv env)
@@ -420,6 +422,67 @@
}
}
+ /**
+   * API which builds the command line to be passed to the LinuxTaskController
+   * binary to terminate/kill the task. See
+   * {@code buildTaskControllerExecutor(TaskCommands,
+   * String, List<String>, JvmEnv)} documentation.
+   *
+   * @param context context of the task to which the kill signal has to be sent.
+ */
+ private List<String> buildKillTaskCommandArgs(TaskControllerContext
+ context){
+ List<String> killTaskJVMArgs = new ArrayList<String>();
+ killTaskJVMArgs.add(context.pid);
+ return killTaskJVMArgs;
+ }
+
+ /**
+   * Convenience method used to send the appropriate kill signal to the task
+ * VM
+ * @param context
+ * @param command
+ * @throws IOException
+ */
+ private void finishTask(TaskControllerContext context,
+ TaskCommands command) throws IOException{
+ if(context.task == null) {
+ LOG.info("Context task null not killing the JVM");
+ return;
+ }
+ ShellCommandExecutor shExec = buildTaskControllerExecutor(
+ command, context.env.conf.getUser(),
+ buildKillTaskCommandArgs(context), context.env);
+ try {
+ shExec.execute();
+ } catch (Exception e) {
+ LOG.warn("Output from task-contoller is : " + shExec.getOutput());
+ throw new IOException(e);
+ }
+ }
+
+ @Override
+ void terminateTask(TaskControllerContext context) {
+ try {
+ finishTask(context, TaskCommands.TERMINATE_TASK_JVM);
+ } catch (Exception e) {
+ LOG.warn("Exception thrown while sending kill to the Task VM " +
+ StringUtils.stringifyException(e));
+ }
+ }
+
+ @Override
+ void killTask(TaskControllerContext context) {
+ try {
+ finishTask(context, TaskCommands.KILL_TASK_JVM);
+ } catch (Exception e) {
+ LOG.warn("Exception thrown while sending destroy to the Task VM " +
+ StringUtils.stringifyException(e));
+ }
+ }
+
protected String getTaskControllerExecutablePath() {
return taskControllerExe;
}
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/TaskController.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/TaskController.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/TaskController.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/TaskController.java Fri Jun 12 09:09:35 2009
@@ -19,10 +19,12 @@
import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JvmManager.JvmEnv;
-import org.apache.hadoop.mapred.JobID;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
/**
@@ -38,6 +40,8 @@
private Configuration conf;
+ public static final Log LOG = LogFactory.getLog(TaskController.class);
+
public Configuration getConf() {
return conf;
}
@@ -63,13 +67,29 @@
throws IOException;
/**
- * Kill a task JVM
+   * Top-level method to clean up a task JVM.
+   *
+   * The current implementation does the following:
+   * <ol>
+   * <li>Sends a graceful terminate signal to the task JVM, allowing its
+   * sub-processes to clean up.</li>
+   * <li>Waits for the stipulated period.</li>
+   * <li>Sends a forceful kill signal to the task JVM, terminating all its
+   * sub-processes forcefully.</li>
+   * </ol>
*
- * This method defines how a JVM launched to execute one or more
- * tasks will be killed.
- * @param context
+   * @param context the task for which the kill signal has to be sent.
*/
- abstract void killTaskJVM(TaskControllerContext context);
+ final void destroyTaskJVM(TaskControllerContext context) {
+ terminateTask(context);
+ try {
+ Thread.sleep(context.sleeptimeBeforeSigkill);
+ } catch (InterruptedException e) {
+ LOG.warn("Sleep interrupted : " +
+ StringUtils.stringifyException(e));
+ }
+ killTask(context);
+ }
/**
* Perform initializing actions required before a task can run.
@@ -110,4 +130,20 @@
* @param tip Task of job for which localization happens.
*/
abstract void initializeJob(JobID jobId);
+
+ /**
+   * Sends a graceful terminate signal to the task JVM and its sub-processes.
+ *
+ * @param context task context
+ */
+ abstract void terminateTask(TaskControllerContext context);
+
+ /**
+   * Sends a KILL signal to forcefully terminate the task JVM and its
+ * sub-processes.
+ *
+ * @param context task context
+ */
+
+ abstract void killTask(TaskControllerContext context);
}
Modified: hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcesses.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcesses.java?rev=784042&r1=784041&r2=784042&view=diff
==============================================================================
--- hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcesses.java (original)
+++ hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcesses.java Fri Jun 12 09:09:35 2009
@@ -23,17 +23,22 @@
import java.io.IOException;
import java.util.Random;
import java.util.Iterator;
+import java.util.StringTokenizer;
import junit.framework.TestCase;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ProcessTree;
import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.util.Shell.ExitCodeException;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.util.TestProcfsBasedProcessTree;
import org.apache.commons.logging.Log;
@@ -49,11 +54,11 @@
.getLog(TestKillSubProcesses.class);
private static String TEST_ROOT_DIR = new File(System.getProperty(
- "test.build.data", "/tmp")).toURI().toString().replace(' ', '+');
+ "test.build.data", "/tmp"), "killjob").toURI().toString().replace(' ',
'+');
private static JobClient jobClient = null;
- private static MiniMRCluster mr = null;
+ static MiniMRCluster mr = null;
private static Path scriptDir = null;
private static String scriptDirName = null;
private static String pid = null;
@@ -70,7 +75,7 @@
conf.setJobName("testkilljobsubprocesses");
conf.setMapperClass(KillingMapperWithChildren.class);
- scriptDir = new Path(TEST_ROOT_DIR + "/script");
+ scriptDir = new Path(TEST_ROOT_DIR , "script");
RunningJob job = runJobAndSetProcessHandle(jt, conf);
// kill the job now
@@ -181,9 +186,8 @@
}
}
LOG.info("pid of map task is " + pid);
-
- // Checking if the map task is alive
- assertTrue(ProcessTree.isAlive(pid));
+ //Checking if the map task is alive
+ assertTrue("Map is no more alive", isAlive(pid));
LOG.info("The map task is alive before Job completion, as expected.");
}
}
@@ -216,7 +220,7 @@
" is " + childPid);
assertTrue("Unexpected: The subprocess at level " + i +
" in the subtree is not alive before Job completion",
- ProcessTree.isAlive(childPid));
+ isAlive(childPid));
}
}
return job;
@@ -250,10 +254,10 @@
" is " + childPid);
assertTrue("Unexpected: The subprocess at level " + i +
" in the subtree is alive after Job completion",
- !ProcessTree.isAlive(childPid));
+ !isAlive(childPid));
}
}
- FileSystem fs = FileSystem.get(conf);
+ FileSystem fs = FileSystem.getLocal(mr.createJobConf());
if(fs.exists(scriptDir)) {
fs.delete(scriptDir, true);
}
@@ -261,10 +265,23 @@
private static RunningJob runJob(JobConf conf) throws IOException {
- final Path inDir = new Path(TEST_ROOT_DIR + "/killjob/input");
- final Path outDir = new Path(TEST_ROOT_DIR + "/killjob/output");
+ final Path inDir;
+ final Path outDir;
+ FileSystem fs = FileSystem.getLocal(conf);
+ FileSystem tempFs = FileSystem.get(conf);
+ //Check if test is run with hdfs or local file system.
+ //if local filesystem then prepend TEST_ROOT_DIR, otherwise
+ //killjob folder would be created in workspace root.
+ if (!tempFs.getUri().toASCIIString().equals(
+ fs.getUri().toASCIIString())) {
+ inDir = new Path("killjob/input");
+ outDir = new Path("killjob/output");
+ } else {
+ inDir = new Path(TEST_ROOT_DIR, "input");
+ outDir = new Path(TEST_ROOT_DIR, "output");
+ }
- FileSystem fs = FileSystem.get(conf);
+
if(fs.exists(scriptDir)) {
fs.delete(scriptDir, true);
}
@@ -290,9 +307,7 @@
// run the TCs
conf = mr.createJobConf();
JobTracker jt = mr.getJobTrackerRunner().getJobTracker();
- runKillingJobAndValidate(jt, conf);
- runFailingJobAndValidate(jt, conf);
- runSuccessfulJobAndValidate(jt, conf);
+ runTests(conf, jt);
} finally {
if (mr != null) {
mr.shutdown();
@@ -300,12 +315,25 @@
}
}
+ void runTests(JobConf conf, JobTracker jt) throws IOException {
+ FileSystem fs = FileSystem.getLocal(mr.createJobConf());
+ Path rootDir = new Path(TEST_ROOT_DIR);
+ if(!fs.exists(rootDir)) {
+ fs.mkdirs(rootDir);
+ }
+ fs.setPermission(rootDir,
+ new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
+ runKillingJobAndValidate(jt, conf);
+ runFailingJobAndValidate(jt, conf);
+ runSuccessfulJobAndValidate(jt, conf);
+ }
+
/**
* Creates signal file
*/
private static void signalTask(String signalFile, JobConf conf) {
try {
- FileSystem fs = FileSystem.get(conf);
+ FileSystem fs = FileSystem.getLocal(conf);
fs.createNewFile(new Path(signalFile));
} catch(IOException e) {
LOG.warn("Unable to create signal file. " + e);
@@ -317,10 +345,12 @@
*/
private static void runChildren(JobConf conf) throws IOException {
if (ProcessTree.isSetsidAvailable) {
- FileSystem fs = FileSystem.get(conf);
+ FileSystem fs = FileSystem.getLocal(conf);
TEST_ROOT_DIR = new Path(conf.get("test.build.data")).toUri().getPath();
- scriptDir = new Path(TEST_ROOT_DIR + "/script");
-
+ scriptDir = new Path(TEST_ROOT_DIR + "/script");
+ if(fs.exists(scriptDir)){
+ fs.delete(scriptDir, true);
+ }
// create shell script
Random rm = new Random();
Path scriptPath = new Path(scriptDir, "_shellScript_" + rm.nextInt()
@@ -329,6 +359,7 @@
String script =
"echo $$ > " + scriptDir.toString() + "/childPidFile" + "$1\n" +
"echo hello\n" +
+ "trap 'echo got SIGTERM' 15 \n" +
"if [ $1 != 0 ]\nthen\n" +
" sh " + shellScript + " $(($1-1))\n" +
"else\n" +
@@ -447,4 +478,46 @@
throw new RuntimeException("failing map");
}
}
+
+ /**
+   * Check whether the process with the passed pid is currently alive
+   * or not.
+ *
+ * @param pid pid of the process
+ * @return if a process is alive or not.
+ */
+ private static boolean isAlive(String pid) throws IOException {
+ String commandString ="ps -o pid,command -e";
+ String args[] = new String[] {"bash", "-c" , commandString};
+ ShellCommandExecutor shExec = new ShellCommandExecutor(args);
+ try {
+ shExec.execute();
+ }catch(ExitCodeException e) {
+ return false;
+ } catch (IOException e) {
+ LOG.warn("IOExecption thrown while checking if process is alive" +
+ StringUtils.stringifyException(e));
+ throw e;
+ }
+
+ String output = shExec.getOutput();
+
+ //Parse the command output and check for pid, ignore the commands
+    //which have ps or grep in them.
+ StringTokenizer strTok = new StringTokenizer(output, "\n");
+ boolean found = false;
+ while(strTok.hasMoreTokens()) {
+ StringTokenizer pidToken = new StringTokenizer(strTok.nextToken(),
+ " ");
+ String pidStr = pidToken.nextToken();
+ String commandStr = pidToken.nextToken();
+ if(pid.equals(pidStr) && !(commandStr.contains("ps")
+ || commandStr.contains("grep"))) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+ }
+
}
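A likely reason for the test's switch from ProcessTree.isAlive() to the local ps-based isAlive() above: when the task runs as a different user, as it does under LinuxTaskController, `kill -0 <pid>` fails with EPERM even though the process is alive, so its exit status no longer distinguishes a dead process from an unsignalable one. The distinction in C terms (a sketch, not part of the patch):

    #include <errno.h>
    #include <signal.h>
    #include <sys/types.h>

    /* Sketch: why a kill(pid, 0) exit status is not a reliable liveness
     * test across users. Returns 1 if the process exists, 0 otherwise. */
    int process_exists(pid_t pid) {
      if (kill(pid, 0) == 0) {
        return 1;                /* exists, and we may signal it */
      }
      if (errno == EPERM) {
        return 1;                /* exists, but owned by another user */
      }
      return 0;                  /* ESRCH: no such process */
    }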
Added: hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcessesWithLinuxTaskController.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcessesWithLinuxTaskController.java?rev=784042&view=auto
==============================================================================
--- hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcessesWithLinuxTaskController.java (added)
+++ hadoop/core/trunk/src/test/mapred/org/apache/hadoop/mapred/TestKillSubProcessesWithLinuxTaskController.java Fri Jun 12 09:09:35 2009
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+/**
+ * Test killing of child processes spawned by the jobs with LinuxTaskController
+ * running the jobs as a user different from the user running the cluster.
+ * See {@link ClusterWithLinuxTaskController}
+ */
+
+public class TestKillSubProcessesWithLinuxTaskController extends
+ ClusterWithLinuxTaskController {
+
+ public void testKillSubProcess() throws Exception{
+ if(!shouldRun()) {
+ return;
+ }
+ startCluster();
+ JobConf myConf = getClusterConf();
+ JobTracker jt = mrCluster.getJobTrackerRunner().getJobTracker();
+
+ TestKillSubProcesses.mr = mrCluster;
+ TestKillSubProcesses sbProc = new TestKillSubProcesses();
+ sbProc.runTests(myConf, jt);
+ }
+}