2007/8/21, David Liontooth <[EMAIL PROTECTED]>:
> Is it possible to use transcode in cluster mode on a beowulf cluster?
> Does anyone here have experience with this?

Yes it is, I did this the other week.

There are two mechanisms supported: PVM and -W ("cluster mode").

There are two documents in the distribution about these two modes.

I found PVM support to be broken in transcode.
It always gives some message about "Unable to send" or something
similar, and a google for the same message says other people find the
same thing.

I ran cluster mode (-W) a couple of weeks ago on 14 nodes of varying speeds.
-W seems to be designed to spread the work between N otherwise idle
nodes of the same speed, whereas mine range from 400MHz to 1.7GHz.
They share a common home filesystem and all have the same OS installation
(Debian with custom transcode).
 I wrote a program "distribute" to spread the work that is sort of
like a generalised distcc. It reads a list of command lines (60 of
them in my example) and sends one to each node via rsh (i.e. ssh), then
when a node finishes its task it is given another one, and so on until
all are done.
  It's all pretty hacky and error-prone at present but you're welcome
to try it. It reduced my encoding time from many hours to 29
minutes.

Enjoy!

> BTW is anyone updating the wiki?
> http://www.transcoding.org/cgi-bin/transcode?Transcode_Wiki shows only
> the April 15, 2007 release.

Get write permission and go for it!

    M

Attachment: tcp
Description: Binary data

/*
 * Distribute jobs to cluster nodes.
 *
 * Reads command lines from stdin, one per line and
 * starts one job on each cluster node.
 * When a node's job terminates, it is given another job to do from the
 * input list, and so on until all jobs have been allocated to nodes.
 * We then wait for all jobs to terminate before exiting.
 *
 * If the environment variable NODES is set, it contains a space-separated
 * list of cluster node host names.
 * Otherwise we get the list of host names from /etc/distcc/hosts
 *
 * Martin Guy <[EMAIL PROTECTED]> 2007-07-27
 */

#include <stdio.h>	/* for NULL */
#include <stdlib.h>	/* for exit(), malloc() */
#include <string.h>	/* for strlen(),  strdup() */
#include <unistd.h>	/* for _exit() and exec*() */
#include <errno.h>	/* for ECHILD */

#include <sys/types.h>	/* for <sys/wait.h> */
#include <sys/wait.h>	/* for wait() and pid_t */

/* Forward declarations of the functions defined later in this file */
static char *get_next_job();	/* Read the next command line from stdin */
void read_node_list();	/* Initialise list of usable nodes */
char *find_free_node();	/* Return the hostname of an idle node */
void start_job_on_node(char *job, char *hostname); /* Start job on named host */
void wait_for_all_jobs(); /* Wait for all jobs to complete */
void zap_newline(char *buf); /* Remove the final newline from a string */

/* Both of these are filled in by main() before any job is started. */
static char *pwd;	/* Current working directory (value of $PWD) */
static char cd_command[256];	/* "cd <pwd> &&", prefixed to each rsh-ed command */

/*
 * Program entry point.
 *
 * Builds the "cd <dir> &&" prefix from $PWD, loads the node list, then
 * hands each job read from standard input to a free node, and finally
 * waits for the outstanding jobs.
 *
 * Exits with status 1 if $PWD is unset or too long to fit in cd_command.
 * Command line arguments are not used.
 */
int
main(int argc, char **argv)
{
	char *job;	/* Command line to run as a remote job */
	int len;	/* Length that the cd prefix needs */

	(void)argc;
	(void)argv;

	/* Set current directory, so as to make remote jobs CD to it */
	/* The lazy way: bash's PWD variable */
	pwd = getenv("PWD");
	if (pwd == NULL) {
		fputs("Cannot get current direcctory from $PWD. Please set it.\n",
			stderr);
		exit(1);
	}

	/* Bounded formatting: $PWD comes from the environment and may be
	 * longer than cd_command; refuse rather than overflow or truncate.
	 * NOTE: the directory name is inserted unquoted, so a path
	 * containing spaces or shell metacharacters will not survive the
	 * remote shell. */
	len = snprintf(cd_command, sizeof(cd_command), "cd %s &&", pwd);
	if (len < 0 || (size_t)len >= sizeof(cd_command)) {
		fputs("Current directory name in $PWD is too long.\n", stderr);
		exit(1);
	}

	read_node_list();
	while ((job = get_next_job()) != NULL) {
		char *node = find_free_node();
		start_job_on_node(job, node);
	}
	wait_for_all_jobs();
	exit(0);
}

/*
 * Read next command line to execute (a line from standard input).
 *
 * Returns a pointer to the line with its trailing newline removed, or
 * NULL at end of input (or on a read error with nothing read).
 * The returned buffer is owned by this function and is overwritten by
 * the next call, so callers must copy it if they need to keep it.
 *
 * Lines of any length up to MAX_SIZE - 1 characters are returned whole:
 * the buffer is grown as needed instead of splitting a long command line
 * into several bogus jobs. A longer line is a fatal error.
 */
static char *
get_next_job()
{
	enum { INITIAL_SIZE = 256, MAX_SIZE = 1 << 20 };
	static char *buf = NULL;	/* line buffer, reused across calls */
	static size_t cap = 0;		/* allocated size of buf */
	size_t len = 0;			/* characters of this line read so far */

	if (buf == NULL) {
		buf = malloc(INITIAL_SIZE);
		if (buf == NULL) {
			fputs("Out of memory\n", stderr);
			exit(1);
		}
		cap = INITIAL_SIZE;
	}

	for (;;) {
		char *bigger;

		if (fgets(buf + len, (int)(cap - len), stdin) == NULL) {
			/* EOF (or error) */
			if (len == 0)
				return NULL;
			/* Input ended right after a buffer-filling chunk:
			 * what we already have is the final line. */
			buf[len] = '\0';
			return buf;
		}
		len += strlen(buf + len);

		if (len > 0 && buf[len - 1] == '\n') {
			/* Complete line: drop the newline */
			buf[len - 1] = '\0';
			return buf;
		}
		if (len + 1 < cap) {
			/* fgets stopped before filling the buffer and without
			 * a newline: last line of input, not terminated. */
			return buf;
		}

		/* Buffer is full and the line continues: enlarge it */
		if (cap >= MAX_SIZE) {
			fputs("Job command line too long\n", stderr);
			exit(1);
		}
		bigger = realloc(buf, cap * 2);
		if (bigger == NULL) {
			fputs("Out of memory\n", stderr);
			exit(1);
		}
		buf = bigger;
		cap *= 2;
	}
}

/* File containing list of cluster nodes, one per line.
 * Lines starting with # are ignored.
 * It is only consulted by read_node_list() when the NODES environment
 * variable is not set.
 */
static char hostfile[] = "/etc/distcc/hosts";

/* Info we need to record about each cluster node.
 * Nodes are kept in a singly linked list in the order they were added.
 * Both strings are heap copies made with strdup().
 */
struct node {
	char *hostname;		/* hostname */
	pid_t pid;		/* PID of local process that spawned a job
				 * onto this node; 0 means none is running */
	char *job;		/* command line running on this host,
				 * or NULL when no job is recorded */
	struct node *next;	/* linked list pointer, NULL at end of list */
};

/* pointer to head of linked list of nodes; NULL while the list is empty */
static struct node *nodes;

/* Forward declarations of the list-maintenance helpers */
static void add_node(char *name);	/* append a host to the list */
static void delete_node(char *hostname);	/* unlink a host from the list */

/*
 * Get the list of usable cluster nodes.
 *
 * First choice is the environment variable NODES, a list of hostnames
 * separated by spaces. If it is not set, the hostnames are read from the
 * host file, one per line, skipping lines that start with '#' and blank
 * lines.
 *
 * Exits with status 1 if the host file is needed but cannot be opened.
 */
void
read_node_list()
{
	FILE *ifp;
	char buf[80];
	char *env_nodes = getenv("NODES");

	if (env_nodes != NULL) {
		char *host;	/* a hostname from the environment variable */
		/* strtok() writes NULs into its argument, so tokenise a
		 * private copy rather than the live environment string. */
		char *copy = strdup(env_nodes);

		if (copy == NULL) {
			fputs("Out of memory\n", stderr);
			exit(1);
		}
		for (host = strtok(copy, " ");
		     host != NULL;
		     host = strtok(NULL, " ")) {
			add_node(host);	/* add_node keeps its own copy */
		}
		free(copy);
		return;
	}

	/* Second choice: read host list from config file */
	ifp = fopen(hostfile, "r");
	if (ifp == NULL) {
		/* Format in one call so that errno is read before any
		 * stdio output has a chance to change it. */
		fprintf(stderr, "Cannot open host file %s: %s\n",
			hostfile, strerror(errno));
		exit(1);
	}

	while (fgets(buf, sizeof(buf), ifp) != NULL) {
		/* Ignore commented lines (and never hand zap_newline
		 * an empty string) */
		if (buf[0] == '#' || buf[0] == '\0') continue;
		zap_newline(buf);
		/* Ignore blank lines: an empty hostname is not a node */
		if (buf[0] == '\0') continue;
		add_node(buf);
	}
	fclose(ifp);
}

/*
 * Append a host to the tail of the cluster node list.
 *
 * The node stores its own copy of the hostname and starts out idle
 * (no pid, no job). Running out of memory is fatal: a message is
 * printed and the program exits with status 1.
 */
static void
add_node(char *hostname)
{
	struct node **tail;	/* the link that will receive the new node */
	struct node *fresh = malloc(sizeof(struct node));

	if (fresh != NULL)
		fresh->hostname = strdup(hostname);
	if (fresh == NULL || fresh->hostname == NULL) {
		fputs("Out of memory\n", stderr);
		exit(1);
	}

	/* A brand new node is idle and terminates the list */
	fresh->pid = 0;
	fresh->job = NULL;
	fresh->next = NULL;

	/* Walk to the first NULL link (the list head when the list is
	 * empty, otherwise the last node's "next") so that hosts stay in
	 * the order in which they were added. */
	for (tail = &nodes; *tail != NULL; tail = &(*tail)->next)
		;
	*tail = fresh;
}

/*
 * Delete a named host from our list of cluster nodes.
 *
 * The first node whose hostname matches is unlinked and all of its
 * memory (hostname, job and the node itself) is released. The hostname
 * argument may be the node's own hostname string: it is not used again
 * after the node has been freed.
 * If no node matches, a diagnostic is printed and the list is unchanged.
 */
static void
delete_node(char *hostname)
{
	struct node **npp;	/* points to the pointer that references the
			node we are thinking of deleting. It either points to
			the "nodes" list head or to the "next" pointer of a
			struct node in the list */

	for (npp = &nodes; *npp != NULL; npp = &((*npp)->next)) {
		struct node *np = *npp;	/* candidate for deletion */

		if (strcmp(np->hostname, hostname) != 0)
			continue;

		/* unlink node from list */
		*npp = np->next;
		/* free old cell's memory, including the strings it owns
		 * (free(NULL) is a no-op, so no guard is needed) */
		free(np->job);
		free(np->hostname);
		free(np);
		return;
	}

	/* Fell off the end of the list: "cannot happen" */
	fprintf(stderr, "Failed to delete host %s.\n", hostname);
}

/*
 * Find an idle node and return its hostname.
 * If none are idle, wait for one to finish what it is doing and return that.
 *
 * The returned string belongs to the node list; do not free it.
 *
 * This function does not return on failure. The program exits if the
 * node list is empty, if wait() fails, if an unknown child is reaped,
 * or if the reaped job did not succeed:
 *   exit status 255 -> reported as host down, exit 1
 *   exit status 127 -> reported as command not found, exit 1
 *   other non-zero  -> reported, exit with the same status
 *   killed by signal -> reported, exit 1
 */
char *
find_free_node()
{
	struct node *np;
	int status;
	pid_t pid;
	const char *job;	/* printable job text for messages */

	if (nodes == NULL) {
		fputs("There are no cluster nodes in our list.\n", stderr);
		exit(1);
	}

	/* First choice: a node that is doing nothing */
	for (np=nodes; np != NULL; np=np->next) {
		if (np->pid == 0) return np->hostname;
	}

	/* Second choice: wait for one to finish */
	pid = wait(&status);
	if (pid <= 0) {
		/* "Cannot happen" */
		perror("Bad return value when waiting for child");
		exit(1);
	}
	/* Find out which host that was and return its name (it's now free) */
	for (np=nodes; np != NULL; np=np->next) {
		if (np->pid == pid) break;
	}
	if (np == NULL) {
		fputs("Error: a child finished that we never spawned\n", stderr);
		exit(1);
	}

	/* Never hand a NULL pointer to %s */
	job = (np->job != NULL) ? np->job : "(unknown job)";

	if (WIFEXITED(status)) {
		int code = WEXITSTATUS(status);	/* the child's exit code */

		switch (code) {
		case 255:	/* Could not connect to host. */
			fprintf(stderr, "Host %s seems down.\n", np->hostname);
			exit(1);

		case 127:	/* Command not found on remote host. Say which host. */
			fprintf(stderr, "Command not found on host %s: %s\n",
				np->hostname, job);
			exit(1);

		case 0:
			/* Command succeeded */
			fprintf(stderr, "Command succeeded on host %s: %s\n",
				np->hostname, job);
			break;

		default:
			/* Command failed. So do we.
			 * Report the exit code, not the raw wait status. */
			fprintf(stderr, "Command failed with status %d on host %s: %s\n",
				code, np->hostname, job);
			exit(code);
		}
	} else if (WIFSIGNALED(status)) {
		/* The local child died from a signal: that is a failure
		 * too, not a success. */
		fprintf(stderr, "Command killed by signal %d on host %s: %s\n",
			WTERMSIG(status), np->hostname, job);
		exit(1);
	}

	/* Nothing running there now... */
	np->pid = 0;
	free(np->job);
	np->job = NULL;

	return np->hostname;
}

/*
 * Start a command line job on a named host.
 *
 * job      - command line to run remotely; a copy is stored in the node
 * hostname - name of a host that must already be in the node list
 *
 * Forks a child that execs /usr/bin/rsh with the hostname, the cd prefix
 * and the job as arguments, and records the child's pid and the job text
 * in the node. Does not wait for the job.
 *
 * Exits with status 1 if the host is unknown, memory runs out or fork()
 * fails. If the exec fails, the child reports why and exits with
 * status 255.
 */
void
start_job_on_node(char *job, char *hostname)
{	
	struct node *np;	/* node for that hostname */
	pid_t pid;
	char *job_copy;		/* the node's own copy of the command line */

	/* Find node with matching hostname */
	for (np = nodes;
	     np != NULL && strcmp(np->hostname, hostname) != 0;
	     np = np->next)
		;

	if (np == NULL) {
		/* Hostname not found: "Cannot happen" */
		fprintf(stderr, "Host %s not found in node search.\n", hostname);
		exit(1);
	}

	/* Copy the command line before forking, so that running out of
	 * memory cannot leave a child running that we have no record of. */
	job_copy = strdup(job);
	if (job_copy == NULL) {
		fputs("Out of memory\n", stderr);
		exit(1);
	}

	fprintf(stderr, "On %s starting %s\n", hostname, job);

	switch (pid = fork()) {
	case -1:
		perror("Cannot fork");
		exit(1);
	case 0:	/* child */
		/* The terminating null pointer of a variadic call must be
		 * cast: a bare NULL may be passed as an int. */
		execl("/usr/bin/rsh", "rsh", hostname, cd_command, job,
		      (char *)NULL);
		/* Only reached if the exec failed */
		perror("Cannot exec /usr/bin/rsh");
		_exit(255);	/* same value the old _exit(-1) produced */
	default: /* parent */
		np->pid = pid;
		/* Remember the command line in case we have to reschedule it */
		free(np->job);
		np->job = job_copy;
	}
}

void
wait_for_all_jobs()
{
	int status;

	fprintf(stderr, "Waiting for jobs to finish...\n");

	while (wait(&status) != -1)
		;
	if (errno != ECHILD) {
		perror("Waiting for all jobs");
		exit(1);
	}
}

/*
 * Utility functions
 */

/*
 * Replace a final newline, if any, with a nul.
 *
 * buf must point to a nul-terminated string, which is modified in place.
 * Only the last character is examined; an empty string is left alone,
 * so the byte in front of the buffer is never touched.
 */
void
zap_newline(char *buf)
{
	size_t len = strlen(buf);

	if (len > 0 && buf[len - 1] == '\n') {
		buf[len - 1] = '\0';
	}
}

Reply via email to