Once upon a time xdm was created. This program manages X terminals by forking 1 or 2 processes per display:
1. a local server 'X :0' if required, and 2. the session manager greeter '-:0'. The dm.c WaitForChild() loop interpretes the session exit code and uses a ++startTries>=startAttempts test to detect an unresponsive server, OPENFAILED_DISPLAY, or a SIGTERM'ed session. Other exit codes reset startTries = 0. All worked well until the ps2 mouse appeared. Such input device is initialized last and after the X listening socket. Thus the session manager finds the X server, starts the greeter, and waits for its return. Alas, if the mouse now fails initialization then X quits and triggers session.c IOErrorHandler() { exit(RESERVER_DISPLAY); }. The main loop would then call RestartDisplay(d, TRUE) causing a persistent restart-quit cycle which inhibits any user login. Somewhere before 2007 this was solved by the crash detect code in dm.c at the case RESERVER_DISPLAY. A manager exit is considered a crash if a previous exit occured within the XDM_BROKEN_INTERVAL of 120 second. RemoveDisplay() is then called to remove the display from the list. This isn't a perfect solution since crash detection may also be triggered by the ctrl-alt-backspace server termination. Although a bit an unfriendly way of quitting X this is the only option available in case of a garbled display. The proposed patch http://bugs.freedesktop.org/show_bug.cgi?id=20546 adds a ++reservTries>=reservAttempts test to allow for a few successive crash-type manager exits. The resource reservAttempts is the number of times a fatal IO error is allowed in session.c. The patch also substitutes the RestartDisplay() RemoveDisplay() sequence by StopDisplay(). The original sequence works since RestartDisplay(d, TRUE) kills the serverPid. However RemoveDisplay() directly removes the display from the list which may thus give an "Unknown child termination" error. StopDisplay() on the other hand implements the synchronous kill and wait for child exit. RemoveDisplay() is only to be called if neither server nor session manager is running. The patch is included below. I was wondering if someone here would be willing to inspect and then perhaps to support the patch for inclusion into app/xdm. If there are questions just ask and I will further describe the underlying logic. Signed-off-by: Servaas Vandenberghe diff --git a/dm.c b/dm.c index da18800..c0b0031 100644 --- a/dm.c +++ b/dm.c @@ -492,6 +492,7 @@ #endif break; case OBEYSESS_DISPLAY: d->startTries = 0; + d->reservTries = 0; Debug ("Display exited with OBEYSESS_DISPLAY\n"); if (d->displayType.lifetime != Permanent || d->status == zombie) @@ -528,24 +529,44 @@ #endif Debug ("Display exited with RESERVER_DISPLAY\n"); if (d->displayType.origin == FromXDMCP || d->status == zombie) StopDisplay(d); - else - RestartDisplay (d, TRUE); - { - Time_t Time; - time(&Time); - Debug("time %i %i\n",Time,d->lastCrash); - if (d->lastCrash && - ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) { - Debug("Server crash frequency too high:" - " removing display %s\n",d->name); - LogError("Server crash rate too high:" - " removing display %s\n",d->name); + else { + Time_t now; + int crash; + + time(&now); + Debug("time %i %i try %i of %i\n", now, d->lastReserv, + d->reservTries, d->reservAttempts); + crash = d->lastReserv && + ((now - d->lastReserv) < XDM_BROKEN_INTERVAL); + + if (!crash) + d->reservTries = 0; + + if (crash && ++d->reservTries >= d->reservAttempts) { + char msg[] = "Server crash frequency too high:" + " stopping display"; + Debug("%s %s\n", msg, d->name); + LogError("%s %s\n", msg, d->name); #if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM) AddTimerEntropy(); #endif - RemoveDisplay (d); + /* For a local X server either: + * 1. The server exit was returned by waitpid(). So + * serverPid==-1 => StopDisplay() calls RemoveDisplay() + * + * 2. The server is a zombie or still running. So + * serverPid>1 => StopDisplay() + * a. sets status=zombie, + * b. kills the server. + * The next waitpid() returns this zombie server pid + * and the 'case zombie:' below then calls + * RemoveDisplay(). + */ + StopDisplay(d); } else - d->lastCrash = Time; + RestartDisplay(d, TRUE); + + d->lastReserv = now; } break; case waitCompose (SIGTERM,0,0): diff --git a/dm.h b/dm.h index af50328..ccf0bd8 100644 --- a/dm.h +++ b/dm.h @@ -177,7 +177,8 @@ struct display { pid_t serverPid; /* process id of server (-1 if none) */ FileState state; /* state during HUP processing */ int startTries; /* current start try */ - Time_t lastCrash; /* time of last crash */ + Time_t lastReserv; /* time of last reserver crash */ + int reservTries; /* current reserver try */ # ifdef XDMCP /* XDMCP state */ CARD32 sessionID; /* ID of active session */ @@ -197,6 +198,7 @@ # endif int openRepeat; /* open attempts to make */ int openTimeout; /* abort open attempt timeout */ int startAttempts; /* number of attempts at starting */ + int reservAttempts; /* allowed fatal errors after start */ int pingInterval; /* interval between XSync */ int pingTimeout; /* timeout for XSync */ int terminateServer;/* restart for each session */ diff --git a/dpylist.c b/dpylist.c index 6da882f..dccd679 100644 --- a/dpylist.c +++ b/dpylist.c @@ -241,7 +241,9 @@ NewDisplay (char *name, char *class) d->openTimeout = 0; d->startAttempts = 0; d->startTries = 0; - d->lastCrash = 0; + d->lastReserv = 0; + d->reservAttempts = 0; + d->reservTries = 0; d->terminateServer = 0; d->grabTimeout = 0; #ifdef XDMCP diff --git a/resource.c b/resource.c index 5c02da7..a06d742 100644 --- a/resource.c +++ b/resource.c @@ -222,6 +222,8 @@ struct displayResource serverResources[] "120" }, { "startAttempts","StartAttempts",DM_INT, boffset(startAttempts), "4" }, +{ "reservAttempts","ReservAttempts",DM_INT, boffset(reservAttempts), + "2" }, { "pingInterval","PingInterval",DM_INT, boffset(pingInterval), "5" }, { "pingTimeout","PingTimeout", DM_INT, boffset(pingTimeout), diff --git a/xdm.man.cpp b/xdm.man.cpp index 220580b..d393598 100644 --- a/xdm.man.cpp +++ b/xdm.man.cpp @@ -445,6 +445,7 @@ See the section .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP" .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP" .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP" +.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP" These numeric resources control the behavior of .I xdm when attempting to open intransigent servers. \fBopenDelay\fP is @@ -463,9 +464,10 @@ This process is repeated \fBstartAttempts\fP times, at which point the display is declared dead and disabled. Although this behavior may seem arbitrary, it has been empirically developed and -works quite well on most systems. The default values are -5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and -4 for \fBstartAttempts\fP. +works quite well on most systems. \fBreservAttempts\fP is the number of +times a fatal error is allowed after connecting. The default values are +\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120, +\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2. .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP" .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP" To discover when remote displays disappear, _______________________________________________ xorg-devel@lists.x.org: X.Org development Archives: http://lists.x.org/archives/xorg-devel Info: http://lists.x.org/mailman/listinfo/xorg-devel