Once upon a time xdm was created.  This program manages X terminals by 
forking 1 or 2 processes per display:

  1. a local server 'X :0' if required, and 
  2. the session manager greeter '-:0'.

The dm.c WaitForChild() loop interpretes the session exit code and 
uses a ++startTries>=startAttempts test to detect an unresponsive 
server, OPENFAILED_DISPLAY, or a SIGTERM'ed session.  Other exit codes 
reset startTries = 0.  

All worked well until the ps2 mouse appeared.  Such input device is 
initialized last and after the X listening socket.  Thus the session 
manager finds the X server, starts the greeter, and waits for its 
return.  Alas, if the mouse now fails initialization then X quits and 
triggers session.c IOErrorHandler() { exit(RESERVER_DISPLAY); }.  
The main loop would then call RestartDisplay(d, TRUE) causing a 
persistent restart-quit cycle which inhibits any user login.  

Somewhere before 2007 this was solved by the crash detect code in dm.c 
at the case RESERVER_DISPLAY.  A manager exit is considered a crash if 
a previous exit occured within the XDM_BROKEN_INTERVAL of 120 second.  
RemoveDisplay() is then called to remove the display from the list.  

This isn't a perfect solution since crash detection may also be 
triggered by the ctrl-alt-backspace server termination.  Although a 
bit an unfriendly way of quitting X this is the only option available 
in case of a garbled display.  

The proposed patch http://bugs.freedesktop.org/show_bug.cgi?id=20546
adds a ++reservTries>=reservAttempts test to allow for a few successive 
crash-type manager exits.  The resource reservAttempts is the number 
of times a fatal IO error is allowed in session.c.  

The patch also substitutes the RestartDisplay() RemoveDisplay() 
sequence by StopDisplay().  The original sequence works since 
RestartDisplay(d, TRUE) kills the serverPid.  However RemoveDisplay() 
directly removes the display from the list which may thus give an 
"Unknown child termination" error.  StopDisplay() on the other hand 
implements the synchronous kill and wait for child exit.  
RemoveDisplay() is only to be called if neither server nor session 
manager is running.  

The patch is included below.  I was wondering if someone here would be 
willing to inspect and then perhaps to support the patch for inclusion 
into app/xdm.  If there are questions just ask and I will further 
describe the underlying logic.  

Signed-off-by: Servaas Vandenberghe

diff --git a/dm.c b/dm.c
index da18800..c0b0031 100644
--- a/dm.c
+++ b/dm.c
@@ -492,6 +492,7 @@ #endif
                break;
            case OBEYSESS_DISPLAY:
                d->startTries = 0;
+               d->reservTries = 0;
                Debug ("Display exited with OBEYSESS_DISPLAY\n");
                if (d->displayType.lifetime != Permanent ||
                    d->status == zombie)
@@ -528,24 +529,44 @@ #endif
                Debug ("Display exited with RESERVER_DISPLAY\n");
                if (d->displayType.origin == FromXDMCP || d->status == zombie)
                    StopDisplay(d);
-               else
-                   RestartDisplay (d, TRUE);
-               {
-                 Time_t Time;
-                 time(&Time);
-                 Debug("time %i %i\n",Time,d->lastCrash);
-                 if (d->lastCrash &&
-                     ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) {
-                   Debug("Server crash frequency too high:"
-                         " removing display %s\n",d->name);
-                   LogError("Server crash rate too high:"
-                            " removing display %s\n",d->name);
+               else {
+                 Time_t now;
+                 int crash;
+
+                 time(&now);
+                 Debug("time %i %i try %i of %i\n", now, d->lastReserv,
+                       d->reservTries, d->reservAttempts);
+                 crash = d->lastReserv &&
+                   ((now - d->lastReserv) < XDM_BROKEN_INTERVAL);
+
+                 if (!crash)
+                   d->reservTries = 0;
+
+                 if (crash && ++d->reservTries >= d->reservAttempts) {
+                   char msg[] = "Server crash frequency too high:"
+                     " stopping display";
+                   Debug("%s %s\n", msg, d->name);
+                   LogError("%s %s\n", msg, d->name);
 #if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM)
                    AddTimerEntropy();
 #endif
-                   RemoveDisplay (d);
+                   /* For a local X server either:
+                    * 1. The server exit was returned by waitpid().  So
+                    *    serverPid==-1 => StopDisplay() calls RemoveDisplay()
+                    *
+                    * 2. The server is a zombie or still running.  So
+                    *    serverPid>1 => StopDisplay()
+                    *                   a. sets status=zombie,
+                    *                   b. kills the server.
+                    *    The next waitpid() returns this zombie server pid
+                    *    and the 'case zombie:' below then calls 
+                    *    RemoveDisplay().
+                    */
+                   StopDisplay(d);
                  } else
-                   d->lastCrash = Time;
+                   RestartDisplay(d, TRUE);
+
+                 d->lastReserv = now;
                }
                break;
            case waitCompose (SIGTERM,0,0):
diff --git a/dm.h b/dm.h
index af50328..ccf0bd8 100644
--- a/dm.h
+++ b/dm.h
@@ -177,7 +177,8 @@ struct display {
        pid_t           serverPid;      /* process id of server (-1 if none) */
        FileState       state;          /* state during HUP processing */
        int             startTries;     /* current start try */
-        Time_t          lastCrash;      /* time of last crash */
+       Time_t          lastReserv;     /* time of last reserver crash */
+       int             reservTries;    /* current reserver try */
 # ifdef XDMCP
        /* XDMCP state */
        CARD32          sessionID;      /* ID of active session */
@@ -197,6 +198,7 @@ # endif
        int             openRepeat;     /* open attempts to make */
        int             openTimeout;    /* abort open attempt timeout */
        int             startAttempts;  /* number of attempts at starting */
+       int             reservAttempts; /* allowed fatal errors after start */
        int             pingInterval;   /* interval between XSync */
        int             pingTimeout;    /* timeout for XSync */
        int             terminateServer;/* restart for each session */
diff --git a/dpylist.c b/dpylist.c
index 6da882f..dccd679 100644
--- a/dpylist.c
+++ b/dpylist.c
@@ -241,7 +241,9 @@ NewDisplay (char *name, char *class)
     d->openTimeout = 0;
     d->startAttempts = 0;
     d->startTries = 0;
-    d->lastCrash = 0;
+    d->lastReserv = 0;
+    d->reservAttempts = 0;
+    d->reservTries = 0;
     d->terminateServer = 0;
     d->grabTimeout = 0;
 #ifdef XDMCP
diff --git a/resource.c b/resource.c
index 5c02da7..a06d742 100644
--- a/resource.c
+++ b/resource.c
@@ -222,6 +222,8 @@ struct displayResource serverResources[]
                                "120" },
 { "startAttempts","StartAttempts",DM_INT,      boffset(startAttempts),
                                "4" },
+{ "reservAttempts","ReservAttempts",DM_INT,    boffset(reservAttempts),
+                               "2" },
 { "pingInterval","PingInterval",DM_INT,                boffset(pingInterval),
                                "5" },
 { "pingTimeout","PingTimeout", DM_INT,         boffset(pingTimeout),
diff --git a/xdm.man.cpp b/xdm.man.cpp
index 220580b..d393598 100644
--- a/xdm.man.cpp
+++ b/xdm.man.cpp
@@ -445,6 +445,7 @@ See the section
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP"
+.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP"
 These numeric resources control the behavior of
 .I xdm
 when attempting to open intransigent servers.  \fBopenDelay\fP is
@@ -463,9 +464,10 @@ This
 process is repeated \fBstartAttempts\fP times, at which point the display is
 declared dead and disabled.  Although
 this behavior may seem arbitrary, it has been empirically developed and
-works quite well on most systems.  The default values are
-5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and
-4 for \fBstartAttempts\fP.
+works quite well on most systems.  \fBreservAttempts\fP is the number of 
+times a fatal error is allowed after connecting.  The default values are
+\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120, 
+\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2.
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP"
 To discover when remote displays disappear,
_______________________________________________
xorg-devel@lists.x.org: X.Org development
Archives: http://lists.x.org/archives/xorg-devel
Info: http://lists.x.org/mailman/listinfo/xorg-devel

Reply via email to