I only did my tests on 9vx. I have a version that I instrumented with
a circular log buffer, and I have some gdb macros which dumps the
buffer.

I can put the whole source somewhere and even a log with my comments
of the bug if you want to see it. But please note that I made several
changes (because I had to understand how it works) and I would rather
copy my changes to the latest 9vx source tree so that everyone can
read it. What do you think ?

At the end of this post I added some part of the instruments.

Please, I would like to insist on the fact I'm not saying the promela
model is wrong. And I realize that the fix I propose might not be the
good one. Maybe the problem is even elsewhere. All these is just
feelings, logs and headache.

Phil;


The logs look like this :

(gdb) k9log 10
- kernel history size: 524288
         element size: 64
        element count: 8192
  History stack: 56300 elems:
dbeb: m= 3 pc=44e235 sp= 51f02020 up=0 :0 xp=0 :0 r= 0 a= 0 # kproc: runproc: psleep dbea: m= 3 pc=44e235 sp= 51f02020 up=0 :0 xp=0 :0 r= 0 a= 0 # kproc: runproc: search dbe9: m= 3 pc=44e235 sp= 51f02020 up=0 :0 xp=0 :0 r= 0 a= 0 # kproc: runproc dbe8: m= 3 pc=44dfb2 sp= 51f02070 up=0 :0 xp=4 :8 r= 0 a= 0 # proc: sched: calling runproc dbe7: m= 3 pc=44dfb2 sp= 51f02070 up=0 :0 xp=4 :8 r= 0 a= 0 # proc: sched dbe6: m= 3 pc=40e41b sp= 51f020b0 up=4 :8 xp=0 :0 r= 0 a= 0 # proc: schedinit: disable up dbe5: m= 3 pc=40e41b sp= 51f020b0 up=4 :8 xp=0 :0 r= 983360 a= 0 # proc: sleep: unlock r dbe4: m= 3 pc=40e41b sp= 51f020b0 up=4 :8 xp=0 :0 r= 983360 a= 0 # proc: sleep: unlock up dbe3: m= 3 pc=40e41b sp= 51f020b0 up=4 :8 xp=0 :0 r= 0 a= 0 # proc: schedinit: up still active dbe2: m= 3 pc=40e41b sp= 51f020b0 up=4 :8 xp=0 :0 r= 0 a= 0 # proc: schedinit: start

where the first column is the serial of the logged operation, and 'a'
just a int.

-------------------------------------------------------------------------------------
void
sleep(Rendez *r, int (*ftest)(void*), void *arg)
{
   int s;
   void (*pt)(Proc*, int, vlong);
   void * pc;

   pc = getcallerpc(&r);
   k9log("proc: sleep", pc, 0, r, 0);

   s = splhi();

   if (up == nil)
       dbgbreak();

   k9log("proc: sleep: lock r", pc, 0, r, 0);

   lock(&r->xlk);

   if(r->xp){
       k9log("proc: sleep: ** double sleep **", pc, r->xp, r, 0);
       panic("cpu%d: up=%d *double sleep* r=%p r->p=%d caller=%p\n",
             m->machno, up ? up->pid : 0, r, r->xp ? r->xp->pid : 0, pc);
   }

   /*
    *  Wakeup only knows there may be something to do by testing
    *  r->p in order to get something to lock on.
    *  Flush that information out to memory in case the sleep is
    *  committed.
    */
   r->xp = up;
   r->xm = m;
   r->xpc = pc;

   k9log("proc: sleep: lock up", pc, 0, r, 0);

   lock(&up->rlock);

//    if (up->state != Running)
//        dbgbreak();
k9log("proc: sleep: condition", pc, 0, r, up->nlocks.ref);

   if ((*ftest)(arg)) {
       k9log("proc: sleep: happened", pc, 0, r, 0);
done:
       /*
        *  if condition happened or a note is pending
        *  never mind
        */
       r->xp = nil;

       k9log("proc: sleep: unlock up", pc, 0, r, 0);

       unlock(&up->rlock);

       k9log("proc: sleep: unlock r", pc, 0, r, 0);

       unlock(&r->xlk);
   }
       else if (up->notepending) {
       k9log("proc: sleep: note pending", pc, 0, r, 0);
       goto done;
   }
   else {
       /*
        *  now we are committed to
        *  change state and call scheduler
        */
       pt = proctrace;
       if(pt)
           pt(up, SSleep, 0);
       up->state = Wakeme;
       up->rx = r;

       /* statistics */
       m->cs++;

       k9log("proc: sleep: sleeping", pc, 0, r, 0);

       procsave(up);
       if(setlabel(&up->sched)) {
           /*
            *  here when the process is awakened
            */
           k9log("proc: sleep: awakened", pc, r->xp, r, 0);

           procrestore(up);
           spllo();
       } else {
           /*
            *  here to go to sleep (i.e. stop Running)
            */

//            up->rmu = 1;

//            k9log("proc: sleep: unlock up", pc, 0, r, 0);
// unlock(&up->rlock); // k9log("proc: sleep: unlock r", pc, 0, r, 0); // unlock(&r->xlk);

           k9log("proc: sleep: going sched", pc, 0, r, 0);

           gotolabel(&m->sched);
       }
   }

   k9log("proc: sleep: done", pc, 0, r, 0);

   if(up->notepending) {
       k9log("proc: sleep: forward note", pc, 0, r, 0);

       up->notepending = 0;
       splx(s);
       if(up->procctl == Proc_exitme && up->closingfgrp)
           forceclosefgrp();
       error(Eintr);
   }

   splx(s);
}



struct {
   int32 count;
   struct {
       char *         op;
       void *         pc;
       void *        sp;
       int         p0;
       int        s0;
       int         p1;
       int        s1;
       int        xm;    // machno
       Rendez *     r;
       int32        a;
       int32         n;
   } hist[8192];
} k9logs;


void
k9log(char * op, void * pc, Proc * p, void * r, int32 a)
{
   static int32 serial = 0;
   int32 n, i;

   n = __sync_fetch_and_add_4(&serial, 1);
   i = __sync_fetch_and_add_4(&k9logs.count, 1) % nelem(k9logs.hist);

   k9logs.hist[i].n = n;
   k9logs.hist[i].r = r;
   k9logs.hist[i].a = a;
   k9logs.hist[i].xm = getmach()->machno;
   k9logs.hist[i].op = op;
   k9logs.hist[i].pc = pc;
   k9logs.hist[i].sp = &p;
   if (up) {
       k9logs.hist[i].p0 = up->pid;
       k9logs.hist[i].s0 = up->state;
   }
   else {
       k9logs.hist[i].p0 = 0;
       k9logs.hist[i].s0 = 0;
   }
   if (p) {
       k9logs.hist[i].p1 = p->pid;
       k9logs.hist[i].s1 = p->state;
   }
   else {
       k9logs.hist[i].p1 = 0;
       k9logs.hist[i].s1 = 0;
   }
}

Bakul Shah wrote:
On Fri, 11 Jun 2010 16:59:42 +0200 Philippe Anel <x...@bouyapop.org>  wrote:
Ooops I forgot to answer this :
- does changing spl* to manipulation of a per-cpu lock solve the problem?
sometimes preventing anything else from running on your mach is
exactly what you want.
No ... I don't think so. I think the problem comes from the fact the
process is no longer exclusively tied to the current Mach when going
(back) to schedinit() ... hence the change I did.

Were you able to verify your hypothesis by adding a bit of
trapping code + assertion(s) in the original sources?  At the
point of double sleep one can check state to see if the
expected preconditions are true.  Alternatively one can check
when the expected conditions become true, set a variable and
test it where the double sleep print occurs.  Then one can sort
of walk back to the earliest point where things go wrong.




Reply via email to