I just managed to create this infamous bug pattern once again:

function()
{
        stall_topmost_domain();
        ...
        if (condition)
                return;
        ...
        unstall_topmost_domain();
}

The result is often a locked-up system: specifically, the root domain no
longer receives IRQs. Unless you find the bug quickly by code
inspection, debugging and instrumenting can take quite some time.

To catch such issues earlier, I therefore propose the following
extension of ipipe_check_context. It is based on the assumption that the
topmost domain should never be stalled when a lower domain executes that
check. Only the topmost domain's stall flag is tested, so the check does
not trip over Xenomai's IRQ shield (a mid-prio domain that intentionally
blocks Linux IRQs).

This is how this patch sees my bug:

I-pipe: Detected stalled topmost domain, probably caused by a bug.
        A critical section may have been left unterminated.
Pid: 4483, comm: cyclictest Tainted: G        W 2.6.26.2-xeno_64 #55

Call Trace:
 [<ffffffff8026b61b>] ipipe_check_context+0x11e/0x128
 [<ffffffff80474849>] down_write+0x1d/0x2e
 [<ffffffff802c9686>] ipipe_disable_ondemand_mappings+0x41/0x3b3
 [<ffffffff8021ed3c>] ? mcount+0x4c/0x72
 [<ffffffff80283b93>] xnshadow_map+0x65/0x2a6
 [<ffffffff8021ed3c>] ? mcount+0x4c/0x72
 [<ffffffff802b5bc5>] __pthread_setschedparam+0xd3/0x3b9
 [<ffffffff80283840>] losyscall_event+0x11f/0x1ee
 [<ffffffff80283721>] ? losyscall_event+0x0/0x1ee
 [<ffffffff8026c6ad>] __ipipe_dispatch_event+0x127/0x255
 [<ffffffff8021e922>] __ipipe_syscall_root+0xa2/0x194
 [<ffffffff8047555a>] __ipipe_syscall_root_thunk+0x35/0x6a
 [<ffffffff8020c034>] ? system_call_after_swapgs+0x54/0x94

I-pipe tracer log (100 points):
 |  *+func                    0 ipipe_trace_panic_freeze+0xe (ipipe_check_context+0xab)
 |  *+func                    0 find_next_bit+0x9 (__next_cpu+0x1e)
 |  *+func                    0 __next_cpu+0x9 (ipipe_check_context+0x9f)
 |  *+func                   -1 find_first_bit+0x9 (__first_cpu+0x13)
 |  *+func                   -1 __first_cpu+0x9 (ipipe_check_context+0x79)
 |  *+func                   -1 ipipe_check_context+0xc (down_write+0x1d)
 |  *+func                   -1 down_write+0xe (ipipe_disable_ondemand_mappings+0x41)
 |  *+func                   -2 _spin_lock+0x9 (get_task_mm+0x1d)
 |  *+func                   -2 get_task_mm+0xe (ipipe_disable_ondemand_mappings+0x1e)
 |  *+func                   -3 ipipe_disable_ondemand_mappings+0x16 (xnshadow_map+0x65)
 |  *+func                   -3 xnshadow_map+0x12 (__pthread_setschedparam+0xd3)
 |  *+func                   -4 xnsynch_init+0x9 (xnregistry_enter+0xf8)
 |   +begin   0x80000000     -5 xnregistry_enter+0x5b (pthread_create+0x321)
     +func                   -5 strchr+0x9 (xnregistry_enter+0x40)
     +func                   -6 xnregistry_enter+0x16 (pthread_create+0x321)
 |   +end     0x80000000     -6 __ipipe_restore_pipeline_head+0xea (pthread_create+0x309)
 |  *+func                   -6 __ipipe_restore_pipeline_head+0xe (pthread_create+0x309)
 |  *+func                   -7 ppd_lookup_inner+0xe (xnshadow_ppd_get+0x5d)
 |  *+func                   -8 xnshadow_ppd_get+0xd (pthread_create+0x28c)
 |   +begin   0x80000000     -8 pthread_create+0x227 (__pthread_setschedparam+0xb6)

(xnregistry_enter is left with nklock still held. Fix committed.)

Jan

---
 kernel/ipipe/core.c |   20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

Index: b/kernel/ipipe/core.c
===================================================================
--- a/kernel/ipipe/core.c
+++ b/kernel/ipipe/core.c
@@ -1567,7 +1567,9 @@ void ipipe_check_context(struct ipipe_do
        /* Note: We don't make the per_cpu access atomic. We assume that code
           which temporarily disables the check does this in atomic context
           only. */
-       if (likely(ipipe_current_domain->priority <= border_ipd->priority) ||
+       if (likely(ipipe_current_domain->priority <= border_ipd->priority &&
+                  !test_bit(IPIPE_STALL_FLAG,
+                            &ipipe_head_cpudom_var(status))) ||
            !per_cpu(ipipe_percpu_context_check, ipipe_processor_id()))
                return;
 
@@ -1575,10 +1577,18 @@ void ipipe_check_context(struct ipipe_do
 
        ipipe_trace_panic_freeze();
        ipipe_set_printk_sync(ipipe_current_domain);
-       printk(KERN_ERR "I-pipe: Detected illicit call from domain '%s'\n"
-              KERN_ERR "        into a service reserved for domain '%s' and "
-                       "below.\n",
-              ipipe_current_domain->name, border_ipd->name);
+
+       if (ipipe_current_domain->priority > border_ipd->priority)
+               printk(KERN_ERR "I-pipe: Detected illicit call from domain "
+                               "'%s'\n"
+                      KERN_ERR "        into a service reserved for domain "
+                               "'%s' and below.\n",
+                      ipipe_current_domain->name, border_ipd->name);
+       else
+               printk(KERN_ERR "I-pipe: Detected stalled topmost domain, "
+                               "probably caused by a bug.\n"
+                               "        A critical section may have been "
+                               "left unterminated.\n");
        dump_stack();
        ipipe_trace_panic_dump();
 }

_______________________________________________
Adeos-main mailing list
Adeos-main@gna.org
https://mail.gna.org/listinfo/adeos-main

Reply via email to