On Thu, Feb 07 2008, Ingo Molnar wrote:
> 
> * Jens Axboe <[EMAIL PROTECTED]> wrote:
> 
> > > pick up the threaded softirq patches from -rt, those move all 
> > > softirq processing into kernel threads. I'd suggest extending those 
> > > via wakeup-from-remote functionality - it fits the construct quite 
> > > naturally. You should also be able to directly observe any 
> > > performance impact of threaded softirq handlers. (and if you find 
> > > any, let me know so that we can make it faster :-)
> > 
> > I was just considering that, since I knew -rt moved the softirqs into 
> > threads. I'll look into it, but may not post anything until after my 
> > vacation.
> 
> we should more seriously investigate kernel thread scalability for 
> another reason as well: besides -rt, any generic async IO facility we 
> pick up will likely heavily rely on them. Kernel thread scheduling is 
> quite a bit lighter than user task scheduling [no TLB flushes, etc.] - 
> and if it is still not good enough we could probably accelerate them 
> some more. (and everyone will benefit) irq-context softirqs, on the 
> other hand, are quite rigid and bring in many atomicity assumptions, 
> so they are not as natural to program for.

Here's a patch on top of the other patchset that switches blk-softirq.c
to using kernel threads instead. It's pretty simple. Boot tested,
works for me :-)
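
The guts of it is just a per-cpu list plus a dedicated thread that gets
woken on the empty->nonempty transition. Stripped of the kernel
details, the pattern looks roughly like the below. This is a plain
userspace sketch, with a pthread condvar standing in for the
kthread/wake_up_process() machinery and all names made up for
illustration; the real thing is the diff that follows.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct item {
        struct item *next;
        int id;
};

struct comp {
        pthread_mutex_t lock;
        pthread_cond_t wake;    /* stands in for wake_up_process() */
        struct item *list;
};

/* consumer: splice the whole list under the lock, complete it unlocked */
static void *comp_thread(void *data)
{
        struct comp *c = data;

        pthread_mutex_lock(&c->lock);
        for (;;) {
                struct item *local;

                while (!c->list)
                        pthread_cond_wait(&c->wake, &c->lock);
                local = c->list;
                c->list = NULL;
                pthread_mutex_unlock(&c->lock);

                while (local) {
                        struct item *it = local;

                        local = it->next;
                        printf("completed %d\n", it->id);
                        free(it);
                }
                pthread_mutex_lock(&c->lock);
        }
        return NULL;
}

/* producer: queue one item, wake the thread only if the list was empty */
static void complete_item(struct comp *c, int id)
{
        struct item *it = malloc(sizeof(*it));
        int was_empty;

        it->id = id;
        pthread_mutex_lock(&c->lock);
        was_empty = !c->list;
        it->next = c->list;
        c->list = it;
        pthread_mutex_unlock(&c->lock);
        if (was_empty)
                pthread_cond_signal(&c->wake);
}

int main(void)
{
        struct comp c = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .wake = PTHREAD_COND_INITIALIZER,
        };
        pthread_t thread;
        int i;

        pthread_create(&thread, NULL, comp_thread, &c);
        for (i = 0; i < 8; i++)
                complete_item(&c, i);
        sleep(1);
        return 0;
}

(The sketch pushes LIFO for brevity, the real code uses list_add_tail()
for FIFO completion order.)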

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 5ae8345..73c71c7 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -7,6 +7,8 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
 #include <linux/cpu.h>
 
 #include "blk.h"
@@ -14,41 +16,136 @@
 struct blk_comp {
        spinlock_t lock;
        struct list_head list;
-       int dead;
+       struct task_struct *task;
 };
-static DEFINE_PER_CPU(struct blk_comp, blk_cpu_done);
+static DEFINE_PER_CPU(struct blk_comp, blk_irq_data);
+
+static void raise_blk_irq(int cpu)
+{
+       struct blk_comp *bc = &per_cpu(blk_irq_data, cpu);
+
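+       /* a no-op if the thread is already awake and chewing on the list */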
+       wake_up_process(bc->task);
+}
+
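+/* complete all requests on a privately spliced list, no locking needed */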
+static void blk_done_softirq(struct list_head *list)
+{
+       while (!list_empty(list)) {
+               struct request *rq;
+
+               rq = list_entry(list->next, struct request, donelist);
+               list_del_init(&rq->donelist);
+               rq->q->softirq_done_fn(rq);
+       }
+}
+
+static int blk_irq_thread(void *data)
+{
+       struct blk_comp *bc = data;
+       struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+
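+       /* give the completion thread a mid-range RT priority */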
+       sched_setscheduler(current, SCHED_FIFO, &param);
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       while (!kthread_should_stop()) {
+               LIST_HEAD(local_list);
+
+               preempt_disable();
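+               /*
+                * barrier against the producer: a list_add done before
+                * the wake_up_process() in raise_blk_irq() must be
+                * visible here, or we would sleep on pending work
+                */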
+               smp_mb();
+               if (list_empty(&bc->list)) {
+                       preempt_enable_no_resched();
+                       schedule();
+                       preempt_disable();
+               }
+
+               __set_current_state(TASK_RUNNING);
+               smp_mb();
+               while (!list_empty(&bc->list)) {
+                       spin_lock_irq(&bc->lock);
+                       list_replace_init(&bc->list, &local_list);
+                       spin_unlock_irq(&bc->lock);
+
+                       blk_done_softirq(&local_list);
+               }
+               preempt_enable();
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+
+       __set_current_state(TASK_RUNNING);
+       return 0;
+}
+
+static int create_blk_irq_thread(int cpu)
+{
+       struct blk_comp *bc = &per_cpu(blk_irq_data, cpu);
+       struct task_struct *task;
+
+       spin_lock_init(&bc->lock);
+       INIT_LIST_HEAD(&bc->list);
+       bc->task = NULL;
+
+       task = kthread_create(blk_irq_thread, bc, "blk-irq-%d", cpu);
+       if (IS_ERR(task)) {
+               printk(KERN_ERR "block: irq thread creation failed\n");
+               return 1;
+       }
+
+       kthread_bind(task, cpu);
+       bc->task = task;
+       return 0;
+}
 
 static int __cpuinit blk_cpu_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
 {
        int cpu = (unsigned long) hcpu;
+       struct task_struct *task;
        struct blk_comp *bc;
 
        switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               bc = &per_cpu(blk_irq_data, cpu);
+               if (bc->task) {
+                       printk(KERN_WARNING "blk: task for CPU %d already there\n", cpu);
+                       break;
+               }
+               if (create_blk_irq_thread(cpu))
+                       return NOTIFY_BAD;
+               break;
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
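+               /* kick the freshly created thread into its service loop */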
+               raise_blk_irq(cpu);
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               bc = &per_cpu(blk_irq_data, cpu);
+               if (!bc->task)
+                       break;
+               kthread_bind(bc->task, any_online_cpu(cpu_online_map));
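+               /* fall through: splice the list over and stop the thread */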
        case CPU_DEAD:
        case CPU_DEAD_FROZEN: {
                /*
                 * If a CPU goes away, splice its entries to the current CPU
                 * and trigger a run of the softirq
                 */
+               struct sched_param param;
                struct blk_comp *dead;
 
                local_irq_disable();
-               bc = &__get_cpu_var(blk_cpu_done);
-               dead = &per_cpu(blk_cpu_done, cpu);
+               bc = &__get_cpu_var(blk_irq_data);
+               dead = &per_cpu(blk_irq_data, cpu);
                double_spin_lock(&bc->lock, &dead->lock, bc < dead);
                list_splice_init(&dead->list, &bc->list);
-               dead->dead = 1;
                double_spin_unlock(&bc->lock, &dead->lock, bc < dead);
-               raise_softirq_irqoff(BLOCK_SOFTIRQ);
-               local_irq_enable();
-               break;
-               }
-       case CPU_ONLINE: {
-               local_irq_disable();
-               bc = &per_cpu(blk_cpu_done, cpu);
-               bc->dead = 0;
+
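+               /*
+                * boost the dead CPU's thread to the top RT priority,
+                * so the kthread_stop() below completes quickly
+                */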
+               param.sched_priority = MAX_RT_PRIO - 1;
+               task = dead->task;
+               sched_setscheduler(task, SCHED_FIFO, &param);
+               dead->task = NULL;
+               raise_blk_irq(smp_processor_id());
                local_irq_enable();
+               kthread_stop(task);
                break;
                }
        }
@@ -56,35 +153,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
        .notifier_call  = blk_cpu_notify,
 };
 
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-       struct blk_comp *bc;
-       struct list_head local_list;
-
-       local_irq_disable();
-       bc = &__get_cpu_var(blk_cpu_done);
-       spin_lock(&bc->lock);
-       list_replace_init(&bc->list, &local_list);
-       spin_unlock(&bc->lock);
-       local_irq_enable();
-
-       while (!list_empty(&local_list)) {
-               struct request *rq;
-
-               rq = list_entry(local_list.next, struct request, donelist);
-               list_del_init(&rq->donelist);
-               rq->q->softirq_done_fn(rq);
-       }
-}
-
 /**
  * blk_complete_request - end I/O on a request
  * @req:      the request being processed
@@ -117,11 +189,11 @@ void blk_complete_request(struct request *req)
 
        if (ccpu == cpu) {
 is_dead:
-               bc = &__get_cpu_var(blk_cpu_done);
+               bc = &__get_cpu_var(blk_irq_data);
                q->local_complete++;
        } else {
-               bc = &per_cpu(blk_cpu_done, ccpu);
-               if (unlikely(bc->dead)) {
+               bc = &per_cpu(blk_irq_data, ccpu);
+               if (unlikely(!bc->task)) {
                        ccpu = cpu;
                        goto is_dead;
                }
@@ -134,30 +206,27 @@ is_dead:
        list_add_tail(&req->donelist, &bc->list);
        spin_unlock(&bc->lock);
 
-       if (was_empty) {
-               if (cpu == ccpu)
-                       raise_softirq_irqoff(BLOCK_SOFTIRQ);
-               else
-                       raise_softirq_on_cpu(ccpu, BLOCK_SOFTIRQ);
-       }
+       if (was_empty)
+               raise_blk_irq(ccpu);
 
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_complete_request);
 
-static int __init blk_softirq_init(void)
+__init int blk_softirq_init(void)
 {
        int i;
 
        for_each_possible_cpu(i) {
-               struct blk_comp *bc = &per_cpu(blk_cpu_done, i);
+               void *cpu;
+               int err;
 
-               spin_lock_init(&bc->lock);
-               INIT_LIST_HEAD(&bc->list);
-               bc->dead = 0;
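+               /* mimic hotplug notifications to set up the per-cpu threads */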
+               cpu = (void *) (long) i;
+               err = blk_cpu_notify(&blk_cpu_notifier, CPU_UP_PREPARE, cpu);
+               BUG_ON(err == NOTIFY_BAD);
+               blk_cpu_notify(&blk_cpu_notifier, CPU_ONLINE, cpu);
        }
 
-       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
        register_hotcpu_notifier(&blk_cpu_notifier);
        return 0;
 }

-- 
Jens Axboe
