ping!

On Mon, Dec 24, 2012 at 03:03:30PM +0800, Shaohua Li wrote:
> 
> I'm testing a swapout workload on a two-socket Xeon machine. The workload has
> 10 threads; each thread sequentially accesses a separate memory region. TLB
> flush overhead is very high in this workload: for each page, page reclaim
> needs to move it off the active LRU list and then unmap it, and both steps
> require a TLB flush. Since this is a multithreaded workload, the TLB flushes
> happen on 10 CPUs. On x86, TLB flushing uses the generic smp_call_function
> code, so this workload stresses smp_call_function_many heavily.
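
For context, a minimal userspace sketch of the kind of workload described
above might look like the following. This is a hypothetical reconstruction,
not the actual test: the thread count matches the description, but the region
size, pass count, and use of plain sequential stores are my assumptions.

/* Hypothetical swapout microbenchmark: NR_THREADS threads, each
 * sequentially touching its own anonymous region.  REGION_SIZE is an
 * assumption -- it just has to be large enough in total to push the
 * machine into swap so reclaim has to unmap pages and flush TLBs. */
#include <pthread.h>
#include <stdlib.h>
#include <sys/mman.h>

#define NR_THREADS	10
#define REGION_SIZE	(8UL << 30)	/* 8GB per thread (assumed) */
#define NR_PASSES	4
#define PAGE_SZ		4096UL

static void *worker(void *arg)
{
	char *buf = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long off;
	int pass;

	(void)arg;
	if (buf == MAP_FAILED)
		return NULL;

	/* Sequential writes force reclaim to deactivate and unmap the
	 * pages under memory pressure, which is what generates the
	 * TLB-flush IPIs on all the CPUs running these threads. */
	for (pass = 0; pass < NR_PASSES; pass++)
		for (off = 0; off < REGION_SIZE; off += PAGE_SZ)
			buf[off] = 1;

	munmap(buf, REGION_SIZE);
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}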
> 
> Without the patch, perf shows:
> +  24.49%  [k] generic_smp_call_function_interrupt
> -  21.72%  [k] _raw_spin_lock 
>    - _raw_spin_lock 
>       + 79.80% __page_check_address 
>       + 6.42% generic_smp_call_function_interrupt
>       + 3.31% get_swap_page                       
>       + 2.37% free_pcppages_bulk                  
>       + 1.75% handle_pte_fault                    
>       + 1.54% put_super                           
>       + 1.41% grab_super_passive                  
>       + 1.36% __swap_duplicate                    
>       + 0.68% blk_flush_plug_list                 
>       + 0.62% swap_info_get                       
> +   6.55%  [k] flush_tlb_func                     
> +   6.46%  [k] smp_call_function_many             
> +   5.09%  [k] call_function_interrupt            
> +   4.75%  [k] default_send_IPI_mask_sequence_phys
> +   2.18%  [k] find_next_bit
> 
> Swapout throughput is around 1300MB/s.
> 
> With the patch, perf shows:
> -  27.23%  [k] _raw_spin_lock                           
>    - _raw_spin_lock                                     
>       + 80.53% __page_check_address                     
>       + 8.39% generic_smp_call_function_single_interrupt
>       + 2.44% get_swap_page                             
>       + 1.76% free_pcppages_bulk                        
>       + 1.40% handle_pte_fault                          
>       + 1.15% __swap_duplicate                          
>       + 1.05% put_super                                 
>       + 0.98% grab_super_passive                        
>       + 0.86% blk_flush_plug_list                       
>       + 0.57% swap_info_get                             
> +   8.25%  [k] default_send_IPI_mask_sequence_phys      
> +   7.55%  [k] call_function_interrupt                  
> +   7.47%  [k] smp_call_function_many                   
> +   7.25%  [k] flush_tlb_func                           
> +   3.81%  [k] _raw_spin_lock_irqsave                   
> +   3.78%  [k] generic_smp_call_function_single_interrupt
> 
> Swapout throughput is around 1400MB/s, so there is roughly a 7% improvement,
> and total CPU utilization doesn't change.
> 
> Without the patch, the sender's cfd_data is shared with all target CPUs:
> generic_smp_call_function_interrupt reads and writes it (the cpumask and the
> refs counter) several times, which creates a lot of cache-line ping-pong.
> With the patch, each target CPU gets its own per-cpu call_single_data, so the
> ping-pong is avoided. And judging from the perf data, this does not add
> noticeable contention on the call_single_queue lock.
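
To make the cache-line argument concrete, here is the data-structure change
in isolation. The fields are taken from the patch below; the _old/_new
suffixes and the comments are only my annotation, not actual kernel
identifiers.

/* Before the patch: one call_function_data block per sender, but its
 * refs counter and cpumask are written by every target CPU, so the
 * cache line bounces between all of them. */
struct call_function_data_old {
	struct call_single_data	csd;		/* shared func/info/flags */
	atomic_t		refs;		/* atomic_dec_return() on every target */
	cpumask_var_t		cpumask;	/* test_and_clear by each target */
};

/* After the patch: each target CPU only dirties its own csd element,
 * which it pulls off its own call_single_queue. */
struct call_function_data_new {
	struct call_single_data	__percpu *csd;	/* per_cpu_ptr(cfd->csd, cpu) */
	cpumask_var_t		cpumask;	/* touched only by the sender */
};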
> 
> The next step is to remove generic_smp_call_function_interrupt from the arch code.
> 
> Signed-off-by: Shaohua Li <[email protected]>
> ---
>  include/linux/smp.h |    3 
>  kernel/smp.c        |  184 ++++++++--------------------------------------------
>  2 files changed, 32 insertions(+), 155 deletions(-)
> 
> Index: linux/kernel/smp.c
> ===================================================================
> --- linux.orig/kernel/smp.c   2012-12-24 11:37:28.150604463 +0800
> +++ linux/kernel/smp.c        2012-12-24 14:57:11.807949527 +0800
> @@ -16,22 +16,12 @@
>  #include "smpboot.h"
>  
>  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
> -static struct {
> -     struct list_head        queue;
> -     raw_spinlock_t          lock;
> -} call_function __cacheline_aligned_in_smp =
> -     {
> -             .queue          = LIST_HEAD_INIT(call_function.queue),
> -             .lock           = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
> -     };
> -
>  enum {
>       CSD_FLAG_LOCK           = 0x01,
>  };
>  
>  struct call_function_data {
> -     struct call_single_data csd;
> -     atomic_t                refs;
> +     struct call_single_data __percpu *csd;
>       cpumask_var_t           cpumask;
>  };
>  
> @@ -56,6 +46,11 @@ hotplug_cfd(struct notifier_block *nfb,
>               if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
>                               cpu_to_node(cpu)))
>                       return notifier_from_errno(-ENOMEM);
> +             cfd->csd = alloc_percpu(struct call_single_data);
> +             if (!cfd->csd) {
> +                     free_cpumask_var(cfd->cpumask);
> +                     return notifier_from_errno(-ENOMEM);
> +             }
>               break;
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> @@ -65,6 +60,7 @@ hotplug_cfd(struct notifier_block *nfb,
>       case CPU_DEAD:
>       case CPU_DEAD_FROZEN:
>               free_cpumask_var(cfd->cpumask);
> +             free_percpu(cfd->csd);
>               break;
>  #endif
>       };
> @@ -166,85 +162,6 @@ void generic_exec_single(int cpu, struct
>  }
>  
>  /*
> - * Invoked by arch to handle an IPI for call function. Must be called with
> - * interrupts disabled.
> - */
> -void generic_smp_call_function_interrupt(void)
> -{
> -     struct call_function_data *data;
> -     int cpu = smp_processor_id();
> -
> -     /*
> -      * Shouldn't receive this interrupt on a cpu that is not yet online.
> -      */
> -     WARN_ON_ONCE(!cpu_online(cpu));
> -
> -     /*
> -      * Ensure entry is visible on call_function_queue after we have
> -      * entered the IPI. See comment in smp_call_function_many.
> -      * If we don't have this, then we may miss an entry on the list
> -      * and never get another IPI to process it.
> -      */
> -     smp_mb();
> -
> -     /*
> -      * It's ok to use list_for_each_rcu() here even though we may
> -      * delete 'pos', since list_del_rcu() doesn't clear ->next
> -      */
> -     list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
> -             int refs;
> -             smp_call_func_t func;
> -
> -             /*
> -              * Since we walk the list without any locks, we might
> -              * see an entry that was completed, removed from the
> -              * list and is in the process of being reused.
> -              *
> -              * We must check that the cpu is in the cpumask before
> -              * checking the refs, and both must be set before
> -              * executing the callback on this cpu.
> -              */
> -
> -             if (!cpumask_test_cpu(cpu, data->cpumask))
> -                     continue;
> -
> -             smp_rmb();
> -
> -             if (atomic_read(&data->refs) == 0)
> -                     continue;
> -
> -             func = data->csd.func;          /* save for later warn */
> -             func(data->csd.info);
> -
> -             /*
> -              * If the cpu mask is not still set then func enabled
> -              * interrupts (BUG), and this cpu took another smp call
> -              * function interrupt and executed func(info) twice
> -              * on this cpu.  That nested execution decremented refs.
> -              */
> -             if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
> -                     WARN(1, "%pf enabled interrupts and double executed\n", func);
> -                     continue;
> -             }
> -
> -             refs = atomic_dec_return(&data->refs);
> -             WARN_ON(refs < 0);
> -
> -             if (refs)
> -                     continue;
> -
> -             WARN_ON(!cpumask_empty(data->cpumask));
> -
> -             raw_spin_lock(&call_function.lock);
> -             list_del_rcu(&data->csd.list);
> -             raw_spin_unlock(&call_function.lock);
> -
> -             csd_unlock(&data->csd);
> -     }
> -
> -}
> -
> -/*
>   * Invoked by arch to handle an IPI for call function single. Must be
>   * called from the arch with interrupts disabled.
>   */
> @@ -448,8 +365,7 @@ void smp_call_function_many(const struct
>                           smp_call_func_t func, void *info, bool wait)
>  {
>       struct call_function_data *data;
> -     unsigned long flags;
> -     int refs, cpu, next_cpu, this_cpu = smp_processor_id();
> +     int cpu, next_cpu, this_cpu = smp_processor_id();
>  
>       /*
>        * Can deadlock when called with interrupts disabled.
> @@ -481,79 +397,39 @@ void smp_call_function_many(const struct
>       }
>  
>       data = &__get_cpu_var(cfd_data);
> -     csd_lock(&data->csd);
> -
> -     /* This BUG_ON verifies our reuse assertions and can be removed */
> -     BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
> -
> -     /*
> -      * The global call function queue list add and delete are protected
> -      * by a lock, but the list is traversed without any lock, relying
> -      * on the rcu list add and delete to allow safe concurrent traversal.
> -      * We reuse the call function data without waiting for any grace
> -      * period after some other cpu removes it from the global queue.
> -      * This means a cpu might find our data block as it is being
> -      * filled out.
> -      *
> -      * We hold off the interrupt handler on the other cpu by
> -      * ordering our writes to the cpu mask vs our setting of the
> -      * refs counter.  We assert only the cpu owning the data block
> -      * will set a bit in cpumask, and each bit will only be cleared
> -      * by the subject cpu.  Each cpu must first find its bit is
> -      * set and then check that refs is set indicating the element is
> -      * ready to be processed, otherwise it must skip the entry.
> -      *
> -      * On the previous iteration refs was set to 0 by another cpu.
> -      * To avoid the use of transitivity, set the counter to 0 here
> -      * so the wmb will pair with the rmb in the interrupt handler.
> -      */
> -     atomic_set(&data->refs, 0);     /* convert 3rd to 1st party write */
>  
> -     data->csd.func = func;
> -     data->csd.info = info;
> -
> -     /* Ensure 0 refs is visible before mask.  Also orders func and info */
> -     smp_wmb();
> -
> -     /* We rely on the "and" being processed before the store */
>       cpumask_and(data->cpumask, mask, cpu_online_mask);
>       cpumask_clear_cpu(this_cpu, data->cpumask);
> -     refs = cpumask_weight(data->cpumask);
>  
>       /* Some callers race with other cpus changing the passed mask */
> -     if (unlikely(!refs)) {
> -             csd_unlock(&data->csd);
> +     if (unlikely(!cpumask_weight(data->cpumask)))
>               return;
> -     }
> -
> -     raw_spin_lock_irqsave(&call_function.lock, flags);
> -     /*
> -      * Place entry at the _HEAD_ of the list, so that any cpu still
> -      * observing the entry in generic_smp_call_function_interrupt()
> -      * will not miss any other list entries:
> -      */
> -     list_add_rcu(&data->csd.list, &call_function.queue);
> -     /*
> -      * We rely on the wmb() in list_add_rcu to complete our writes
> -      * to the cpumask before this write to refs, which indicates
> -      * data is on the list and is ready to be processed.
> -      */
> -     atomic_set(&data->refs, refs);
> -     raw_spin_unlock_irqrestore(&call_function.lock, flags);
>  
> -     /*
> -      * Make the list addition visible before sending the ipi.
> -      * (IPIs must obey or appear to obey normal Linux cache
> -      * coherency rules -- see comment in generic_exec_single).
> -      */
> -     smp_mb();
> +     for_each_cpu(cpu, data->cpumask) {
> +             struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
> +             struct call_single_queue *dst =
> +                                     &per_cpu(call_single_queue, cpu);
> +             unsigned long flags;
> +
> +             csd_lock(csd);
> +             csd->func = func;
> +             csd->info = info;
> +
> +             raw_spin_lock_irqsave(&dst->lock, flags);
> +             list_add_tail(&csd->list, &dst->list);
> +             raw_spin_unlock_irqrestore(&dst->lock, flags);
> +     }
>  
>       /* Send a message to all CPUs in the map */
>       arch_send_call_function_ipi_mask(data->cpumask);
>  
> -     /* Optionally wait for the CPUs to complete */
> -     if (wait)
> -             csd_lock_wait(&data->csd);
> +     if (wait) {
> +             for_each_cpu(cpu, data->cpumask) {
> +                     struct call_single_data *csd =
> +                                     per_cpu_ptr(data->csd, cpu);
> +                     csd_lock_wait(csd);
> +             }
> +     }
>  }
>  EXPORT_SYMBOL(smp_call_function_many);
>  
> Index: linux/include/linux/smp.h
> ===================================================================
> --- linux.orig/include/linux/smp.h    2012-12-24 11:38:28.901840885 +0800
> +++ linux/include/linux/smp.h 2012-12-24 11:39:16.557243069 +0800
> @@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
>  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
>  void __init call_function_init(void);
>  void generic_smp_call_function_single_interrupt(void);
> -void generic_smp_call_function_interrupt(void);
> +#define generic_smp_call_function_interrupt \
> +     generic_smp_call_function_single_interrupt
>  #else
>  static inline void call_function_init(void) { }
>  #endif