On Fri, 8 Apr 2016, Waiman Long wrote:
> This patch attempts to reduce HPET read contention by using the fact
> that if more than one task are trying to access HPET at the same time,
> it will be more efficient if one task in the group reads the HPET
> counter and shares it with the rest of the group instead of each
> group member reads the HPET counter individually.

That has nothing to do with tasks. clocksource reads can happen from almost
any context. The problem is concurrent access on multiple cpus.

> This optimization is enabled on systems with more than 32 CPUs. It can
> also be explicitly enabled or disabled by using the new opt_read_hpet
> kernel parameter.

Please not. What's wrong with enabling it unconditionally?
 
> +/*
>   * Clock source related code
>   */
>  static cycle_t read_hpet(struct clocksource *cs)
>  {
> -     return (cycle_t)hpet_readl(HPET_COUNTER);
> +     int seq, cnt = 0;
> +     u32 time;
> +
> +     if (opt_read_hpet <= 0)
> +             return (cycle_t)hpet_readl(HPET_COUNTER);

This wants to be conditional on CONFIG_SMP. No point in having all that muck
around for an UP kernel.

> +     seq = READ_ONCE(hpet_save.seq);
> +     if (!HPET_SEQ_LOCKED(seq)) {
> +             int old, new = seq + 1;
> +             unsigned long flags;
> +
> +             local_irq_save(flags);
> +             /*
> +              * Set the lock bit (lsb) to get the right to read HPET
> +              * counter directly. If successful, read the counter, save
> +              * its value, and increment the sequence number. Otherwise,
> +              * increment the sequnce number to the expected locked value
> +              * for comparison later on.
> +              */
> +             old = cmpxchg(&hpet_save.seq, seq, new);
> +             if (old == seq) {
> +                     time = hpet_readl(HPET_COUNTER);
> +                     WRITE_ONCE(hpet_save.hpet, time);
> +
> +                     /* Unlock */
> +                     smp_store_release(&hpet_save.seq, new + 1);
> +                     local_irq_restore(flags);
> +                     return (cycle_t)time;
> +             }
> +             local_irq_restore(flags);
> +             seq = new;
> +     }
> +
> +     /*
> +      * Wait until the locked sequence number changes which indicates
> +      * that the saved HPET value is up-to-date.
> +      */
> +     while (READ_ONCE(hpet_save.seq) == seq) {
> +             /*
> +              * Since reading the HPET is much slower than a single
> +              * cpu_relax() instruction, we use two here in an attempt
> +              * to reduce the amount of cacheline contention in the
> +              * hpet_save.seq cacheline.
> +              */
> +             cpu_relax();
> +             cpu_relax();
> +
> +             if (likely(++cnt <= HPET_RESET_THRESHOLD))
> +                     continue;
> +
> +             /*
> +              * In the unlikely event that it takes too long for the lock
> +              * holder to read the HPET, we do it ourselves and try to
> +              * reset the lock. This will also break a deadlock if it
> +              * happens, for example, when the process context lock holder
> +              * gets killed in the middle of reading the HPET counter.
> +              */
> +             time = hpet_readl(HPET_COUNTER);
> +             WRITE_ONCE(hpet_save.hpet, time);
> +             if (READ_ONCE(hpet_save.seq) == seq) {
> +                     if (cmpxchg(&hpet_save.seq, seq, seq + 1) == seq)
> +                             pr_warn("read_hpet: reset hpet seq to 0x%x\n",
> +                                     seq + 1);

This is voodoo programming and actively dangerous.

CPU0            CPU1                    CPU2
lock_hpet()
T1=read_hpet()  wait_for_unlock()       
store_hpet(T1)  
                ....                    
                T2 = read_hpet()
unlock_hpet()                           
                                        lock_hpet()
                                        T3 = read_hpet()
                                        store_hpet(T3)
                                        unlock_hpet()
                                        return T3
lock_hpet()
T4 = read_hpet()                        wait_for_unlock()
store_hpet(T4)  
                store_hpet(T2)                  
unlock_hpet()                           return T2

CPU2 will observe time going backwards.

Thanks,

        tglx

Reply via email to