Convert the per-neighbour lock from a reader/writer lock to a seqlock, so
that readers of neighbour table entries use a fast lockless sequence-number
check instead of taking the slow reader/writer lock.

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>


---
 include/net/neighbour.h |    2 
 net/core/neighbour.c    |  117 +++++++++++++++++++++++++++++-------------------
 net/ipv4/arp.c          |  101 +++++++++++++++++++++++++----------------
 net/ipv6/ndisc.c        |   16 +++---
 net/ipv6/route.c        |   12 ++--
 net/sched/sch_teql.c    |   11 +++-
 6 files changed, 155 insertions(+), 104 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -100,7 +100,7 @@ struct neighbour
        __u8                    type;
        __u8                    dead;
        atomic_t                probes;
-       rwlock_t                lock;
+       seqlock_t               lock;
        unsigned char           ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
        struct hh_cache         *hh;
        atomic_t                refcnt;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -143,17 +143,17 @@ static int neigh_forced_gc(struct neigh_
                         * - nobody refers to it.
                         * - it is not permanent
                         */
-                       write_lock(&n->lock);
+                       write_seqlock(&n->lock);
                        if (atomic_read(&n->refcnt) == 1 &&
                            !(n->nud_state & NUD_PERMANENT)) {
                                hlist_del_rcu(&n->hlist);
                                n->dead = 1;
                                shrunk  = 1;
-                               write_unlock(&n->lock);
+                               write_sequnlock(&n->lock);
                                call_rcu(&n->rcu, neigh_rcu_release);
                                continue;
                        }
-                       write_unlock(&n->lock);
+                       write_sequnlock(&n->lock);
                }
        }
 
@@ -198,7 +198,7 @@ static void neigh_flush_dev(struct neigh
                                continue;
 
                        hlist_del_rcu(&n->hlist);
-                       write_lock(&n->lock);
+                       write_seqlock(&n->lock);
                        neigh_del_timer(n);
                        n->dead = 1;
 
@@ -220,7 +220,7 @@ static void neigh_flush_dev(struct neigh
                                        n->nud_state = NUD_NONE;
                                NEIGH_PRINTK2("neigh %p is stray.\n", n);
                        }
-                       write_unlock(&n->lock);
+                       write_sequnlock(&n->lock);
                        neigh_release(n);
                }
        }
@@ -267,7 +267,7 @@ static struct neighbour *neigh_alloc(str
        memset(n, 0, tbl->entry_size);
 
        skb_queue_head_init(&n->arp_queue);
-       rwlock_init(&n->lock);
+       seqlock_init(&n->lock);
        n->updated        = n->used = now;
        n->nud_state      = NUD_NONE;
        n->output         = neigh_blackhole;
@@ -615,7 +615,7 @@ void neigh_destroy(struct neighbour *nei
 /* Neighbour state is suspicious;
    disable fast path.
 
-   Called with write_locked neigh.
+   Called with locked neigh.
  */
 static void neigh_suspect(struct neighbour *neigh)
 {
@@ -632,7 +632,7 @@ static void neigh_suspect(struct neighbo
 /* Neighbour state is OK;
    enable fast path.
 
-   Called with write_locked neigh.
+   Called with locked neigh.
  */
 static void neigh_connect(struct neighbour *neigh)
 {
@@ -676,7 +676,7 @@ static void neigh_periodic_timer(unsigne
        hlist_for_each_entry_safe(n, node, tmp, head, hlist) {
                unsigned int state;
 
-               write_lock(&n->lock);
+               write_seqlock(&n->lock);
 
                state = n->nud_state;
                if (state & (NUD_PERMANENT | NUD_IN_TIMER))
@@ -690,12 +690,12 @@ static void neigh_periodic_timer(unsigne
                     time_after(now, n->used + n->parms->gc_staletime))) {
                        hlist_del_rcu(&n->hlist);
                        n->dead = 1;
-                       write_unlock(&n->lock);
+                       write_sequnlock(&n->lock);
                        neigh_release(n);
                        continue;
                }
        next_elt:
-               write_unlock(&n->lock);
+               write_sequnlock(&n->lock);
        }
 
        /* Cycle through all hash buckets every base_reachable_time/2 ticks.
@@ -738,7 +738,7 @@ static void neigh_timer_handler(unsigned
        unsigned state;
        int notify = 0;
 
-       write_lock(&neigh->lock);
+       write_seqlock(&neigh->lock);
 
        state = neigh->nud_state;
        now = jiffies;
@@ -748,6 +748,7 @@ static void neigh_timer_handler(unsigned
 #ifndef CONFIG_SMP
                printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
 #endif
+               write_sequnlock(&neigh->lock);
                goto out;
        }
 
@@ -808,9 +809,9 @@ static void neigh_timer_handler(unsigned
                 */
                while (neigh->nud_state == NUD_FAILED &&
                       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
-                       write_unlock(&neigh->lock);
+                       write_sequnlock(&neigh->lock);
                        neigh->ops->error_report(neigh, skb);
-                       write_lock(&neigh->lock);
+                       write_seqlock(&neigh->lock);
                }
                skb_queue_purge(&neigh->arp_queue);
        }
@@ -821,20 +822,22 @@ static void neigh_timer_handler(unsigned
                if (!mod_timer(&neigh->timer, next))
                        neigh_hold(neigh);
        }
+
        if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
                struct sk_buff *skb = skb_peek(&neigh->arp_queue);
                /* keep skb alive even if arp_queue overflows */
                if (skb)
                        skb_get(skb);
-               write_unlock(&neigh->lock);
+               write_sequnlock(&neigh->lock);
                neigh->ops->solicit(neigh, skb);
                atomic_inc(&neigh->probes);
                if (skb)
                        kfree_skb(skb);
        } else {
-out:
-               write_unlock(&neigh->lock);
+               write_sequnlock(&neigh->lock);
        }
+
+out:
        if (notify)
                call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
 
@@ -850,11 +853,11 @@ int __neigh_event_send(struct neighbour 
        int rc;
        unsigned long now;
 
-       write_lock_bh(&neigh->lock);
+       write_seqlock_bh(&neigh->lock);
 
        rc = 0;
        if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
-               goto out_unlock_bh;
+               goto out;
 
        now = jiffies;
        
@@ -868,7 +871,7 @@ int __neigh_event_send(struct neighbour 
                } else {
                        neigh->nud_state = NUD_FAILED;
                        neigh->updated = jiffies;
-                       write_unlock_bh(&neigh->lock);
+                       write_sequnlock_bh(&neigh->lock);
 
                        if (skb)
                                kfree_skb(skb);
@@ -896,8 +899,8 @@ int __neigh_event_send(struct neighbour 
                }
                rc = 1;
        }
-out_unlock_bh:
-       write_unlock_bh(&neigh->lock);
+out:
+       write_sequnlock_bh(&neigh->lock);
        return rc;
 }
 
@@ -948,7 +951,7 @@ int neigh_update(struct neighbour *neigh
        struct net_device *dev;
        int update_isrouter = 0;
 
-       write_lock_bh(&neigh->lock);
+       write_seqlock_bh(&neigh->lock);
 
        dev    = neigh->dev;
        old    = neigh->nud_state;
@@ -1052,22 +1055,23 @@ int neigh_update(struct neighbour *neigh
                while (neigh->nud_state & NUD_VALID &&
                       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
                        struct neighbour *n1 = neigh;
-                       write_unlock_bh(&neigh->lock);
+                       write_sequnlock_bh(&neigh->lock);
                        /* On shaper/eql skb->dst->neighbour != neigh :( */
                        if (skb->dst && skb->dst->neighbour)
                                n1 = skb->dst->neighbour;
                        n1->output(skb);
-                       write_lock_bh(&neigh->lock);
+                       write_seqlock_bh(&neigh->lock);
                }
                skb_queue_purge(&neigh->arp_queue);
        }
+
 out:
        if (update_isrouter) {
                neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
                        (neigh->flags | NTF_ROUTER) :
                        (neigh->flags & ~NTF_ROUTER);
        }
-       write_unlock_bh(&neigh->lock);
+       write_sequnlock_bh(&neigh->lock);
 
        if (notify)
                call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -1144,6 +1148,30 @@ int neigh_compat_output(struct sk_buff *
        return dev_queue_xmit(skb);
 }
 
+/* Build the hardware header from neigh->ha, retrying if a writer raced us. */
+static int neigh_hard_header(struct sk_buff *skb, struct net_device *dev,
+                            const struct neighbour *neigh)
+{
+       int rc;
+       unsigned seq;
+
+       for(;;) {
+               seq = read_seqbegin(&neigh->lock);
+               rc = dev->hard_header(skb, dev, ntohs(skb->protocol),
+                                     neigh->ha, NULL, skb->len);
+
+               if (likely(!read_seqretry(&neigh->lock, seq)))
+                       break;
+
+               if (rc < 0)
+                       break;
+
+               /* retry: undo this attempt (presumably rc = bytes pushed) */
+               __skb_pull(skb, rc);
+       }
+
+       return rc;
+}
+
 /* Slow and careful. */
 
 int neigh_resolve_output(struct sk_buff *skb)
@@ -1160,19 +1188,17 @@ int neigh_resolve_output(struct sk_buff 
        if (!neigh_event_send(neigh, skb)) {
                int err;
                struct net_device *dev = neigh->dev;
+
                if (dev->hard_header_cache && !dst->hh) {
-                       write_lock_bh(&neigh->lock);
+                       write_seqlock_bh(&neigh->lock);
                        if (!dst->hh)
                                neigh_hh_init(neigh, dst, dst->ops->protocol);
                        err = dev->hard_header(skb, dev, ntohs(skb->protocol),
                                               neigh->ha, NULL, skb->len);
-                       write_unlock_bh(&neigh->lock);
-               } else {
-                       read_lock_bh(&neigh->lock);
-                       err = dev->hard_header(skb, dev, ntohs(skb->protocol),
-                                              neigh->ha, NULL, skb->len);
-                       read_unlock_bh(&neigh->lock);
-               }
+                       write_sequnlock_bh(&neigh->lock);
+               } else
+                       err = neigh_hard_header(skb, dev, neigh);
+
                if (err >= 0)
                        rc = neigh->ops->queue_xmit(skb);
                else
@@ -1196,14 +1222,11 @@ int neigh_connected_output(struct sk_buf
        int err;
        struct dst_entry *dst = skb->dst;
        struct neighbour *neigh = dst->neighbour;
-       struct net_device *dev = neigh->dev;
 
        __skb_pull(skb, skb->nh.raw - skb->data);
 
-       read_lock_bh(&neigh->lock);
-       err = dev->hard_header(skb, dev, ntohs(skb->protocol),
-                              neigh->ha, NULL, skb->len);
-       read_unlock_bh(&neigh->lock);
+       err = neigh_hard_header(skb, neigh->dev, neigh);
+
        if (err >= 0)
                err = neigh->ops->queue_xmit(skb);
        else {
@@ -1960,11 +1983,15 @@ static int neigh_fill_info(struct sk_buf
 
        NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
 
-       read_lock_bh(&neigh->lock);
        ndm->ndm_state   = neigh->nud_state;
+
+       /* Not really updating this neighbour but don't want to
+        * deal with the unwind case when seqlock needs retry
+        */
+       write_seqlock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) &&
            nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
-               read_unlock_bh(&neigh->lock);
+               write_sequnlock_bh(&neigh->lock);
                goto nla_put_failure;
        }
 
@@ -1972,7 +1999,7 @@ static int neigh_fill_info(struct sk_buf
        ci.ndm_confirmed = now - neigh->confirmed;
        ci.ndm_updated   = now - neigh->updated;
        ci.ndm_refcnt    = atomic_read(&neigh->refcnt) - 1;
-       read_unlock_bh(&neigh->lock);
+       write_sequnlock_bh(&neigh->lock);
 
        NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
        NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
@@ -2077,13 +2104,13 @@ void __neigh_for_each_release(struct nei
                                          &tbl->hash_buckets[chain], hlist) {
                        int release;
 
-                       write_lock(&n->lock);
+                       write_seqlock(&n->lock);
                        release = cb(n);
                        if (release) {
                                hlist_del_rcu(&n->hlist);
                                n->dead = 1;
                        }
-                       write_unlock(&n->lock);
+                       write_sequnlock(&n->lock);
                        if (release)
                                call_rcu(&n->rcu, neigh_rcu_release);
                }
--- net-2.6.19.orig/net/ipv4/arp.c
+++ net-2.6.19/net/ipv4/arp.c
@@ -328,6 +328,31 @@ static void arp_error_report(struct neig
        kfree_skb(skb);
 }
 
+
+/* Map the neighbour's NUD state onto the matching ATF_* flag bits. */
+static unsigned arp_state_to_flags(const struct neighbour *neigh)
+{
+       if (neigh->nud_state & NUD_PERMANENT)
+               return ATF_PERM | ATF_COM;
+       if (neigh->nud_state & NUD_VALID)
+               return ATF_COM;
+       return 0;
+}
+
+/* Copy len bytes of neigh->ha (and, if flags != NULL, its ATF_* flags). */
+static void arp_get_neigh_addr(u8 *ha, const struct neighbour *neigh,
+                              unsigned len, unsigned *flags)
+{
+       unsigned seq;
+
+       do {
+               seq = read_seqbegin(&neigh->lock);
+               memcpy(ha, neigh->ha, len);
+               if (flags)
+                       *flags = arp_state_to_flags(neigh);
+       } while (read_seqretry(&neigh->lock, seq));
+}
+
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 {
        u32 saddr = 0;
@@ -369,8 +394,12 @@ static void arp_solicit(struct neighbour
        if ((probes -= neigh->parms->ucast_probes) < 0) {
                if (!(neigh->nud_state&NUD_VALID))
                        printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
-               dst_ha = neigh->ha;
-               read_lock_bh(&neigh->lock);
+
+               dst_ha = kmalloc(MAX_ADDR_LEN, GFP_ATOMIC);
+               if (!dst_ha)
+                       return;
+
+               arp_get_neigh_addr(dst_ha, neigh, MAX_ADDR_LEN, NULL);
        } else if ((probes -= neigh->parms->app_probes) < 0) {
 #ifdef CONFIG_ARPD
                neigh_app_ns(neigh);
@@ -380,8 +409,9 @@ static void arp_solicit(struct neighbour
 
        arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
                 dst_ha, dev->dev_addr, NULL);
+
        if (dst_ha)
-               read_unlock_bh(&neigh->lock);
+               kfree(dst_ha);
 }
 
 static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
@@ -489,10 +519,8 @@ int arp_find(unsigned char *haddr, struc
        if (n) {
                n->used = jiffies;
                if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-                       read_lock_bh(&n->lock);
-                       memcpy(haddr, n->ha, dev->addr_len);
-                       read_unlock_bh(&n->lock);
+                       arp_get_neigh_addr(haddr, n, dev->addr_len, NULL);
                        neigh_release(n);
                        return 0;
                }
                neigh_release(n);
@@ -1047,16 +1074,6 @@ static int arp_req_set(struct arpreq *r,
        return err;
 }
 
-static unsigned arp_state_to_flags(struct neighbour *neigh)
-{
-       unsigned flags = 0;
-       if (neigh->nud_state&NUD_PERMANENT)
-               flags = ATF_PERM|ATF_COM;
-       else if (neigh->nud_state&NUD_VALID)
-               flags = ATF_COM;
-       return flags;
-}
-
 /*
  *     Get an ARP cache entry.
  */
@@ -1069,10 +1086,8 @@ static int arp_req_get(struct arpreq *r,
 
        neigh = neigh_lookup(&arp_tbl, &ip, dev);
        if (neigh) {
-               read_lock_bh(&neigh->lock);
-               memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
-               r->arp_flags = arp_state_to_flags(neigh);
-               read_unlock_bh(&neigh->lock);
+               arp_get_neigh_addr(r->arp_ha.sa_data, neigh, dev->addr_len,
+                                  &r->arp_flags);
                r->arp_ha.sa_family = dev->type;
                strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
                neigh_release(neigh);
@@ -1258,7 +1273,7 @@ void __init arp_init(void)
 /*
  *     ax25 -> ASCII conversion
  */
-static char *ax2asc2(ax25_address *a, char *buf)
+static char *ax2asc2(const ax25_address *a, char *buf)
 {
        char c, *s;
        int n;
@@ -1290,35 +1305,41 @@ static char *ax2asc2(ax25_address *a, ch
 #define HBUFFERLEN 30
 
 static void arp_format_neigh_entry(struct seq_file *seq,
-                                  struct neighbour *n)
+                                  const struct neighbour *n)
 {
        char hbuffer[HBUFFERLEN];
        const char hexbuf[] = "0123456789ABCDEF";
        int k, j;
+       unsigned hflags, seqno;
        char tbuf[16];
        struct net_device *dev = n->dev;
        int hatype = dev->type;
 
-       read_lock(&n->lock);
-       /* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-       if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
-               ax2asc2((ax25_address *)n->ha, hbuffer);
-       else {
-#endif
-       for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
-               hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
-               hbuffer[k++] = hexbuf[n->ha[j] & 15];
-               hbuffer[k++] = ':';
-       }
-       hbuffer[--k] = 0;
+       do {
+               seqno = read_seqbegin(&n->lock);
+
+               /* Convert hardware address to XX:XX:XX:XX ... form. */
 #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-       }
+               if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+                       ax2asc2((const ax25_address *)n->ha, hbuffer);
+               else
 #endif
-       sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+               {
+                       for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+                               hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+                               hbuffer[k++] = hexbuf[n->ha[j] & 15];
+                               hbuffer[k++] = ':';
+                       }
+                       hbuffer[--k] = 0;
+               }
+
+               sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+               hflags = arp_state_to_flags(n);
+       } while (read_seqretry(&n->lock, seqno));
+
        seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
-                  tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
-       read_unlock(&n->lock);
+                  tbuf, hatype, hflags, hbuffer, dev->name);
+
 }
 
 static void arp_format_pneigh_entry(struct seq_file *seq,
--- net-2.6.19.orig/net/ipv6/ndisc.c
+++ net-2.6.19/net/ipv6/ndisc.c
@@ -1412,15 +1412,15 @@ void ndisc_send_redirect(struct sk_buff 
                return;
        }
 
-       if (dev->addr_len) {
-               read_lock_bh(&neigh->lock);
-               if (neigh->nud_state & NUD_VALID) {
+       if (dev->addr_len && (neigh->nud_state & NUD_VALID)) {
+               unsigned seq;
+               do {
+                       seq = read_seqbegin(&neigh->lock);
                        memcpy(ha_buf, neigh->ha, dev->addr_len);
-                       read_unlock_bh(&neigh->lock);
-                       ha = ha_buf;
-                       len += ndisc_opt_addr_space(dev);
-               } else
-                       read_unlock_bh(&neigh->lock);
+               } while (read_seqretry(&neigh->lock, seq));
+
+               ha = ha_buf;
+               len += ndisc_opt_addr_space(dev);
        }
 
        rd_len = min_t(unsigned int,
--- net-2.6.19.orig/net/ipv6/route.c
+++ net-2.6.19/net/ipv6/route.c
@@ -279,20 +279,19 @@ static void rt6_probe(struct rt6_info *r
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;
-       read_lock_bh(&neigh->lock);
+
        if (!(neigh->nud_state & NUD_VALID) &&
-           time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
+           time_after(jiffies,
+                      neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;
 
                neigh->updated = jiffies;
-               read_unlock_bh(&neigh->lock);
 
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
-       } else
-               read_unlock_bh(&neigh->lock);
+       }
 }
 #else
 static inline void rt6_probe(struct rt6_info *rt)
@@ -323,10 +322,9 @@ static int inline rt6_check_neigh(struct
            !(rt->rt6i_flags & RTF_GATEWAY))
                m = 1;
        else if (neigh) {
-               read_lock_bh(&neigh->lock);
+               smp_rmb();
                if (neigh->nud_state & NUD_VALID)
                        m = 2;
-               read_unlock_bh(&neigh->lock);
        }
        return m;
 }
--- net-2.6.19.orig/net/sched/sch_teql.c
+++ net-2.6.19/net/sched/sch_teql.c
@@ -248,9 +248,18 @@ __teql_resolve(struct sk_buff *skb, stru
        }
        if (neigh_event_send(n, skb_res) == 0) {
                int err;
-               read_lock(&n->lock);
-               err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
-               read_unlock(&n->lock);
+               unsigned seq;
+
+               for (;;) {
+                       seq = read_seqbegin(&n->lock);
+                       err = dev->hard_header(skb, dev, ntohs(skb->protocol),
+                                              n->ha, NULL, skb->len);
+                       if (err < 0 || !read_seqretry(&n->lock, seq))
+                               break;
+                       /* n->ha changed mid-build: drop the header, rebuild */
+                       __skb_pull(skb, err);
+               }
+
                if (err < 0) {
                        neigh_release(n);
                        return -EINVAL;

--
Stephen Hemminger <[EMAIL PROTECTED]>


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to