  This patch implements a private cache of buffers for the SB1250_MAC
  driver to improve performance. It should be applied on top of the
  NAPI patch.
    Signed-off-by: Dan Krejsa <dan.krejsa@windriver.com>
    Signed-off-by: Steve Yang <steve.yang@windriver.com>
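
  For reviewers: the recycling scheme added by this patch reduces to
  roughly the sketch below. The helper names are illustrative only
  (they do not appear in the patch); the real changes inline this
  logic in the receive-buffer refill and TX-completion paths and add
  the sk_buff reinitialization and eligibility checks shown further
  down.

	/* Illustrative sketch, not part of the patch. */
	static struct sk_buff *sbmac_skb_cache_get(void)
	{
		struct sk_buff *skb = NULL;

		/* Never spin in the hot path; fall back to the allocator. */
		if (spin_trylock(&sbmac_skb.lock)) {
			if (sbmac_skb.index > 0)
				skb = sbmac_skb.buf[--sbmac_skb.index];
			spin_unlock(&sbmac_skb.lock);
		}
		return skb;		/* NULL => use dev_alloc_skb() */
	}

	static int sbmac_skb_cache_put(struct sk_buff *skb)
	{
		int cached = 0;

		if (spin_trylock(&sbmac_skb.lock)) {
			if (sbmac_skb.index < SBMAC_SKB_CACHE_SIZE) {
				sbmac_skb.buf[sbmac_skb.index++] = skb;
				cached = 1;
			}
			spin_unlock(&sbmac_skb.lock);
		}
		return cached;		/* 0 => free the sk_buff normally */
	}
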
Index: linux-2.6.14-cgl/drivers/net/Kconfig
===================================================================
--- linux-2.6.14-cgl.orig/drivers/net/Kconfig	2006-09-25 10:06:29.000000000 -0700
+++ linux-2.6.14-cgl/drivers/net/Kconfig	2006-09-25 12:26:08.401453626 -0700
@@ -2048,6 +2048,35 @@
 
 	  If in doubt, say y.
 
+config SBMAC_SKB_CACHE
+	bool "SBMAC: Enable driver local buffer caching (EXPERIMENTAL)"
+	depends on NET_SB1250_MAC && EXPERIMENTAL
+	help
+	  This configuration option makes the sb1250-mac.c driver
+	  maintain a private cache of 'sk_buff' buffers, shared
+	  between the network devices which it manages.  When a packet
+	  transmit completes, the corresponding sk_buff may be placed
+	  in the private cache (provided it meets certain criteria)
+	  rather than being returned to the general kernel pools.
+	  During packet reception, replenishment sk_buffs may be taken
+	  from the private cache (if available) in preference to using
+	  dev_alloc_skb().  This has been found to significantly
+	  improve performance when forwarding under heavy load.
+ 
+	  If in doubt, say y here.
+
+config SBMAC_SKB_CACHE_SIZE
+	int "SBMAC driver sk_buff cache size"
+	depends on SBMAC_SKB_CACHE
+	default 64
+	help
+	  Number of 'sk_buff' buffers in the cache. Defaults to 64.
+	  Up to this many buffers may be held in the cache by the driver,
+	  unavailable for other use, until the driver is unloaded.
+	  Only linear sk_buffs with a single reference count and of
+	  sufficiently large size are candidates for recycling in the
+	  private cache.
+
 config R8169_VLAN
 	bool "VLAN support"
 	depends on R8169 && VLAN_8021Q
Index: linux-2.6.14-cgl/drivers/net/sb1250-mac.c
===================================================================
--- linux-2.6.14-cgl.orig/drivers/net/sb1250-mac.c	2006-09-25 10:06:29.000000000 -0700
+++ linux-2.6.14-cgl/drivers/net/sb1250-mac.c	2006-09-25 12:43:01.299764714 -0700
@@ -35,6 +35,10 @@
 #include <asm/processor.h>		/* Processor type for cache alignment. */
 #include <asm/io.h>
 #include <asm/cache.h>
+#ifdef CONFIG_SBMAC_SKB_CACHE
+#include <net/dst.h>
+#include <net/xfrm.h>
+#endif /* CONFIG_SBMAC_SKB_CACHE */
 
 /* This is only here until the firmware is ready.  In that case,
    the firmware leaves the ethernet address in the register for us. */
@@ -275,10 +279,40 @@
 
 	sbmacdma_t       sbm_txdma;		/* for now, only use channel 0 */
 	sbmacdma_t       sbm_rxdma;
+	uint32_t	 sbm_events;		/* record interrupt status */
 	int              rx_hw_checksum;
 	int 		 sbe_idx;
 };
 
+#ifdef CONFIG_SBMAC_SKB_CACHE
+
+/* Try to locally cache a TX sk_buff if its truesize is at least this big: */
+#define SBMAC_SIZE_REQ (SKB_DATA_ALIGN(16 + ENET_PACKET_SIZE +	\
+				       SMP_CACHE_BYTES * 2 +	\
+				       ETHER_ALIGN))
+#ifndef SBMAC_SKB_CACHE_SIZE
+#define SBMAC_SKB_CACHE_SIZE	64
+#endif
+
+struct sbmac_skb_cache {
+	spinlock_t	 lock;
+	int		 index;
+	struct sk_buff * buf [SBMAC_SKB_CACHE_SIZE];
+};
+
+/*
+ * Local sk_buff cache, shared by devices.
+ * Any buffers remaining in the cache are freed by
+ * sbmac_skb_cache_flush() when the module unloads.
+ */
+
+static struct sbmac_skb_cache sbmac_skb = {
+	.lock = SPIN_LOCK_UNLOCKED,
+	.index = 0
+};
+
+#endif /* CONFIG_SBMAC_SKB_CACHE */
+
 
 /**********************************************************************
  *  Externs
@@ -882,7 +916,7 @@
 	d->sbdma_remptr = NULL;
 }
 
-static void sbdma_align_skb(struct sk_buff *skb,int power2,int offset)
+static inline void sbdma_align_skb(struct sk_buff *skb,int power2,int offset)
 {
 	unsigned long addr;
 	unsigned long newaddr;
@@ -917,6 +951,9 @@
 	sbdmadscr_t *nextdsc;
 	struct sk_buff *sb_new = NULL;
 	int pktsize = ENET_PACKET_SIZE;
+#ifdef CONFIG_SBMAC_SKB_CACHE
+	int i;
+#endif
 
 	/* get pointer to our current place in the ring */
 
@@ -952,26 +989,52 @@
 	 *  DMA will trash the beginning (and ending) portions.
 	 */
 
-	if (sb == NULL) {
-		sb_new = dev_alloc_skb(ENET_PACKET_SIZE + SMP_CACHE_BYTES * 2 + ETHER_ALIGN);
-		if (sb_new == NULL) {
-			printk(KERN_INFO "%s: sk_buff allocation failed\n",
-			       d->sbdma_eth->sbm_dev->name);
-			return -ENOBUFS;
+
+	sb_new = sb;
+
+	if (sb_new == NULL) {
+#ifdef CONFIG_SBMAC_SKB_CACHE
+		if (likely (spin_trylock (&sbmac_skb.lock))) {
+			if (likely ((i = sbmac_skb.index - 1) >= 0)) {
+				sb_new = sbmac_skb.buf [i];
+				sbmac_skb.index = i;
+			}
+			spin_unlock (&sbmac_skb.lock);
 		}
+		if (sb_new == NULL) {
+#endif
+			sb_new = dev_alloc_skb (ENET_PACKET_SIZE +
+						SMP_CACHE_BYTES * 2 +
+						ETHER_ALIGN);
+			if (unlikely (sb_new == NULL)) {
+				printk(KERN_INFO
+				       "%s: sk_buff allocation failed\n",
+				       d->sbdma_eth->sbm_dev->name);
+				return -ENOBUFS;
+			}
+#ifdef CONFIG_SBMAC_SKB_CACHE
+		} else {
+			/* sb_new came from the local cache, init it */
+			memset(sb_new, 0, offsetof(struct sk_buff, truesize));
 
+			/* sb->users is already 1, as is
+			   skb_shinfo(sb)->dataref */
+
+			/* reserve 16 bytes, see __dev_alloc_skb() */
+			sb_new->data = sb_new->tail = sb_new->head + 16;
+  
+			/* are these necessary given that we don't support
+			 * TCP segmentation offload?
+			 */
+			skb_shinfo(sb_new)->gso_size = 0;
+			skb_shinfo(sb_new)->gso_segs = 0;
+		}
+#endif
 		sbdma_align_skb(sb_new, SMP_CACHE_BYTES, ETHER_ALIGN);
 
 		/* mark skbuff owned by our device */
 		sb_new->dev = d->sbdma_eth->sbm_dev;
 	}
-	else {
-		sb_new = sb;
-		/*
-		 * nothing special to reinit buffer, it's already aligned
-		 * and sb->data already points to a good place.
-		 */
-	}
 
 	/*
 	 * fill in the descriptor
@@ -1344,8 +1407,12 @@
 	sbdmadscr_t *dsc;
 	struct sk_buff *sb;
 	unsigned long flags;
-	int packets_handled = 0;
-
+	int out = 0;
+#ifdef CONFIG_SBMAC_SKB_CACHE
+	int i;
+	int size;
+#endif
+ 
 	spin_lock_irqsave(&(sc->sbm_lock), flags);
 
 	if (d->sbdma_remptr == d->sbdma_addptr)
@@ -1354,8 +1421,8 @@
 	hwidx = (int) (((__raw_readq(d->sbdma_curdscr) & M_DMA_CURDSCR_ADDR) -
 			d->sbdma_dscrtable_phys) / sizeof(sbdmadscr_t));
  
-	for (;;) {
-		/*
+	for (;; d->sbdma_remptr = SBDMA_NEXTBUF(d,sbdma_remptr)) {
+ 		/*
 		 * figure out where we are (as an index) and where
 		 * the hardware is (also as an index)
 		 *
@@ -1371,8 +1438,8 @@
 		/*
 		 * If they're the same, that means we've processed all
 		 * of the descriptors up to (but not including) the one that
-		 * the hardware is working on right now.
-		 */
+		 * the hardware was working on when we started.
+ 		 */
 
 		if (curidx == hwidx)
 			break;
@@ -1394,27 +1461,95 @@
 
 		/*
 		 * for transmits, we just free buffers.
-		 */
-
-		dev_kfree_skb_irq(sb);
+		 *
+		 * CONFIG_SBMAC_SKB_CACHE:
+		 * We try to keep an optimized local sk_buff cache
+		 * shared between devices managed by this driver.
+		 * Candidate sk_buff's are linear with enough space
+		 * and reference counts of 1. Note that dataref==1
+		 * also means that 'header cloning' hasn't taken place.
+		 *
+		 * We only access the local sk_buff cache inside the
+		 * device poll routine. However, as this is shared
+		 * between devices and the net_rx_action may run on
+		 * multiple CPUs concurrently, we need a spin lock.
+		 */
+
+		if (poll) {
+			/* Interrupts are known enabled in polling routine */
+
+#ifdef CONFIG_SBMAC_SKB_CACHE
+			if (atomic_read (&sb->users) == 1 &&
+			    atomic_read (&(skb_shinfo(sb)->dataref)) == 1 &&
+			    (size = (sb->end - sb->head)) >= SBMAC_SIZE_REQ &&
+			    sb->data_len == 0 &&
+			    sb->fclone == SKB_FCLONE_UNAVAILABLE &&
+			    /*
+			     * Are these last two checks necessary
+			     * given that data_len == 0 ? In any case,
+			     * we don't seem to handle non-linear sk_buffs
+			     * in sbdma_add_txbuffer(), so it may be
+			     * overly paranoid here...
+			     */
+			    skb_shinfo(sb)->nr_frags == 0 &&
+			    skb_shinfo(sb)->frag_list == 0) {
+				smp_rmb();
+  
+				/*
+				 * XXX - the next several lines are from
+				 * __kfree_skb().
+				 */
 
-		/*
-		 * .. and advance to the next buffer.
-		 */
+				dst_release(sb->dst);
+#ifdef CONFIG_XFRM
+				secpath_put(sb->sp);
+#endif
+				if (sb->destructor) {
+					sb->destructor(sb);
+				}
+#ifdef CONFIG_NETFILTER
+				nf_conntrack_put(sb->nfct);
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+				nf_conntrack_put_reasm(sb->nfct_reasm);
+#endif
 
-		d->sbdma_remptr = SBDMA_NEXTBUF(d,sbdma_remptr);
+#ifdef CONFIG_BRIDGE_NETFILTER
+				nf_bridge_put(sb->nf_bridge);
+#endif
+#endif
+				/* XXX: IS this still necessary? - JHS */
+#ifdef CONFIG_NET_SCHED
+				sb->tc_index = 0;
+#ifdef CONFIG_NET_CLS_ACT
+				sb->tc_verd = 0;
+#endif
+#endif
 
-		packets_handled++;
+				if (spin_trylock(&sbmac_skb.lock)) {
+					if ( (i = sbmac_skb.index) <
+					    SBMAC_SKB_CACHE_SIZE) {
+						sb->truesize = size +
+						    sizeof (struct sk_buff);
+						sbmac_skb.buf[i] = sb;
+						sbmac_skb.index = i + 1;
+						spin_unlock (&sbmac_skb.lock);
+						continue;
+					}
+					spin_unlock (&sbmac_skb.lock);
+				}
 
+				kfree_skbmem (sb);
+			} else  /* SKB_CACHE */
+#endif /* CONFIG_SBMAC_SKB_CACHE */
+				dev_kfree_skb(sb);
+		} else /* Poll */
+			dev_kfree_skb_irq(sb);
 	}
 
-	/*
-	 * Decide if we should wake up the protocol or not.
-	 * Other drivers seem to do this when we reach a low
-	 * watermark on the transmit queue.
-	 */
+	if ( (out = (d->sbdma_addptr - d->sbdma_remptr)) < 0)
+		out += SBMAC_MAX_TXDESCR; /* d->sbdma_maxdescr; */
 
-	if (packets_handled)
+	if (out < SBMAC_MAX_TXDESCR / 2)
 		netif_wake_queue(d->sbdma_eth->sbm_dev);
 
 end_unlock:
@@ -2127,42 +2262,53 @@
 	int handled = 0;
 
 #ifdef CONFIG_SBMAC_NAPI
-		/*
-		 * Read the ISR (this clears the bits in the real
-		 * register, except for counter addr)
-		 */
-
-		isr = __raw_readq(sc->sbm_isr) & ~M_MAC_COUNTER_ADDR;
-
-		if (isr == 0)
-		        return IRQ_RETVAL(0);
-		handled = 1;
-
-		/*
-		 * Transmits on channel 0
-		 */
-
-		if (isr & (M_MAC_INT_CHANNEL << S_MAC_TX_CH0)) {
-			sbdma_tx_process(sc,&(sc->sbm_txdma), 0);
-#ifdef CONFIG_NETPOLL_TRAP
-		       if (netpoll_trap()) {
-			       if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) 
-				       __netif_schedule(dev);
-		       }
-#endif
-		}
-
-	if (isr & (M_MAC_INT_CHANNEL << S_MAC_RX_CH0)) {
+	 /*
+	  * Read the ISR (this clears the bits in the real
+	  * register, except for counter addr)
+	  */
+  		
+	isr = __raw_readq(sc->sbm_isr) & ~M_MAC_COUNTER_ADDR;
+  		
+	if (isr == 0)
+	        return IRQ_RETVAL(0);
+  
+	handled = 1;
+		
+	if (isr & ((M_MAC_INT_CHANNEL << S_MAC_RX_CH0) |
+		   (M_MAC_INT_CHANNEL << S_MAC_TX_CH0))) {
 		if (netif_rx_schedule_prep(dev)) {
+			sc->sbm_events = (uint32_t)isr;
 			__raw_writeq(0, sc->sbm_imr);
 			__netif_rx_schedule(dev);
 			/* Depend on the exit from poll to reenable intr */
-		}
-		else {
-			/* may leave some packets behind */
-			sbdma_rx_process(sc,&(sc->sbm_rxdma),
-					 SBMAC_MAX_RXDESCR * 2, 0);
-		}
+		} else {
+			/* Interrupt already acknowledged, sbmac_poll()
+			 * already scheduled. Do nothing. Shouldn't occur
+			 * anyhow.
+			 */
+#if 0
+			if (isr & (M_MAC_INT_CHANNEL << S_MAC_TX_CH0)) {
+				sbdma_tx_process(sc,&(sc->sbm_txdma), 0);
+#ifdef CONFIG_NETPOLL_TRAP
+				if (netpoll_trap()) {
+					if (test_and_clear_bit(__LINK_STATE_XOFF, 
+							       &dev->state)) 
+						__netif_schedule(dev);
+				}
+#endif
+			}
+			/* Note, if the ISR is allowed to call
+			 * sbdma_rx_process(), provisions have to be made
+			 * when the local SKB_CACHE is used; it presently
+			 * doesn't have protection against accesses from
+			 * interrupt level.
+			 */
+			if (isr & (M_MAC_INT_CHANNEL << S_MAC_RX_CH0)) {
+				/* may leave some packets behind */
+				sbdma_rx_process(sc,&(sc->sbm_rxdma), 32, 0);
+			}
+#endif /* 0 */
+		}
 	}
 #else
 	/* Non NAPI */
@@ -2180,8 +2326,7 @@
 		handled = 1;
 
 		if (isr & (M_MAC_INT_CHANNEL << S_MAC_TX_CH0)) {
-			sbdma_tx_process(sc,&(sc->sbm_txdma),
-					 SBMAC_MAX_RXDESCR * 2);
+			sbdma_tx_process(sc, &(sc->sbm_txdma), 0);
 #ifdef CONFIG_NETPOLL_TRAP
 		       if (netpoll_trap()) {
 			       if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) 
@@ -2232,8 +2377,15 @@
 	struct sbmac_softc *sc = netdev_priv(dev);
 
 	/* lock eth irq */
-	spin_lock_irq (&sc->sbm_lock);
 
+	/* hard_start_xmit() is always called with interrupts enabled */
+	local_irq_disable();
+	if (unlikely (!spin_trylock(&sc->sbm_lock))) {
+		/* collision */
+		local_irq_enable();
+		return NETDEV_TX_LOCKED;
+	}
+ 
 	/*
 	 * Put the buffer on the transmit ring.  If we
 	 * don't have room, stop the queue.
@@ -2244,14 +2396,14 @@
 		netif_stop_queue(dev);
 		spin_unlock_irq(&sc->sbm_lock);
 
-		return 1;
+		return NETDEV_TX_BUSY;
 	}
 
 	dev->trans_start = jiffies;
 
 	spin_unlock_irq (&sc->sbm_lock);
 
-	return 0;
+	return NETDEV_TX_OK;
 }
 
 /**********************************************************************
@@ -2421,7 +2573,7 @@
 	return 0;
 }
 #endif
-
+	
 static int sb1250_change_mtu(struct net_device *_dev, int new_mtu)
 {
 	if (new_mtu >  ENET_PACKET_SIZE)
@@ -2511,6 +2663,8 @@
 	dev->weight             = 16;
 #endif
  
+	dev->features |= NETIF_F_LLTX; /* hard_start_xmit handles locking */
+
 	/* This is needed for PASS2 for Rx H/W checksum feature */
 	sbmac_set_iphdr_offset(sc);
 
@@ -2922,33 +3076,57 @@
 	int work_to_do;
 	int work_done;
 	struct sbmac_softc *sc = netdev_priv(dev);
+	uint32_t events = sc->sbm_events;
+	uint32_t new_events = 0;
 
-	work_to_do = min(*budget, dev->quota);
-	work_done = sbdma_rx_process(sc, &(sc->sbm_rxdma), work_to_do, 1);
+	if (events & (uint32_t)(M_MAC_INT_CHANNEL << S_MAC_RX_CH0)) {
+		work_to_do = dev->quota;
+		work_done = sbdma_rx_process(sc, &(sc->sbm_rxdma),
+					     work_to_do, 1);
 
-	if (work_done > work_to_do)
-		printk(KERN_ERR "%s exceeded work_to_do budget=%d quota=%d work-done=%d\n",
-		       sc->sbm_dev->name, *budget, dev->quota, work_done);
+#if 0
+		if (work_done > work_to_do)
+			printk(KERN_ERR "%s exceeded work_to_do budget=%d "
+			       "quota=%d work-done=%d\n",
+			       sc->sbm_dev->name, *budget, dev->quota,
+			       work_done);
+#endif
+		*budget -= work_done;
+		dev->quota -= work_done;
+		if (work_done >= work_to_do)
+			new_events = (uint32_t)(M_MAC_INT_CHANNEL <<
+						S_MAC_RX_CH0);
+	}
 
-	sbdma_tx_process(sc, &(sc->sbm_txdma), 1);
+	if (events & (uint32_t)(M_MAC_INT_CHANNEL << S_MAC_TX_CH0)) {
+		sbdma_tx_process(sc, &(sc->sbm_txdma), 1);
+	}
 
-	*budget -= work_done;
-	dev->quota -= work_done;
+	/*
+	 * If we found no more TX or RX work to do, don't reschedule,
+	 * reenable interrupts.
+	 */
+	sc->sbm_events = new_events |
+	    ((uint32_t)__raw_readq(sc->sbm_isr) &
+	     (uint32_t) ((M_MAC_INT_CHANNEL << S_MAC_RX_CH0) |
+			 (M_MAC_INT_CHANNEL << S_MAC_TX_CH0)));
 
-	if (work_done < work_to_do) {
+	if (sc->sbm_events == 0) {
 		netif_rx_complete(dev);
 
 #ifdef CONFIG_SBMAC_COALESCE
-		__raw_writeq(((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER) << S_MAC_TX_CH0) |
-			     ((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER) << S_MAC_RX_CH0), 
-			     sc->sbm_imr);
+		__raw_writeq(((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER)
+				<< S_MAC_TX_CH0) |
+			     ((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER)
+				<< S_MAC_RX_CH0), sc->sbm_imr);
 #else
 		__raw_writeq((M_MAC_INT_CHANNEL << S_MAC_TX_CH0) |
-			     (M_MAC_INT_CHANNEL << S_MAC_RX_CH0), sc->sbm_imr);
+			       (M_MAC_INT_CHANNEL << S_MAC_RX_CH0), sc->sbm_imr);
 #endif
+		return 0;
 	}
 
-	return (work_done >= work_to_do);
+	return 1;
 }
 #endif
 
@@ -3072,6 +3250,20 @@
 	return 0;
 }
 
+#ifdef CONFIG_SBMAC_SKB_CACHE
+static void sbmac_skb_cache_flush (void)
+{
+	struct sk_buff * sb;
+	spin_lock (&sbmac_skb.lock);
+	while (sbmac_skb.index > 0) {
+		sb = sbmac_skb.buf [--sbmac_skb.index];
+		spin_unlock (&sbmac_skb.lock); /* probably not necessary */
+		kfree_skbmem (sb);
+		spin_lock (&sbmac_skb.lock);
+	}
+	spin_unlock (&sbmac_skb.lock);
+}
+#endif /* CONFIG_SBMAC_SKB_CACHE */
 
 static void __exit
 sbmac_cleanup_module(void)
@@ -3090,6 +3282,9 @@
 		sbmac_uninitctx(sc);
 		free_netdev(dev);
 	}
+#ifdef CONFIG_SBMAC_SKB_CACHE
+	sbmac_skb_cache_flush();
+#endif
 }
 
 module_init(sbmac_init_module);
