bridge diff needs testing

2011-06-22 Thread Bret S. Lambert
The following diff replaces the existing hash table with a red black
tree, to the acclaim of network hackers worldwide.

This has already received some testing, but needs some more shakedown
in order to make sure that nobody gets hosed by this. Folks with
more esoteric (read: vether and/or gif) setups are especially encouraged
to test, and may receive cookies.

- Bert

Index: if_bridge.h
===
RCS file: /cvs/src/sys/net/if_bridge.h,v
retrieving revision 1.32
diff -u -p -r1.32 if_bridge.h
--- if_bridge.h 28 Oct 2010 13:49:54 -  1.32
+++ if_bridge.h 29 Oct 2010 19:45:38 -
@@ -397,7 +397,7 @@ struct bridge_iflist {
  * Bridge route node
  */
 struct bridge_rtnode {
-   LIST_ENTRY(bridge_rtnode)   brt_next;   /* next in list */
+   RB_ENTRY(bridge_rtnode) brt_entry;  /* entry in rb tree */
struct  ifnet *brt_if;  /* destination ifs */
u_int8_tbrt_flags;  /* address flags */
u_int8_tbrt_age;/* age counter */
@@ -423,7 +423,7 @@ struct bridge_softc {
struct timeout  sc_brtimeout;   /* timeout state */
struct bstp_state   *sc_stp;/* stp state */
LIST_HEAD(, bridge_iflist)  sc_iflist;  /* interface list */
-   LIST_HEAD(, bridge_rtnode)  sc_rts[BRIDGE_RTABLE_SIZE]; /* hash 
table */
+   RB_HEAD(brt_tree, bridge_rtnode) sc_rts;/* route tree */
LIST_HEAD(, bridge_iflist)  sc_spanlist;/* span ports */
 };
 
Index: if_bridge.c
===
RCS file: /cvs/src/sys/net/if_bridge.c,v
retrieving revision 1.186
diff -u -p -r1.186 if_bridge.c
--- if_bridge.c 28 Oct 2010 19:00:57 -  1.186
+++ if_bridge.c 29 Oct 2010 19:45:38 -
@@ -45,6 +45,7 @@
 #include sys/ioctl.h
 #include sys/errno.h
 #include sys/kernel.h
+#include sys/tree.h
 #include machine/cpu.h
 
 #include net/if.h
@@ -135,7 +136,7 @@ int bridge_rtfind(struct bridge_softc *,
 void   bridge_rtage(struct bridge_softc *);
 void   bridge_rttrim(struct bridge_softc *);
 intbridge_rtdaddr(struct bridge_softc *, struct ether_addr *);
-intbridge_rtflush(struct bridge_softc *, int);
+void   bridge_rtflush(struct bridge_softc *, int);
 struct ifnet * bridge_rtupdate(struct bridge_softc *,
 struct ether_addr *, struct ifnet *ifp, int, u_int8_t);
 struct ifnet * bridge_rtlookup(struct bridge_softc *,
@@ -180,6 +181,16 @@ LIST_HEAD(, bridge_softc) bridge_list;
 struct if_clone bridge_cloner =
 IF_CLONE_INITIALIZER(bridge, bridge_clone_create, bridge_clone_destroy);
 
+static __inline int brt_cmp(struct bridge_rtnode *, struct bridge_rtnode *);
+RB_PROTOTYPE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+RB_GENERATE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+
+static __inline int
+brt_cmp(struct bridge_rtnode *a, struct bridge_rtnode *b)
+{
+   return (memcmp(a-brt_addr, b-brt_addr, sizeof(a-brt_addr)));
+}
+
 /* ARGSUSED */
 void
 bridgeattach(int n)
@@ -194,7 +205,7 @@ bridge_clone_create(struct if_clone *ifc
 {
struct bridge_softc *sc;
struct ifnet *ifp;
-   int i, s;
+   int s;
 
sc = malloc(sizeof(*sc), M_DEVBUF, M_NOWAIT|M_ZERO);
if (!sc)
@@ -211,8 +222,7 @@ bridge_clone_create(struct if_clone *ifc
timeout_set(sc-sc_brtimeout, bridge_timer, sc);
LIST_INIT(sc-sc_iflist);
LIST_INIT(sc-sc_spanlist);
-   for (i = 0; i  BRIDGE_RTABLE_SIZE; i++)
-   LIST_INIT(sc-sc_rts[i]);
+   RB_INIT(sc-sc_rts);
sc-sc_hashkey = arc4random();
ifp = sc-sc_if;
snprintf(ifp-if_xname, sizeof ifp-if_xname, %s%d, ifc-ifc_name,
@@ -560,7 +570,7 @@ bridge_ioctl(struct ifnet *ifp, u_long c
if ((error = suser(curproc, 0)) != 0)
break;
 
-   error = bridge_rtflush(sc, req-ifbr_ifsflags);
+   bridge_rtflush(sc, req-ifbr_ifsflags);
break;
case SIOCBRDGSADDR:
if ((error = suser(curproc, 0)) != 0)
@@ -1003,7 +1013,7 @@ bridge_output(struct ifnet *ifp, struct 
struct ifnet *dst_if;
struct ether_addr *dst;
struct bridge_softc *sc;
-   int s, error, len;
+   int s, error;
 #ifdef IPSEC
struct m_tag *mtag;
 #endif /* IPSEC */
@@ -1109,33 +1119,12 @@ bridge_output(struct ifnet *ifp, struct 
used = 1;
mc = m;
} else {
-   struct mbuf *m1, *m2, *mx;
-
-   m1 = m_copym2(m, 0, ETHER_HDR_LEN,
-   M_DONTWAIT);
-   if (m1 == NULL) {
-   sc-sc_if.if_oerrors++;
-   continue;
- 

pf ip reassembly test awesomization

2011-06-22 Thread Bret S. Lambert
There's no need to walk the entire fragment list to determine
if we have all the bytes of the unfragmented packet if we keep
a running count on how much we've seen already.

Please to be testing as we need to make sure that our userbase
remains unmolested.

- Bert

Index: pf_norm.c
===
RCS file: /cvs/src/sys/net/pf_norm.c,v
retrieving revision 1.132
diff -u -p -r1.132 pf_norm.c
--- pf_norm.c   23 Apr 2011 10:00:36 -  1.132
+++ pf_norm.c   16 May 2011 15:35:20 -
@@ -97,6 +97,7 @@ struct pf_fragment {
TAILQ_ENTRY(pf_fragment) frag_next;
u_int32_t   fr_timeout;
u_int16_t   fr_maxlen;  /* maximum length of single fragment */
+   u_int16_t   fr_qlen;/* length of current queue */
TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
 };
 
@@ -322,6 +323,7 @@ pf_fillup_fragment(struct pf_fragment_cm
*(struct pf_fragment_cmp *)frag = *key;
frag-fr_timeout = time_second;
frag-fr_maxlen = frent-fe_len;
+   frag-fr_qlen = frent-fe_len;
TAILQ_INIT(frag-fr_queue);
 
RB_INSERT(pf_frag_tree, pf_frag_tree, frag);
@@ -401,6 +403,9 @@ pf_fillup_fragment(struct pf_fragment_cm
pf_nfrents--;
}
 
+   /* Adjust queue length */
+   frag-fr_qlen += frent-fe_len;
+
if (prev == NULL)
TAILQ_INSERT_HEAD(frag-fr_queue, frent, fr_next);
else
@@ -419,8 +424,7 @@ pf_fillup_fragment(struct pf_fragment_cm
 int
 pf_isfull_fragment(struct pf_fragment *frag)
 {
-   struct pf_frent *frent, *next;
-   u_int16_toff, total;
+   u_int16_ttotal;
 
/* Check if we are completely reassembled */
if (TAILQ_LAST(frag-fr_queue, pf_fragq)-fe_mff)
@@ -431,22 +435,14 @@ pf_isfull_fragment(struct pf_fragment *f
TAILQ_LAST(frag-fr_queue, pf_fragq)-fe_len;
 
/* Check if we have all the data */
-   off = 0;
-   for (frent = TAILQ_FIRST(frag-fr_queue); frent; frent = next) {
-   next = TAILQ_NEXT(frent, fr_next);
-
-   off += frent-fe_len;
-   if (off  total  (next == NULL || next-fe_off != off)) {
-   DPFPRINTF(LOG_NOTICE,
-   missing fragment at %d, next %d, total %d,
-   off, next == NULL ? -1 : next-fe_off, total);
-   return (0);
-   }
-   }
-   DPFPRINTF(LOG_NOTICE, %d  %d?, off, total);
-   if (off  total)
+   if (frag-fr_qlen  total) {
+   DPFPRINTF(LOG_NOTICE,
+   missing fragment: need %d have %d, total, frag-fr_qlen);
return (0);
-   KASSERT(off == total);
+   }
+
+   DPFPRINTF(LOG_NOTICE, %d  %d?, frag-fr_qlen, total);
+   KASSERT(frag-fr_qlen  == total);
 
return (1);
 }



Re: bridge diff needs testing

2011-06-22 Thread Bret S. Lambert
On Wed, Jun 22, 2011 at 01:21:28PM +0200, Camiel Dobbelaar wrote:
 
 Hi Bret,
 
 one comment from inspection, is the part below related to the RB tree?
 Or a seperate optimization?
 
 On 22-6-2011 12:21, Bret S. Lambert wrote:
 
  @@ -1109,33 +1119,12 @@ bridge_output(struct ifnet *ifp, struct 
  used = 1;
  mc = m;
  } else {
  -   struct mbuf *m1, *m2, *mx;
  -
  -   m1 = m_copym2(m, 0, ETHER_HDR_LEN,
  -   M_DONTWAIT);
  -   if (m1 == NULL) {
  -   sc-sc_if.if_oerrors++;
  -   continue;
  -   }
  -   m2 = m_copym2(m, ETHER_HDR_LEN,
  +   mc = m_copym2(m, 0,
  M_COPYALL, M_DONTWAIT);
  -   if (m2 == NULL) {
  -   m_freem(m1);
  +   if (mc == NULL) {
  sc-sc_if.if_oerrors++;
  continue;
  }
  -
  -   for (mx = m1; mx-m_next != NULL; mx = 
  mx-m_next)
  -   /*EMPTY*/;
  -   mx-m_next = m2;
  -
  -   if (m1-m_flags  M_PKTHDR) {
  -   len = 0;
  -   for (mx = m1; mx != NULL; mx = 
  mx-m_next)
  -   len += mx-m_len;
  -   m1-m_pkthdr.len = len;
  -   }
  -   mc = m1;
  }
   
  error = bridge_ifenqueue(sc, dst_if, mc);
 

Ack; had sent an older diff before I had ripped this out. Please ignore,
and use the following updated diff. Apologies for the noise.

Index: if_bridge.h
===
RCS file: /cvs/src/sys/net/if_bridge.h,v
retrieving revision 1.34
diff -u -p -r1.34 if_bridge.h
--- if_bridge.h 20 Nov 2010 14:23:09 -  1.34
+++ if_bridge.h 22 Jun 2011 12:46:52 -
@@ -396,7 +396,7 @@ struct bridge_iflist {
  * Bridge route node
  */
 struct bridge_rtnode {
-   LIST_ENTRY(bridge_rtnode)   brt_next;   /* next in list */
+   RB_ENTRY(bridge_rtnode) brt_entry;  /* entry in rb tree */
struct  ifnet *brt_if;  /* destination ifs */
u_int8_tbrt_flags;  /* address flags */
u_int8_tbrt_age;/* age counter */
@@ -422,7 +422,7 @@ struct bridge_softc {
struct timeout  sc_brtimeout;   /* timeout state */
struct bstp_state   *sc_stp;/* stp state */
LIST_HEAD(, bridge_iflist)  sc_iflist;  /* interface list */
-   LIST_HEAD(, bridge_rtnode)  sc_rts[BRIDGE_RTABLE_SIZE]; /* hash 
table */
+   RB_HEAD(brt_tree, bridge_rtnode) sc_rts;/* route tree */
LIST_HEAD(, bridge_iflist)  sc_spanlist;/* span ports */
 };
 
Index: if_bridge.c
===
RCS file: /cvs/src/sys/net/if_bridge.c,v
retrieving revision 1.188
diff -u -p -r1.188 if_bridge.c
--- if_bridge.c 4 Nov 2010 23:07:15 -   1.188
+++ if_bridge.c 22 Jun 2011 12:46:52 -
@@ -45,6 +45,7 @@
 #include sys/ioctl.h
 #include sys/errno.h
 #include sys/kernel.h
+#include sys/tree.h
 #include machine/cpu.h
 
 #include net/if.h
@@ -135,7 +136,7 @@ int bridge_rtfind(struct bridge_softc *,
 void   bridge_rtage(struct bridge_softc *);
 void   bridge_rttrim(struct bridge_softc *);
 intbridge_rtdaddr(struct bridge_softc *, struct ether_addr *);
-intbridge_rtflush(struct bridge_softc *, int);
+void   bridge_rtflush(struct bridge_softc *, int);
 struct ifnet * bridge_rtupdate(struct bridge_softc *,
 struct ether_addr *, struct ifnet *ifp, int, u_int8_t);
 struct ifnet * bridge_rtlookup(struct bridge_softc *,
@@ -180,6 +181,16 @@ LIST_HEAD(, bridge_softc) bridge_list;
 struct if_clone bridge_cloner =
 IF_CLONE_INITIALIZER(bridge, bridge_clone_create, bridge_clone_destroy);
 
+static __inline int brt_cmp(struct bridge_rtnode *, struct bridge_rtnode *);
+RB_PROTOTYPE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+RB_GENERATE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+
+static __inline int
+brt_cmp(struct bridge_rtnode *a, struct bridge_rtnode *b)
+{
+   return (memcmp(a-brt_addr, b-brt_addr, sizeof(a-brt_addr)));
+}
+
 /* ARGSUSED */
 void
 bridgeattach(int n)
@@ -193,7 +204,7 @@ bridge_clone_create(struct if_clone *ifc
 {
struct bridge_softc *sc;
struct ifnet *ifp;
-   int i, s;
+   int s;
 
sc = malloc(sizeof(*sc), M_DEVBUF, M_NOWAIT

disgusting routing/network macro cleanup...now with booting kernels

2011-04-04 Thread Bret S. Lambert
Cleanup of some horrible macros pulled into the network code
from the horrible radix code.

Now boots and runs! And sends email!

Hi, otto@!

Index: net/radix.c
===
RCS file: /cvs/src/sys/net/radix.c,v
retrieving revision 1.28
diff -u -p -r1.28 radix.c
--- net/radix.c 22 Aug 2010 17:02:04 -  1.28
+++ net/radix.c 4 Apr 2011 18:29:51 -
@@ -413,9 +413,9 @@ rn_addmask(void *n_arg, int search, int 
if (mlen = skip)
return (mask_rnhead-rnh_nodes);
if (skip  1)
-   Bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
+   bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
if ((m0 = mlen)  skip)
-   Bcopy(netmask + skip, addmask_key + skip, mlen - skip);
+   bcopy(netmask + skip, addmask_key + skip, mlen - skip);
/*
 * Trim trailing zeroes.
 */
@@ -428,23 +428,23 @@ rn_addmask(void *n_arg, int search, int 
return (mask_rnhead-rnh_nodes);
}
if (m0  last_zeroed)
-   Bzero(addmask_key + m0, last_zeroed - m0);
+   bzero(addmask_key + m0, last_zeroed - m0);
*addmask_key = last_zeroed = mlen;
x = rn_search(addmask_key, rn_masktop);
if (Bcmp(addmask_key, x-rn_key, mlen) != 0)
x = 0;
if (x || search)
return (x);
-   R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
+   x = malloc(max_keylen + 2 * sizeof(*x), M_RTABLE, M_DONTWAIT);
if ((saved_x = x) == 0)
return (0);
-   Bzero(x, max_keylen + 2 * sizeof (*x));
+   bzero(x, (unsigned)(max_keylen + 2 * sizeof(*x)));
netmask = cp = (caddr_t)(x + 2);
-   Bcopy(addmask_key, cp, mlen);
+   bcopy(addmask_key, cp, mlen);
x = rn_insert(cp, mask_rnhead, maskduplicated, x);
if (maskduplicated) {
log(LOG_ERR, rn_addmask: mask impossibly already in tree\n);
-   Free(saved_x);
+   free(saved_x, M_RTABLE);
return (x);
}
/*
@@ -500,7 +500,7 @@ rn_new_radix_mask(struct radix_node *tt,
log(LOG_ERR, Mask for route not entered\n);
return (0);
}
-   Bzero(m, sizeof *m);
+   bzero(m, sizeof *m);
m-rm_b = tt-rn_b;
m-rm_flags = tt-rn_flags;
if (tt-rn_flags  RNF_NORMAL)
@@ -1003,7 +1003,7 @@ rn_inithead(void **head, int off)
 
if (*head)
return (1);
-   R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh));
+   rnh = malloc(sizeof(*rnh), M_RTABLE, M_DONTWAIT);
if (rnh == 0)
return (0);
*head = rnh;
@@ -1015,7 +1015,7 @@ rn_inithead0(struct radix_node_head *rnh
 {
struct radix_node *t, *tt, *ttt;
 
-   Bzero(rnh, sizeof (*rnh));
+   bzero(rnh, sizeof (*rnh));
t = rn_newpair(rn_zeros, off, rnh-rnh_nodes);
ttt = rnh-rnh_nodes + 2;
t-rn_r = ttt;
@@ -1050,10 +1050,10 @@ rn_init()
rn_init: radix functions require max_keylen be set\n);
return;
}
-   R_Malloc(rn_zeros, char *, 3 * max_keylen);
+   rn_zeros = malloc(3 * max_keylen, M_RTABLE, M_DONTWAIT);
if (rn_zeros == NULL)
panic(rn_init);
-   Bzero(rn_zeros, 3 * max_keylen);
+   bzero(rn_zeros, 3 * max_keylen);
rn_ones = cp = rn_zeros + max_keylen;
addmask_key = cplim = rn_ones + max_keylen;
while (cp  cplim)
Index: net/radix.h
===
RCS file: /cvs/src/sys/net/radix.h,v
retrieving revision 1.16
diff -u -p -r1.16 radix.h
--- net/radix.h 28 Jun 2010 18:50:37 -  1.16
+++ net/radix.h 4 Apr 2011 18:29:51 -
@@ -98,7 +98,7 @@ extern struct radix_mask {
m = rn_mkfreelist;  \
rn_mkfreelist = (m)-rm_mklist; \
} else  \
-   R_Malloc(m, struct radix_mask *, sizeof (*(m)));\
+   m = malloc(sizeof(*(m)), M_RTABLE, M_DONTWAIT); \
 } while (0)
 
 #define MKFree(m) do { \
@@ -132,11 +132,6 @@ struct radix_node_head {
 };
 
 #ifdef _KERNEL
-#define Bcmp(a, b, n) bcmp(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n))
-#define Bcopy(a, b, n) bcopy(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n))
-#define Bzero(p, n) bzero((caddr_t)(p), (unsigned)(n));
-#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, 
M_DONTWAIT))
-#define Free(p) free((caddr_t)p, M_RTABLE);
 
 void   rn_init(void);
 intrn_inithead(void **, int);
Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.130
diff -u -p -r1.130 route.c
--- net/route.c 4 Apr 2011 

Re: NFS writes lock up system with -o tcp,-w32768

2011-03-30 Thread Bret S. Lambert
On Wed, Mar 30, 2011 at 09:54:45PM +0200, Claudio Jeker wrote:
 On Wed, Mar 30, 2011 at 08:36:45PM +0200, Walter Haidinger wrote:
  Am 30.03.2011 15:23, schrieb Claudio Jeker:
   Buffers below 8k are stupid. For TCP just use 32k or even 64k. 512byte
   buffers are silly. They get internally rounded up since the smallest
   packet seems to be 512bytes data plus header. This will give you TCP send
   and recv buffers of around 1200bytes. No wonder it is slow as hell.
  
  Throughput isn't the issue. The system gets unusable with sizes  2048.
  The machine freezes, it takes a couple of seconds for the next shell
  prompt to appear, like under really heavy load (I'd say way 30). 
  
  Of course bufsizes that small make no sense and your patch eliminates
  the lock ups, but the they show there is still some bug. I'd expect slow
  nfs transfers but not the behavior as if under heavy load. (*)
 
 NFS is a strange beast and I guess running with to small buffers results
 in such side effects. This has nothing todo with the buffer scaling but
 more with the way NFS works.

NFS has enough bugs to open a special exhibit at the zoo.

I have no idea if I'll ever have enough courage to dive back into it again.

 
  This is just to let to know, maybe you want to have a further look.
 
 I'm not interested. Maybe someone else likes to dig deep into NFS.
 I guess there is a reason why the default is 8k.
  
  Why did I test with small buffer sizes too? Well, I got another email
  which said about the mount options, obviously regarding the buffer sizes:
  
  When you jackfuck that knob with other values, what is the result?
   Troubleshooting isn't only for others, son!
   
  A reminder that this is an OpenBSD list... ;-)
  Luckily I always make sure to have my asbestos on when dealing with!
 
 -- 
 :wq Claudio



move bridge to rb-tree vice hash

2011-03-29 Thread Bret S. Lambert
Please test to make sure this breaks no setups and/or pleases
you aesthetically as it does me.

- Bert

Index: net/if_bridge.c
===
RCS file: /cvs/src/sys/net/if_bridge.c,v
retrieving revision 1.188
diff -u -p -r1.188 if_bridge.c
--- net/if_bridge.c 4 Nov 2010 23:07:15 -   1.188
+++ net/if_bridge.c 29 Mar 2011 19:51:56 -
@@ -45,6 +45,7 @@
 #include sys/ioctl.h
 #include sys/errno.h
 #include sys/kernel.h
+#include sys/tree.h
 #include machine/cpu.h
 
 #include net/if.h
@@ -135,7 +136,7 @@ int bridge_rtfind(struct bridge_softc *,
 void   bridge_rtage(struct bridge_softc *);
 void   bridge_rttrim(struct bridge_softc *);
 intbridge_rtdaddr(struct bridge_softc *, struct ether_addr *);
-intbridge_rtflush(struct bridge_softc *, int);
+void   bridge_rtflush(struct bridge_softc *, int);
 struct ifnet * bridge_rtupdate(struct bridge_softc *,
 struct ether_addr *, struct ifnet *ifp, int, u_int8_t);
 struct ifnet * bridge_rtlookup(struct bridge_softc *,
@@ -180,6 +181,16 @@ LIST_HEAD(, bridge_softc) bridge_list;
 struct if_clone bridge_cloner =
 IF_CLONE_INITIALIZER(bridge, bridge_clone_create, bridge_clone_destroy);
 
+static __inline int brt_cmp(struct bridge_rtnode *, struct bridge_rtnode *);
+RB_PROTOTYPE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+RB_GENERATE(brt_tree, bridge_rtnode, brt_entry, brt_cmp);
+
+static __inline int
+brt_cmp(struct bridge_rtnode *a, struct bridge_rtnode *b)
+{
+   return (memcmp(a-brt_addr, b-brt_addr, sizeof(a-brt_addr)));
+}
+
 /* ARGSUSED */
 void
 bridgeattach(int n)
@@ -193,7 +204,7 @@ bridge_clone_create(struct if_clone *ifc
 {
struct bridge_softc *sc;
struct ifnet *ifp;
-   int i, s;
+   int s;
 
sc = malloc(sizeof(*sc), M_DEVBUF, M_NOWAIT|M_ZERO);
if (!sc)
@@ -210,8 +221,7 @@ bridge_clone_create(struct if_clone *ifc
timeout_set(sc-sc_brtimeout, bridge_timer, sc);
LIST_INIT(sc-sc_iflist);
LIST_INIT(sc-sc_spanlist);
-   for (i = 0; i  BRIDGE_RTABLE_SIZE; i++)
-   LIST_INIT(sc-sc_rts[i]);
+   RB_INIT(sc-sc_rts);
sc-sc_hashkey = arc4random();
ifp = sc-sc_if;
snprintf(ifp-if_xname, sizeof ifp-if_xname, %s%d, ifc-ifc_name,
@@ -559,7 +569,7 @@ bridge_ioctl(struct ifnet *ifp, u_long c
if ((error = suser(curproc, 0)) != 0)
break;
 
-   error = bridge_rtflush(sc, req-ifbr_ifsflags);
+   bridge_rtflush(sc, req-ifbr_ifsflags);
break;
case SIOCBRDGSADDR:
if ((error = suser(curproc, 0)) != 0)
@@ -1774,151 +1784,46 @@ struct ifnet *
 bridge_rtupdate(struct bridge_softc *sc, struct ether_addr *ea,
 struct ifnet *ifp, int setflags, u_int8_t flags)
 {
-   struct bridge_rtnode *p, *q;
-   u_int32_t h;
-   int dir;
-
-   h = bridge_hash(sc, ea);
-   p = LIST_FIRST(sc-sc_rts[h]);
-   if (p == LIST_END(sc-sc_rts[h])) {
+   struct bridge_rtnode *brt, tmp;
+
+   bcopy(ea, tmp.brt_addr, sizeof(*ea));
+
+   if ((brt = RB_FIND(brt_tree, sc-sc_rts, tmp)) == NULL) {
+
if (sc-sc_brtcnt = sc-sc_brtmax)
-   goto done;
-   p = malloc(sizeof(*p), M_DEVBUF, M_NOWAIT);
-   if (p == NULL)
-   goto done;
+   goto fail;
+   if ((brt = malloc(sizeof(*brt), M_DEVBUF, M_NOWAIT)) == NULL)
+   goto fail;
 
-   bcopy(ea, p-brt_addr, sizeof(p-brt_addr));
-   p-brt_if = ifp;
-   p-brt_age = 1;
+   bcopy(ea, brt-brt_addr, sizeof(brt-brt_addr));
+   brt-brt_if = ifp;
+   brt-brt_age = 1;
 
if (setflags)
-   p-brt_flags = flags;
+   brt-brt_flags = flags;
else
-   p-brt_flags = IFBAF_DYNAMIC;
+   brt-brt_flags = IFBAF_DYNAMIC;
 
-   LIST_INSERT_HEAD(sc-sc_rts[h], p, brt_next);
+   RB_INSERT(brt_tree, sc-sc_rts, brt);
sc-sc_brtcnt++;
-   goto want;
}
 
-   do {
-   q = p;
-   p = LIST_NEXT(p, brt_next);
-
-   dir = memcmp(ea, q-brt_addr, sizeof(q-brt_addr));
-   if (dir == 0) {
-   if (setflags) {
-   q-brt_if = ifp;
-   q-brt_flags = flags;
-   } else if (!(q-brt_flags  IFBAF_STATIC))
-   q-brt_if = ifp;
-
-   if (q-brt_if == ifp)
-   q-brt_age = 1;
-   ifp = q-brt_if;
-   goto want;
-   }
-
-   if (dir  0) {
-   if (sc-sc_brtcnt = sc-sc_brtmax)
-   goto done;
-   

Re: SCHED_ASSERT_UNLOCKED is considered harmful in the _kernel_lock()

2010-11-05 Thread Bret S. Lambert
Well, it's definitely a race, and the potential deadlock is handled
in mi_switch(), so my vote is for good intention, bad judgement
on the assertions in kern_lock.c

On Fri, Nov 05, 2010 at 05:52:23PM +0100, Mike Belopuhov wrote:
 hi,
 
 we've finally got an interesting panic that shed some light on the
 'kernel diagnostic assertion __mp_lock_held(sched_lock) == 0 failed'
 panics that plagued some heavily loaded systems.
 
 apparently, there's a window in mi_switch (right after cpu_switchto) that
 has sched_lock held but kernel_lock released, thus allowing other cpus
 to take advantage of the kernel_lock:
 
   cpu_switchto(p, nextproc);
   } else {
   p-p_stat = SONPROC;
   }
 
   clear_resched(curcpu());
 
   SCHED_ASSERT_LOCKED();
 
   /*
* To preserve lock ordering, we need to release the sched lock
* and grab it after we grab the big lock.
* In the future, when the sched lock isn't recursive, we'll
* just release it here.
*/
   #ifdef MULTIPROCESSOR
   __mp_unlock(sched_lock);
   #endif
 
 unfortunately they can't do this because we check for a sched_lock
 being held:
 
   void
   _kernel_lock(void)
   {
   SCHED_ASSERT_UNLOCKED();
   __mp_lock(kernel_lock);
   }
 
 it should be safe to take kernel_lock there because mi_switch:
 1) doesn't do anything nasty there;
 2) figures out whether process should take a kernel_lock later on:
 
   #ifdef MULTIPROCESSOR
   /*
* Reacquire the kernel_lock now.  We do this after we've
* released the scheduler lock to avoid deadlock, and before
* we reacquire the interlock and the scheduler lock.
*/
   if (p-p_flag  P_BIGLOCK)
   __mp_acquire_count(kernel_lock, hold_count);
   __mp_acquire_count(sched_lock, sched_count + 1);
   #endif
 
 opinions?  should we just remove an assertion?
 or am i trippin' on some high quality drugs? (:
 
 ddb log session:
 
 ddb{0} tr
 Debugger(d08cc9e6,e08b7e34,d08aaf94,e08b7e34,d0a44e28) at Debugger+0x4
 panic(d08aaf94,d08346ee,d08a80e0,d08a83ed,16a) at panic+0x5d
 __assert(d08346ee,d08a83ed,16a,d08a80e0,0) at __assert+0x2e
 _kernel_lock(1,0,d09d15cc,7a120,0) at _kernel_lock+0x48
 trap() at trap+0x5fb
 --- trap (number -547639296) ---
 Bad frame pointer: 0xd0ad48a0
 0:
 ddb{0} show panic
 kernel diagnostic assertion __mp_lock_held(sched_lock) == 0 failed:
 file ..
 /../../../kern/kern_lock.c, line 362
 ddb{0} show ps
 No such command
 ddb{0} ps  
PID   PPID   PGRPUID  S   FLAGS  WAIT  COMMAND
   2111  1   2111  0  3   0x2040180  selectsendmail
  12050  1  12050  0  3   0x2004080  ttyin getty
   3065  1   3065  0  3   0x2004080  ttyin getty
   2937  1   2937  0  3   0x2004080  ttyin getty
  15153  1  15153  0  3   0x2004080  ttyin getty
  24343  1  24343  0  3   0x2004080  ttyin getty
631  1631  0  3   0x2004080  ttyin getty
  18404  1  18404  0  3   0x280  selectcron
   7380  1   7380  0  3   0x2000180  selectinetd
  23123  1  23123  0  3   0x280  selectsshd
   8900  27266  27156 83  3   0x2000180  poll  ntpd
  27266  27156  27156 83  3   0x2000180  poll  ntpd
  27156  1  27156  0  3   0x280  poll  ntpd
  22867  2  2 74  3   0x2000180  bpf   pflogd
  2  1  2  0  3   0x280  netio pflogd
  15707  20626  20626 73  3   0x2000180  poll  syslogd
  20626  1  20626  0  3   0x288  netio syslogd
  29035  1  29035 77  3   0x2000180  poll  dhclient
  19499  1  27610  0  3   0x280  poll  dhclient
   7362  1   7362 77  3   0x2000180  poll  dhclient
  22676  1  27610  0  3   0x280  poll  dhclient
 19  0  0  0  3   0x2100200  aiodoned  aiodoned
 18  0  0  0  3   0x2100200  syncerupdate
 17  0  0  0  3   0x2100200  cleaner   cleaner
 16  0  0  0  30x100200  reaperreaper
 15  0  0  0  3   0x2100200  pgdaemon  pagedaemon
 14  0  0  0  3   0x2100200  bored crypto
 13  0  0  0  3   0x2100200  pftm  pfpurge
 12  0  0  0  3   0x2100200  usbtskusbtask
 11  0  0  0  3   0x2100200  usbatsk   usbatsk
 10  0  0  0  3   0x2100200  acpi0 acpi0
  9  0  0  0  7  0x40100200idle5
  8  0  0  0  7  0x40100200idle4
  7  0 

Re: fold -b1 segmentation fault

2010-10-22 Thread Bret S. Lambert
On Thu, Oct 21, 2010 at 05:57:24PM +0100, Jason McIntyre wrote:
 in case anyone's interested, and as reported in a recent freebsd pr:
 
   $ fold -b1
   Segmentation fault (core dumped)
 
 no fix is provided.
 
 jmc
 
 Number: 151592
 Category:   misc
 Synopsis:   'fold' segfaults on argument processing
 Confidential:   no
 Severity:   non-critical
 Priority:   low
 Responsible:freebsd-bugs
 State:  open
 Quarter:
 Keywords:   
 Date-Required:
 Class:  sw-bug
 Submitter-Id:   current-users
 Arrival-Date:   Wed Oct 20 03:50:08 UTC 2010
 Closed-Date:
 Last-Modified:
 Originator: Marcus Reid
 Release:8.1-STABLE
 Organization:
 Environment:
 FreeBSD austin.sea.netifice.com 8.1-STABLE FreeBSD 8.1-STABLE #0:
 Mon Sep 20 23:53:47 PDT 2010
 r...@austin.sea.netifice.com:/usr/obj/usr/src/sys/FARK  amd64
 Description:
 The 'fold' utility reads past the end of a buffer if arguments are
 incorrectly specified.  If you pass an argument to '-b' that happens
 to be in the character set 0123456789, line 101 reads past the
 end of argv[optind] and causes a segmentation fault.
 How-To-Repeat:
 Simply run 'fold -b1'
 Fix:
 I don't really see what the author is intending to do in that case
 statement where it breaks.
 
 Release-Note:
 Audit-Trail:
 Unformatted:
 

Fix appears to be to let getopt handle it, since allowing 0-9 as an
argument is not documented:


Index: fold.c
===
RCS file: /cvs/src/usr.bin/fold/fold.c,v
retrieving revision 1.12
diff -u -p -r1.12 fold.c
--- fold.c  27 Oct 2009 23:59:38 -  1.12
+++ fold.c  21 Oct 2010 17:26:24 -
@@ -57,7 +57,7 @@ main(int argc, char *argv[])
const char *errstr;
 
width = -1;
-   while ((ch = getopt(argc, argv, 0123456789bsw:)) != -1)
+   while ((ch = getopt(argc, argv, bsw:)) != -1)
switch (ch) {
case 'b':
count_bytes = 1;
@@ -70,21 +70,6 @@ main(int argc, char *argv[])
if (errstr != NULL)
errx(1, illegal width value, %s: %s, errstr, 
optarg);
-   break;
-   case '0': case '1': case '2': case '3': case '4':
-   case '5': case '6': case '7': case '8': case '9':
-   if (width == -1) {
-   p = argv[optind - 1];
-   if (p[0] == '-'  p[1] == ch  !p[2])
-   w = ++p;
-   else
-   w = argv[optind] + 1;
-
-   width = strtonum(w, 1, INT_MAX, errstr);
-   if (errstr != NULL)
-   errx(1, illegal width value, %s: %s, 
-   errstr, optarg);
-   }
break;
default:
(void)fprintf(stderr,



Re: Testers needed for strict locking diff; esp i386, amd64, softraid

2010-09-24 Thread Bret S. Lambert
On Thu, Sep 23, 2010 at 10:49:01PM -0700, Matthew Dempsky wrote:
 I'd like to commit this.  I've received positive reports from a few
 amd64 users and an i386 and softraid user, and all of the locking bugs
 exposed so far have already been fixed.

It's already exposed bugs in limited testing, so casting a wider net
(i.e., getting it into snaps) with it makes sense to me.

It also reminds me that I at one point intended to figure out how to
support enforcing dependencies between mutexes...

 
 I plan to remove the #define panic() hacks and let future locking
 problems actually panic; if anyone thinks they should stay in for a
 bit longer, let me know.
 
 ok?
 
 
 On Mon, Sep 20, 2010 at 08:00:27PM -0700, Matthew Dempsky wrote:
  I'd really appreciate if people could test the diffs below and report
  back to me.  This adds some warnings about lock API misuse that has
  already caught two bugs in DRM.  Eventually, I want these to be panics
  instead, but using just warnings for now should make it easier for
  people to test.
  
  Please apply the diff below to an up-to-date -current kernel and check
  dmesg for any stack backtraces.  If you see any, please send them to
  me.
  
  I'd especially like reports from both i386 and amd64, and also from
  softraid users.
  
  
  More details on the diff: this adds per-processor mutex lock counting
  to i386 and amd64, and updates assertwaitok() to check that this value
  is 0.  They also now check that the process that calls mtx_leave() is
  the same that called mtx_enter().  rw_exit*() are changed to call
  rw_assert_{rd,wr}lock() as appropriate.  Finally, mi_switch() also
  calls assertwaitok() now.
  
  
  Index: kern/subr_xxx.c
  ===
  RCS file: /cvs/src/sys/kern/subr_xxx.c,v
  retrieving revision 1.10
  diff -u -p -r1.10 subr_xxx.c
  --- kern/subr_xxx.c 21 Sep 2010 01:09:10 -  1.10
  +++ kern/subr_xxx.c 21 Sep 2010 01:45:59 -
  @@ -153,6 +153,9 @@ blktochr(dev_t dev)
  return (NODEV);
   }
   
  +#include ddb/db_output.h
  +#define panic(x...) do { printf(x); db_stack_dump(); } while (0);
  +
   /*
* Check that we're in a context where it's okay to sleep.
*/
  @@ -160,4 +163,9 @@ void
   assertwaitok(void)
   {
  splassert(IPL_NONE);
  +#ifdef __HAVE_CPU_MUTEX_LEVEL
  +   if (curcpu()-ci_mutex_level != 0)
  +   panic(assertwaitok: non-zero mutex count: %d,
  +   curcpu()-ci_mutex_level);
  +#endif
   }
  Index: kern/kern_rwlock.c
  ===
  RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
  retrieving revision 1.15
  diff -u -p -r1.15 kern_rwlock.c
  --- kern/kern_rwlock.c  13 Aug 2009 23:12:15 -  1.15
  +++ kern/kern_rwlock.c  21 Sep 2010 01:45:59 -
  @@ -107,6 +107,8 @@ rw_exit_read(struct rwlock *rwl)
   {
  unsigned long owner = rwl-rwl_owner;
   
  +   rw_assert_rdlock(rwl);
  +
  if (__predict_false((owner  RWLOCK_WAIT) ||
  rw_cas(rwl-rwl_owner, owner, owner - RWLOCK_READ_INCR)))
  rw_exit(rwl);
  @@ -117,6 +119,8 @@ rw_exit_write(struct rwlock *rwl)
   {
  unsigned long owner = rwl-rwl_owner;
   
  +   rw_assert_wrlock(rwl);
  +
  if (__predict_false((owner  RWLOCK_WAIT) ||
  rw_cas(rwl-rwl_owner, owner, 0)))
  rw_exit(rwl);
  @@ -236,6 +240,11 @@ rw_exit(struct rwlock *rwl)
  int wrlock = owner  RWLOCK_WRLOCK;
  unsigned long set;
   
  +   if (wrlock)
  +   rw_assert_wrlock(rwl);
  +   else
  +   rw_assert_rdlock(rwl);
  +
  do {
  owner = rwl-rwl_owner;
  if (wrlock)
  @@ -249,6 +258,10 @@ rw_exit(struct rwlock *rwl)
  wakeup(rwl);
   }
   
  +#include ddb/db_output.h
  +#define panic(x...) do { printf(x); db_stack_dump(); } while (0);
  +
  +#ifdef DIAGNOSTIC
   void
   rw_assert_wrlock(struct rwlock *rwl)
   {
  @@ -272,3 +285,4 @@ rw_assert_unlocked(struct rwlock *rwl)
  if (rwl-rwl_owner != 0L)
  panic(%s: lock held, rwl-rwl_name);
   }
  +#endif
  Index: kern/sched_bsd.c
  ===
  RCS file: /cvs/src/sys/kern/sched_bsd.c,v
  retrieving revision 1.23
  diff -u -p -r1.23 sched_bsd.c
  --- kern/sched_bsd.c30 Jun 2010 22:38:17 -  1.23
  +++ kern/sched_bsd.c21 Sep 2010 01:45:59 -
  @@ -356,6 +356,7 @@ mi_switch(void)
  int sched_count;
   #endif
   
  +   assertwaitok();
  KASSERT(p-p_stat != SONPROC);
   
  SCHED_ASSERT_LOCKED();
  Index: arch/amd64/include/cpu.h
  ===
  RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
  retrieving revision 1.55
  diff -u -p -r1.55 cpu.h
  --- arch/amd64/include/cpu.h11 Aug 2010 21:22:44 -  1.55
  +++ arch/amd64/include/cpu.h21 Sep 2010 01:46:00 -
  @@ -87,6 +87,10 @@ struct cpu_info {
  int 

Re: m_pad: ghostbuster's dream

2010-09-23 Thread Bret S. Lambert
On Wed, Sep 22, 2010 at 06:28:38PM +0200, Mike Belopuhov wrote:
 m_pad was introduced with an ipsec package import in openbsd
 in 1997.  m_inject was introduced in 1999 but this code wasn't
 changed.
 
 m_pad is equivalent to m_inject with an offset equal to the
 actual data length.
 
 Doesn't break anything here, but additional tests won't harm.

No objection in principle (consolidation of code in mbufs is
one of my currently-stalled-by-dayjob projects); comment inline.

 
 OK?
 
 Index: netinet/ip_esp.c
 ===
 RCS file: /home/cvs/src/sys/netinet/ip_esp.c,v
 retrieving revision 1.112
 diff -u -p -r1.112 ip_esp.c
 --- netinet/ip_esp.c  22 Sep 2010 13:40:05 -  1.112
 +++ netinet/ip_esp.c  22 Sep 2010 15:47:38 -
 @@ -932,9 +932,10 @@ esp_output(struct mbuf *m, struct tdb *t
* Add padding -- better to do it ourselves than use the crypto engine,
* although if/when we support compression, we'd have to do that.
*/
 - pad = (u_char *) m_pad(m, padding + alen);
 + mo = m_inject(m, m-m_pkthdr.len, padding + alen, M_DONTWAIT);
 + pad = mtod(mo, u_char *);

Sure, but this code creates a NULL pointer dereference;
mtod() isn't a function that does anything smart, as
is evidenced by the explanation in the mbuf man page:

#define mtod(m,t)   ((t)((m)-m_data))

   if (pad == NULL) {
 - DPRINTF((esp_output(): m_pad() failed for SA %s/%08x\n,
 + DPRINTF((esp_output(): m_inject failed for SA %s/%08x\n,
   ipsp_address(tdb-tdb_dst), ntohl(tdb-tdb_spi)));
   return ENOBUFS;
   }
 @@ -1178,77 +1179,4 @@ checkreplaywindow32(u_int32_t seq, u_int
  
   *bitmap |= (((u_int32_t) 1)  diff);
   return 0;
 -}
 -
 -/*
 - * m_pad(m, n) pads m with n bytes at the end. The packet header
 - * length is updated, and a pointer to the first byte of the padding
 - * (which is guaranteed to be all in one mbuf) is returned.
 - */
 -
 -caddr_t
 -m_pad(struct mbuf *m, int n)
 -{
 - struct mbuf *m0, *m1;
 - int len, pad;
 - caddr_t retval;
 -
 - if (n = 0) {  /* No stupid arguments. */
 - DPRINTF((m_pad(): pad length invalid (%d)\n, n));
 - m_freem(m);
 - return NULL;
 - }
 -
 - len = m-m_pkthdr.len;
 - pad = n;
 - m0 = m;
 -
 - while (m0-m_len  len) {
 - len -= m0-m_len;
 - m0 = m0-m_next;
 - }
 -
 - if (m0-m_len != len) {
 - DPRINTF((m_pad(): length mismatch (should be %d instead of 
 - %d)\n, m-m_pkthdr.len,
 - m-m_pkthdr.len + m0-m_len - len));
 -
 - m_freem(m);
 - return NULL;
 - }
 -
 - /* Check for zero-length trailing mbufs, and find the last one. */
 - for (m1 = m0; m1-m_next; m1 = m1-m_next) {
 - if (m1-m_next-m_len != 0) {
 - DPRINTF((m_pad(): length mismatch (should be %d 
 - instead of %d)\n, m-m_pkthdr.len,
 - m-m_pkthdr.len + m1-m_next-m_len));
 -
 - m_freem(m);
 - return NULL;
 - }
 -
 - m0 = m1-m_next;
 - }
 -
 - if ((m0-m_flags  M_EXT) ||
 - m0-m_data + m0-m_len + pad = (m0-m_dat[MLEN])) {
 - /* Add an mbuf to the chain. */
 - MGET(m1, M_DONTWAIT, MT_DATA);
 - if (m1 == 0) {
 - m_freem(m0);
 - DPRINTF((m_pad(): cannot append\n));
 - return NULL;
 - }
 -
 - m0-m_next = m1;
 - m0 = m1;
 - m0-m_len = 0;
 - }
 -
 - retval = m0-m_data + m0-m_len;
 - m0-m_len += pad;
 - m-m_pkthdr.len += pad;
 -
 - return retval;
  }
 Index: netinet/ip_ipsp.h
 ===
 RCS file: /home/cvs/src/sys/netinet/ip_ipsp.h,v
 retrieving revision 1.144
 diff -u -p -r1.144 ip_ipsp.h
 --- netinet/ip_ipsp.h 9 Jul 2010 16:58:06 -   1.144
 +++ netinet/ip_ipsp.h 22 Sep 2010 15:49:32 -
 @@ -624,9 +624,6 @@ extern int tcp_signature_tdb_input(struc
  extern int tcp_signature_tdb_output(struct mbuf *, struct tdb *,
  struct mbuf **, int, int);
  
 -/* Padding */
 -extern caddr_t m_pad(struct mbuf *, int);
 -
  /* Replay window */
  extern int checkreplaywindow32(u_int32_t, u_int32_t, u_int32_t *, u_int32_t,
  u_int32_t *, int);



Re: m_pad: ghostbuster's dream

2010-09-23 Thread Bret S. Lambert
On Thu, Sep 23, 2010 at 11:12:48AM +0200, Mike Belopuhov wrote:
 On Thu, Sep 23, 2010 at 09:57 +0200, Bret S. Lambert wrote:
  No objection in principle (consolidation of code in mbufs is
  one of my currently-stalled-by-dayjob projects); comment inline.
  
   
   OK?
   
   Index: netinet/ip_esp.c
   ===
   RCS file: /home/cvs/src/sys/netinet/ip_esp.c,v
   retrieving revision 1.112
   diff -u -p -r1.112 ip_esp.c
   --- netinet/ip_esp.c  22 Sep 2010 13:40:05 -  1.112
   +++ netinet/ip_esp.c  22 Sep 2010 15:47:38 -
   @@ -932,9 +932,10 @@ esp_output(struct mbuf *m, struct tdb *t
  * Add padding -- better to do it ourselves than use the crypto engine,
  * although if/when we support compression, we'd have to do that.
  */
   - pad = (u_char *) m_pad(m, padding + alen);
   + mo = m_inject(m, m-m_pkthdr.len, padding + alen, M_DONTWAIT);
   + pad = mtod(mo, u_char *);
  
  Sure, but this code creates a NULL pointer dereference;
  mtod() isn't a function that does anything smart, as
  is evidenced by the explanation in the mbuf man page:
  
  #define mtod(m,t)   ((t)((m)-m_data))
  
 
 /* Yeurk! */
 
 new diff.  thanks for looking through!

It looks sane enough to me, although you might also take a look
at m_pulldown to ensure X contiguous bytes at offset Y in an mbuf
chain, as, IIRC, that has the possibility of not requiring a new
mbuf allocation if there truly is enough space in an existing mbuf.

But, it's your bikeshed, you can choose whether it's grape- or
strawberry-flavored.


 Index: netinet/ip_esp.c
 ===
 RCS file: /home/cvs/src/sys/netinet/ip_esp.c,v
 retrieving revision 1.112
 diff -u -p -r1.112 ip_esp.c
 --- netinet/ip_esp.c  22 Sep 2010 13:40:05 -  1.112
 +++ netinet/ip_esp.c  23 Sep 2010 09:05:40 -
 @@ -932,12 +932,13 @@ esp_output(struct mbuf *m, struct tdb *t
* Add padding -- better to do it ourselves than use the crypto engine,
* although if/when we support compression, we'd have to do that.
*/
 - pad = (u_char *) m_pad(m, padding + alen);
 - if (pad == NULL) {
 - DPRINTF((esp_output(): m_pad() failed for SA %s/%08x\n,
 + mo = m_inject(m, m-m_pkthdr.len, padding + alen, M_DONTWAIT);
 + if (mo == NULL) {
 + DPRINTF((esp_output(): m_inject failed for SA %s/%08x\n,
   ipsp_address(tdb-tdb_dst), ntohl(tdb-tdb_spi)));
   return ENOBUFS;
   }
 + pad = mtod(mo, u_char *);
  
   /* Self-describing or random padding ? */
   if (!(tdb-tdb_flags  TDBF_RANDOMPADDING))
 @@ -1178,77 +1179,4 @@ checkreplaywindow32(u_int32_t seq, u_int
  
   *bitmap |= (((u_int32_t) 1)  diff);
   return 0;
 -}
 -
 -/*
 - * m_pad(m, n) pads m with n bytes at the end. The packet header
 - * length is updated, and a pointer to the first byte of the padding
 - * (which is guaranteed to be all in one mbuf) is returned.
 - */
 -
 -caddr_t
 -m_pad(struct mbuf *m, int n)
 -{
 - struct mbuf *m0, *m1;
 - int len, pad;
 - caddr_t retval;
 -
 - if (n = 0) {  /* No stupid arguments. */
 - DPRINTF((m_pad(): pad length invalid (%d)\n, n));
 - m_freem(m);
 - return NULL;
 - }
 -
 - len = m-m_pkthdr.len;
 - pad = n;
 - m0 = m;
 -
 - while (m0-m_len  len) {
 - len -= m0-m_len;
 - m0 = m0-m_next;
 - }
 -
 - if (m0-m_len != len) {
 - DPRINTF((m_pad(): length mismatch (should be %d instead of 
 - %d)\n, m-m_pkthdr.len,
 - m-m_pkthdr.len + m0-m_len - len));
 -
 - m_freem(m);
 - return NULL;
 - }
 -
 - /* Check for zero-length trailing mbufs, and find the last one. */
 - for (m1 = m0; m1-m_next; m1 = m1-m_next) {
 - if (m1-m_next-m_len != 0) {
 - DPRINTF((m_pad(): length mismatch (should be %d 
 - instead of %d)\n, m-m_pkthdr.len,
 - m-m_pkthdr.len + m1-m_next-m_len));
 -
 - m_freem(m);
 - return NULL;
 - }
 -
 - m0 = m1-m_next;
 - }
 -
 - if ((m0-m_flags  M_EXT) ||
 - m0-m_data + m0-m_len + pad = (m0-m_dat[MLEN])) {
 - /* Add an mbuf to the chain. */
 - MGET(m1, M_DONTWAIT, MT_DATA);
 - if (m1 == 0) {
 - m_freem(m0);
 - DPRINTF((m_pad(): cannot append\n));
 - return NULL;
 - }
 -
 - m0-m_next = m1;
 - m0 = m1;
 - m0-m_len = 0;
 - }
 -
 - retval = m0-m_data + m0-m_len;
 - m0-m_len += pad;
 - m-m_pkthdr.len += pad;
 -
 - return retval;
  }
 Index: netinet/ip_ipsp.h
 ===
 RCS file: /home/cvs/src/sys

Re: revised kernel perf control

2010-09-13 Thread Bret S. Lambert
On Mon, Sep 13, 2010 at 04:15:29PM -0400, Ted Unangst wrote:
 On Fri, Sep 10, 2010 at 4:34 PM, Alexander Hall ha...@openbsd.org wrote:
  I wasn't aware of any problems with suspend.  My laptop doesn't resume
  reliably anyway, so it's not something I tested for.  But it's just
  running some code in a timeout.  If timeouts are still running during
  suspend, I'd say that's somebody else's bug.
 
  ISTR that the problem was that during the later part of a suspend, only
  one cpu is active, while the throttling affects all of them. I don't
  know enough about that to tell anything though.
 
  Well, anyhow, with this diff, and one cpu pegged, my thinkpad did not
  survive a resume. While idle, though, all seems fine.
 
 OK, I'm willing to believe there is a problem, but I still think that
 at the point where running setperf would be harmful, timeouts should
 be disabled.
 
 Barring that, how am I supposed to check if there is an imminent suspend?
 

Register a device with suspend resume hooks which disables/enables throttling?



mbuf cleanup

2010-09-11 Thread Bret S. Lambert
Finally getting around to finishing a c2k10 diff. This no longer a) panics
nor b) drops ipsec traffic on my test box.

The code to split a single mbuf is used in both m_inject and m_split, so
roll that into its own function, m_splitm, and make them wrappers around
the new function. There's at least one other place it could be used, but
this will do for now.

As said above, this affects ipsec setups, it also affects A-MSDU wireless
traffic; tests of both and (off-list) reports will earn you my love.

- Bert

Index: kern/uipc_mbuf.c
===
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.143
diff -u -p -r1.143 uipc_mbuf.c
--- kern/uipc_mbuf.c15 Jul 2010 09:45:09 -  1.143
+++ kern/uipc_mbuf.c11 Sep 2010 17:07:07 -
@@ -110,6 +110,8 @@ struct  pool mclpools[MCLPOOLS];
 
 intm_clpool(u_int);
 
+struct mbuf *m_splitm(struct mbuf *, int, int);
+
 int max_linkhdr;   /* largest link-level header */
 int max_protohdr;  /* largest protocol header */
 int max_hdr;   /* largest link+protocol header */
@@ -1064,130 +1066,85 @@ m_getptr(struct mbuf *m, int loc, int *o
 }
 
 /*
- * Inject a new mbuf chain of length siz in mbuf chain m0 at
- * position len0. Returns a pointer to the first injected mbuf, or
- * NULL on failure (m0 is left undisturbed). Note that if there is
- * enough space for an object of size siz in the appropriate position,
+ * Inject a new mbuf chain of length len in mbuf chain m at
+ * position off. Returns a pointer to the first injected mbuf, or
+ * NULL on failure (m is left undisturbed). Note that if there is
+ * enough space for an object of size len in the appropriate position,
  * no memory will be allocated. Also, there will be no data movement in
- * the first len0 bytes (pointers to that will remain valid).
+ * the first off bytes (pointers to that will remain valid).
  *
- * XXX It is assumed that siz is less than the size of an mbuf at the moment.
+ * XXX It is assumed that len is less than the size of an mbuf at the moment.
  */
 struct mbuf *
-m_inject(struct mbuf *m0, int len0, int siz, int wait)
+m_inject(struct mbuf *m, int off, int len, int wait)
 {
-   struct mbuf *m, *n, *n2 = NULL, *n3;
-   unsigned len = len0, remain;
+   struct mbuf *n, *o, *p;
+   int off1;
 
-   if ((siz = MHLEN) || (len0 = 0))
-   return (NULL);
-   for (m = m0; m  len  m-m_len; m = m-m_next)
-   len -= m-m_len;
-   if (m == NULL)
+   if (len  MHLEN || len = 0)
return (NULL);
-   remain = m-m_len - len;
-   if (remain == 0) {
-   if ((m-m_next)  (M_LEADINGSPACE(m-m_next) = siz)) {
-   m-m_next-m_len += siz;
-   if (m0-m_flags  M_PKTHDR)
-   m0-m_pkthdr.len += siz;
-   m-m_next-m_data -= siz;
-   return m-m_next;
-   }
-   } else {
-   n2 = m_copym2(m, len, remain, wait);
-   if (n2 == NULL)
-   return (NULL);
-   }
 
-   MGET(n, wait, MT_DATA);
-   if (n == NULL) {
-   if (n2)
-   m_freem(n2);
+   if ((n = m_getptr(m, off, off1)) == NULL)
+   return (NULL);
+
+   if ((o = m_get(wait, MT_DATA)) == NULL)
+   return (NULL);
+
+   if ((p = m_splitm(n, off1, wait)) == NULL) {
+   m_freem(o);
return (NULL);
}
 
-   n-m_len = siz;
-   if (m0-m_flags  M_PKTHDR)
-   m0-m_pkthdr.len += siz;
-   m-m_len -= remain; /* Trim */
-   if (n2) {
-   for (n3 = n; n3-m_next != NULL; n3 = n3-m_next)
-   ;
-   n3-m_next = n2;
-   } else
-   n3 = n;
-   for (; n3-m_next != NULL; n3 = n3-m_next)
-   ;
-   n3-m_next = m-m_next;
-   m-m_next = n;
-   return n;
+   o-m_len = len;
+   o-m_next = p;
+   n-m_next = o;
+
+   if (m-m_flags  M_PKTHDR)
+   m-m_pkthdr.len += len;
+
+   return (o);
 }
 
 /*
- * Partition an mbuf chain in two pieces, returning the tail --
- * all but the first len0 bytes.  In case of failure, it returns NULL and
- * attempts to restore the chain to its original state.
+ * Create a pair of mbufs, leaving the chain intact.
  */
 struct mbuf *
-m_split(struct mbuf *m0, int len0, int wait)
+m_splitm(struct mbuf *m, int off, int wait)
 {
-   struct mbuf *m, *n;
-   unsigned len = len0, remain, olen;
+   struct mbuf *n;
 
-   for (m = m0; m  len  m-m_len; m = m-m_next)
-   len -= m-m_len;
-   if (m == NULL)
+   if (off  m-m_len)
return (NULL);
-   remain = m-m_len - len;
-   if (m0-m_flags  M_PKTHDR) {
-   MGETHDR(n, wait, m0-m_type);
-   if (n == NULL)
-   

Re: generating new host key...

2010-09-08 Thread Bret S. Lambert
On Wed, Sep 08, 2010 at 11:39:59AM -0400, Ted Unangst wrote:
 On Tue, Sep 7, 2010 at 7:18 PM, Alexander Hall ha...@openbsd.org wrote:
  $ which true false
  /usr/bin/true
  /usr/bin/false
 
  while those should be available to /etc/rc, I'd prefer not using them.
 
 -5 points for using which. :)
 
 $ whence -v true
 true is a shell builtin
 
 I happen to think that explicit true and false values make things
 easier to read, without as much [ ] noise.
 

Truly, you do not grasp the simple elegance of a banana-shaped bikeshed.



Re: sbt/sdmmc locking update

2010-08-21 Thread Bret S. Lambert
On Sat, Aug 21, 2010 at 05:21:20PM -0700, Matthew Dempsky wrote:
 On Sat, Aug 21, 2010 at 10:04 AM, Bret S. Lambert blamb...@openbsd.org 
 wrote:
  This diff changes the locking in sbt and sdmmc from the old-and-busted
  lockmgr locks to the new-hotness rwlock locks. I don't have the
  hardware to test this on, so anybody with sbt/sdmmc is encouraged
  to give it a spin.
 
 Why not simply change the SDMMC_LOCK/SDMMC_UNLOCK/SDMMC_ASSERT_LOCKED
 macros and leave the .c files untouched (other than the lock init)?

A version that did that was offered, but I was convinced off-list
that my original desire to not obfuscate the locking was correct.

 
 Also, I'd prefer rw_exit_write() be used instead of rw_exit() when you
 know you're holding a write lock.

As I believe you said before.

My bikeshed, therefore it will be banana-shaped.



Re: Add timeout_add_abstv(9)

2010-08-20 Thread Bret S. Lambert
On Fri, Aug 20, 2010 at 12:27:39PM -0700, Matthew Dempsky wrote:
 On Fri, Aug 20, 2010 at 11:47 AM, Bret S. Lambert
 bret.lamb...@gmail.com wrote:
  And this one, like that one, lacks context.
 
 The hzto/tvtohz diff was just a const correctness fix.  I didn't think
 it needed more context on its own.
 
  What's the ultimate goal? Just to remove some calls to hzto?
 
 timeout(9) suggests avoiding using timeout_add(9) directly in favor of
 higher-level functions, and I think it's worth adding utility
 functions for common use patterns.  I just noticed that a number of
 remaining timeout_add(9) calls were of the form timeout_add(...,
 hzto(tv)).
 
 From an abstraction point of view, I don't think it makes sense for so
 much code to needlessly know about ticks.

So...yes, it *was* just to remove calls to hzto. And you're adding
another interface for it, which just seems grotesque and excessive.

IMO, the correct way to get rid of this is to use the macros
provided in time.h which do math on timevals, and then stuff that
derived value into a timeout_add_tv.

Also realize that there were subtle issues with hz-tv conversions
when I started converting to the timeout_add_foo functions, which
need to be resolved before we can do these conversions, as the
conversions in sys/kern/kern_clock.c do something differently,
and not doing it in that manner ends up whacking out drivers
which implicitly depend upon that behavior.

Please read those, and join me in my WTF.



Re: Add timeout_add_abstv(9)

2010-08-20 Thread Bret S. Lambert
Replied to quickly just now and missed the following...

 From an abstraction point of view, I don't think it makes sense for so
 much code to needlessly know about ticks.

Yes, thus the conversions to actual time values. And, again, a mention
that a discussion needs to be had concerning issues raised by those who
have need of more specific behaviors than the current timeout API provides
(for the purposes of POSIX-specified thread behavior, IIRC, was what
guenther@ needed).



Re: defining ports LOCKDIR

2010-06-16 Thread Bret S. Lambert
On Wed, Jun 16, 2010 at 02:02:37PM +0200, Antoine Jacoutot wrote:
 On Wed, 16 Jun 2010, Marc Espie wrote:
   3 days is a bit short on some arch. /var/tmp gives you 7 days,
   but isn't cleared at boot...
   
  daily could be set to ignore those... after all, it already ignores some 
  other
  stuff.
 
 Sure, I was just pointing that one needed to be careful ;-)

What about /var/run? cleared on boot, isn't ratfucked by /etc/daily.

 
 -- 
 Antoine



Re: defining ports LOCKDIR

2010-06-16 Thread Bret S. Lambert
On Wed, Jun 16, 2010 at 02:15:37PM +0200, Marc Espie wrote:
 On Wed, Jun 16, 2010 at 02:11:47PM +0200, Bret S. Lambert wrote:
  On Wed, Jun 16, 2010 at 02:02:37PM +0200, Antoine Jacoutot wrote:
   On Wed, 16 Jun 2010, Marc Espie wrote:
 3 days is a bit short on some arch. /var/tmp gives you 7 days,
 but isn't cleared at boot...
 
daily could be set to ignore those... after all, it already ignores 
some other
stuff.
   
   Sure, I was just pointing that one needed to be careful ;-)
  
  What about /var/run? cleared on boot, isn't ratfucked by /etc/daily.
 
 !root...

bitch, bitch, bitch! :p



Re: WIP: packet inspection in PF

2010-06-13 Thread Bret S. Lambert
Is there some reason that divert sockets (``man divert'') can't do
this for you?

On Sun, Jun 13, 2010 at 03:27:57AM +0400, Vadim Jukov wrote:
 Hello, tech@, especially PF hackers!
 
 This is a work-in-progress patch that implements direct packet inspection 
 in PF. This is needed in the cases when traffic could not be easily 
 detected by other mechanisms. The actual example is new UDP-based 
 protocol of uTorrent program that spams networks heavily, and could be 
 detected only by comparing value at the offset 0x40 with 0x7FFFAB.
 
 The main reason I publish uncompleted work is that I want to receive some 
 clues, particularily:
 
  1. I detect beginning of actual data as pd-tot_len - pd-p_len - is 
 it right (method to do so)?
 
  2. I use m_data, m_len and m_next to loop through mbuf chain - is 
 it right (method to do so)?
 
 Currently, it compiles, runs, but doesn't work - please do not actually 
 run this patch unless you want to duplicate my work. :)
 
 Thanks in advance.
 
 -- 
   Best wishes,
 Vadim Zhukov
 
 Index: share/man/man5/pf.conf.5
 ===
 RCS file: /cvs/src/share/man/man5/pf.conf.5,v
 retrieving revision 1.476
 diff -u -r1.476 pf.conf.5
 --- share/man/man5/pf.conf.5  19 May 2010 13:51:37 -  1.476
 +++ share/man/man5/pf.conf.5  12 Jun 2010 23:16:15 -
 @@ -434,6 +434,33 @@
  rule that is used when a packet does not match any rules does not
  allow IP options.
  .Pp
 +.It Xo
 +.Ar inspect Aq Ar value
 +.Ar at Aq Ar offset
 +.Xc
 +Tests packet contents at the
 +.Ar offset
 +to be equal to
 +.Ar value .
 +Note that offset starts after protocol header.
 +.Ar value
 +can be specified as plain strings, or as hexadecimal raw strings (i.e.,
 +starting with 0x).
 +In the latter case you can embed any special characters.
 +Maximum length of
 +.Ar value is 64 characters.
 +.Pp
 +.It Xo
 +.Ar inspect Aq Ar mask
 +.Ar maskop Aq Ar value
 +.Ar at Aq Ar offset
 +.Xc
 +Same as previous, but also allows to specify a mask to applied to
 +the data from packet before comparing to
 +.Ar value .
 +Two operations supported are logical and and logical exclusive or.
 +They're specified using  and ^ characters, respectively.
 +.Pp
  .It Ar divert-packet Aq Ar port
  Used to send matching packets to
  .Xr divert 4
 @@ -2643,7 +2670,8 @@
nat-to ( redirhost | { redirhost-list } )
[ portspec ] [ pooltype ] [ static-port ] |
[ fastroute | route ] |
 -  [ received-on ( interface-name | interface-group ) ]
 +  [ received-on ( interface-name | interface-group ) ] |
 +  inspect
  
  scrubopts  = scrubopt [ [ , ] scrubopts ]
  scrubopt   = no-df | min-ttl number | max-mss number |
 @@ -2786,6 +2814,8 @@
  upperlimit-sc  = upperlimit sc-spec
  sc-spec= ( bandwidth-spec |
   ( bandwidth-spec number bandwidth-spec ) )
 +inspect= inspect [ inspect-op ] string at number
 +inspect-op = string (  | ^ )
  include= include filename
  .Ed
  .Sh FILES
 Index: sys/net/pf.c
 ===
 RCS file: /cvs/src/sys/net/pf.c,v
 retrieving revision 1.691
 diff -u -r1.691 pf.c
 --- sys/net/pf.c  7 May 2010 13:33:16 -   1.691
 +++ sys/net/pf.c  12 Jun 2010 23:16:15 -
 @@ -230,6 +230,8 @@
   struct pf_state_key_cmp *, u_int, struct mbuf *);
  int   pf_src_connlimit(struct pf_state **);
  int   pf_check_congestion(struct ifqueue *);
 +int   pf_inspect(struct pf_pdesc *, struct mbuf *,
 + struct pf_rule *);
  int   pf_match_rcvif(struct mbuf *, struct pf_rule *);
  
  extern struct pool pfr_ktable_pl;
 @@ -2271,6 +2273,54 @@
  }
  
  int
 +pf_inspect(struct pf_pdesc *pd, struct mbuf *m, struct pf_rule *r) {
 + u_int32_t   at, i, mpos, pos;
 + charcv;
 +
 + if (r-inspect_at + r-inspect_len  pd-p_len)
 + return (0);
 + at = r-inspect_at + (pd-tot_len - pd-p_len);
 +
 + for (pos = 0; pos + m-m_len  at;) {
 + pos += m-m_len;
 + m = m-m_next;
 + if (m == NULL)
 + /* XXX: Should not be reached */
 + return (0);
 + }
 + mpos = at - pos;
 +
 + for (i = 0; i  r-inspect_len; i++, mpos++) {
 + while (mpos = m-m_len) {
 + pos += m-m_len;
 + m = m-m_next;
 + if (m == NULL)
 + /* XXX: Should not be reached */
 + return (0);
 + mpos = 0;
 + }
 + switch (r-inspect_op) {
 + case PF_INSOP_CMP:
 + cv = m-m_data[mpos];
 + break;
 + case PF_INSOP_AND:
 + cv = m-m_data[mpos]  

Re: WIP: packet inspection in PF

2010-06-13 Thread Bret S. Lambert
On Sun, Jun 13, 2010 at 12:41:01PM +0400, Vadim Zhukov wrote:
 Hm-m-m, could you explain better, please? I don't see the way to do
 such filtering with diverting, excluding writing a proxy app listening
 all the traffic.

Why do you assume I'm excluding a proxy app?

 
 2010/6/13, Bret S. Lambert bret.lamb...@gmail.com:
  Is there some reason that divert sockets (``man divert'') can't do
  this for you?
 
  On Sun, Jun 13, 2010 at 03:27:57AM +0400, Vadim Jukov wrote:
  Hello, tech@, especially PF hackers!
 
  This is a work-in-progress patch that implements direct packet inspection
  in PF. This is needed in the cases when traffic could not be easily
  detected by other mechanisms. The actual example is new UDP-based
  protocol of uTorrent program that spams networks heavily, and could be
  detected only by comparing value at the offset 0x40 with 0x7FFFAB.
 
  The main reason I publish uncompleted work is that I want to receive some
  clues, particularily:
 
   1. I detect beginning of actual data as pd-tot_len - pd-p_len - is
  it right (method to do so)?
 
   2. I use m_data, m_len and m_next to loop through mbuf chain - is
  it right (method to do so)?
 
  Currently, it compiles, runs, but doesn't work - please do not actually
  run this patch unless you want to duplicate my work. :)
 
  Thanks in advance.
 
  --
Best wishes,
  Vadim Zhukov
 
  Index: share/man/man5/pf.conf.5
  ===
  RCS file: /cvs/src/share/man/man5/pf.conf.5,v
  retrieving revision 1.476
  diff -u -r1.476 pf.conf.5
  --- share/man/man5/pf.conf.5   19 May 2010 13:51:37 -  1.476
  +++ share/man/man5/pf.conf.5   12 Jun 2010 23:16:15 -
  @@ -434,6 +434,33 @@
   rule that is used when a packet does not match any rules does not
   allow IP options.
   .Pp
  +.It Xo
  +.Ar inspect Aq Ar value
  +.Ar at Aq Ar offset
  +.Xc
  +Tests packet contents at the
  +.Ar offset
  +to be equal to
  +.Ar value .
  +Note that offset starts after protocol header.
  +.Ar value
  +can be specified as plain strings, or as hexadecimal raw strings (i.e.,
  +starting with 0x).
  +In the latter case you can embed any special characters.
  +Maximum length of
  +.Ar value is 64 characters.
  +.Pp
  +.It Xo
  +.Ar inspect Aq Ar mask
  +.Ar maskop Aq Ar value
  +.Ar at Aq Ar offset
  +.Xc
  +Same as previous, but also allows to specify a mask to applied to
  +the data from packet before comparing to
  +.Ar value .
  +Two operations supported are logical and and logical exclusive or.
  +They're specified using  and ^ characters, respectively.
  +.Pp
   .It Ar divert-packet Aq Ar port
   Used to send matching packets to
   .Xr divert 4
  @@ -2643,7 +2670,8 @@
  nat-to ( redirhost | { redirhost-list } )
  [ portspec ] [ pooltype ] [ static-port ] |
  [ fastroute | route ] |
  -   [ received-on ( interface-name | interface-group ) ]
  +   [ received-on ( interface-name | interface-group ) ] |
  +   inspect
 
   scrubopts  = scrubopt [ [ , ] scrubopts ]
   scrubopt   = no-df | min-ttl number | max-mss number |
  @@ -2786,6 +2814,8 @@
   upperlimit-sc  = upperlimit sc-spec
   sc-spec= ( bandwidth-spec |
( bandwidth-spec number bandwidth-spec ) )
  +inspect= inspect [ inspect-op ] string at number
  +inspect-op = string (  | ^ )
   include= include filename
   .Ed
   .Sh FILES
  Index: sys/net/pf.c
  ===
  RCS file: /cvs/src/sys/net/pf.c,v
  retrieving revision 1.691
  diff -u -r1.691 pf.c
  --- sys/net/pf.c   7 May 2010 13:33:16 -   1.691
  +++ sys/net/pf.c   12 Jun 2010 23:16:15 -
  @@ -230,6 +230,8 @@
 struct pf_state_key_cmp *, u_int, struct mbuf *);
   intpf_src_connlimit(struct pf_state **);
   intpf_check_congestion(struct ifqueue *);
  +intpf_inspect(struct pf_pdesc *, struct mbuf *,
  +  struct pf_rule *);
   intpf_match_rcvif(struct mbuf *, struct pf_rule 
  *);
 
   extern struct pool pfr_ktable_pl;
  @@ -2271,6 +2273,54 @@
   }
 
   int
  +pf_inspect(struct pf_pdesc *pd, struct mbuf *m, struct pf_rule *r) {
  +  u_int32_t   at, i, mpos, pos;
  +  charcv;
  +
  +  if (r-inspect_at + r-inspect_len  pd-p_len)
  +  return (0);
  +  at = r-inspect_at + (pd-tot_len - pd-p_len);
  +
  +  for (pos = 0; pos + m-m_len  at;) {
  +  pos += m-m_len;
  +  m = m-m_next;
  +  if (m == NULL)
  +  /* XXX: Should not be reached */
  +  return (0);
  +  }
  +  mpos = at - pos;
  +
  +  for (i = 0; i  r-inspect_len; i++, mpos++) {
  +  while (mpos = m-m_len) {
  +  pos += m-m_len;
  +  m = m-m_next;
  +  if (m == NULL

Re: Source Overview

2010-04-20 Thread Bret S. Lambert
On Mon, Apr 19, 2010 at 04:15:02PM +, Thordur Bjornsson wrote:
  And if you value your sanity, stay out of anything resembling filesystems.
 This is a lie.
 
 Hacking on filesystems, and the VFS layer in general is a very rewarding
 experince, just ask Bob.
 
 NFS for example, has been a source of joy for OpenBSD developers for
 years!

Yes, if by joy you mean anal hemmoraging.

 
  2) ?Is there something like an openbsd janitors project where newbies can
  start contributing small patches? similar to the Linux janitors project?
 
  Not at all. The philosophy behind not having one is that it's considered 
  dangerous
  to farm out work to the inexperienced (and this exact topic has been 
  brought up
  before, usually by people whining that we didn't make them feel special 
  enough
  by not having one).
 Also it leads to people doing KNF style diffs, just to do KNF style
 diffs. Noone
 learns anything. Most KNF style diffs you see coming from developers is due 
 to
 them having to read some code, and they cleaned up a little while doing so.
 
 While KNF is great, doing KNF just for the sake of doing KNF is hardly
 ever worth it
 IMHO.



Re: Source Overview

2010-04-20 Thread Bret S. Lambert
 Or, in short, we need to not deter people straight away, and accept that
 perhaps sometimes decent programmers start from ones that make lots
 of mistakes.
 
 Perhaps a ports TODO similar to the NetBSD ports TODO might help; it
 doesn't require quite the same level of kernel or userspace hacking and
 provides very visible feedback and thanks once completed.

But you're also missing a big point: if someone can't figure out what
needs to be fixed, or is incapable of mustering enough motivation to
do the work, then they're not as useful to us as someone who can do
one or the other (or hopefully both).

tedu@ once said that there's an entrance exam to get an account; this
happens to be part of it, IMO.

PS - put this back on tech@ where it was originally, misc@ has enough noise
without this going there as well



Re: Source Overview

2010-04-20 Thread Bret S. Lambert
 Surely people can move on from the low hanging fruit of a port that needs
 a ./configure  make install, to minor code or makefile changes, to specific
 new functionality to from the scratch coding.

Not to sound like a dick, but this illustrates some of what I'm saying:

If I hold up one corner and a student cannot come back to me
with the other three, I do not go on with the lesson.

- Confucius, the Analects



Re: Atheros AR5424 rev 0x01 hal error

2010-04-20 Thread Bret S. Lambert
On Tue, Apr 20, 2010 at 10:55:18AM -0400, Adam M. Dutko wrote:
 When I attempt to use my wireless card (AR5424) I get the following:
 
 ath0: unable to reset hardware; hal status 3520208968
 
 The number at the end changes when I try again.

That smells like an uninitialized value for status, to me;
try setting it to something known to be stupid and incorrect at the
top of the function.

 
 I believe I've traced where the error message is thrown to a section of code
 in src/sys/dev/ic/ath.c:
 
 
 if (!ath_hal_reset(ah, ic-ic_opmode, hchan, AH_TRUE, status)) {
   printf(%s: unable to reset hardware; hal status %u\n,
   ifp-if_xname, status);
   error = EIO;
   goto done;
   }
 
 
 I'm going to recompile with ATH_DEBUG=10 and see if I can get more
 information.  I'm also trying to trace down the ath_hal_reset function
 to see if I can't find something in that area.
 
 Does anyone that's worked on a similar error have any pointers?  Thanks.



Re: Source Overview

2010-04-19 Thread Bret S. Lambert
On Mon, Apr 19, 2010 at 09:59:06AM -0400, Adam M. Dutko wrote:
 The obvious answer to this questions is Just read the source... but I
 still want to ask if someone is aware of a good overview of the OpenBSD
 source code?  I've watched several presentations by Ted Unangst, Jason Dixon
 and co. and there seems to be a good amount of information spread across the
 web, but not a single canonical reference besides the source code (yes, I
 know source is very important).
 
 I know the base of code is very very large, so maybe instead of the whole
 repository, how about important parts/subsystems?  Are any of you aware of
 such a document or documents?  Are there areas that are easier for relative
 newbies to start in versus other areas?  (eg. wireless drivers vs. SMP) Is
 there something like an openbsd janitors project where newbies can start?
 Thanks in advance.

http://www.openbsd.org/papers/

has some good pointers; specifically claudio@'s presentations on the network
internals which have call graphs, and jsg@'s introduction to drivers.

man 9 style is a good introduction to the conventions used in the source tree.

I'm going to assume that you mean an overview as to which directories mean
what inside the kernel, since outside of the kernel, it's rather self-evident
(e.g., src/usr.sbin/foo is for /usr/sbin/foo):

src/sys/
kern/   - generic stuffs (signals, scheduling, vnodes, syscalls)
net/- generic net stuffs (interface handling, pf, routing)
netinet{,6}/- IPv{4,6} stuffs
net*/   - non-IP network stuffs
sys/- system header files
arch/   - architecture-dependant code
dev/- hardware drivers stuffs
crypto/ - kernel crypto stuffs
*fs/- filesystem code
nfs/- the source of all horror
conf/   - kernel configuration stuffs
lib/- libc routines ported to the kernel, mostly
scsi/   - self-explanatory
uvm/- virtual memory stuffs
altq/   - also self-explanatory
compat/ - compatibility routines for other OSes
ddb/- kernel debugger stuffs

I may have missed something (or be slightly incorrect; hopefully
someone smarter than me can help clean this up if I'm too far off).

Hopefully this is useful for somebody.

 
 -Adam



Re: Source Overview

2010-04-19 Thread Bret S. Lambert
On Mon, Apr 19, 2010 at 11:48:02AM -0400, Adam M. Dutko wrote:
 On Mon, Apr 19, 2010 at 10:57 AM, Bret S. Lambert blamb...@openbsd.orgwrote:
 
 
   ... snip ...
 
 
  Hopefully this is useful for somebody.
 
 
 It is, thank you.
 
 With regard to the other questions I peppered everyone with... :-)
 
 1) Are there areas that are easier for relative newbies to start in versus
 other areas?  I know this depends on a lot of things, to include experience.
  Hypothetically, someone that has some C experience, but not a lot of kernel
 (and subsystem) experience.  Is it better to start from the bottom up like
 bootstrap to init? or is it better to start with memory management? network
 drivers?  What is usually the best area from a learning and future utility
 perspective?

If you're going after userland stuff, the advice I've given (and where I 
started)
is to read libc; it's the basis of userland programming, and illustrates some
of the tricks you can do with C. For the kernel side, I'd say start looking at
the system calls (cd /usr/src/sys/kern  grep -n ^sys_ *.c).

Everything else is just going deeper into the rabbit hole.

And if you value your sanity, stay out of anything resembling filesystems.

 
 2)  Is there something like an openbsd janitors project where newbies can
 start contributing small patches? similar to the Linux janitors project?

Not at all. The philosophy behind not having one is that it's considered 
dangerous
to farm out work to the inexperienced (and this exact topic has been brought up
before, usually by people whining that we didn't make them feel special enough
by not having one).

Read, understand, find an issue, post a patch, get told why it's wrong, fix it, 
repeat.

- Bert



Re: iflag for sed

2010-02-01 Thread Bret S. Lambert
On Tue, Feb 02, 2010 at 04:47:49PM +1100, Damien Miller wrote:
 On Tue, 2 Feb 2010, Ted Unangst wrote:
 
  Ordinarily, extensions to standard utilities are bad, but I think it's 
  worth considering when the flag is easily understood and provides a 
  substantial benefit.  I think case insenstive matching in sed qualifies.
  
  The diff below actually implements this behavior two different ways.  
  Option 1 is to add /i to substitution patterns.  Option 2 adds -i to the 
  command line.
 
 +1 on option 1. Case insensitive matching would be very handy, but I don't
 like the idea of adding a non-standard commandline flag that isn't
 supported somewhere else.

Agreed, especially since the -i flag means something entirely different
in the non-standard implementation we're imitating here.



Re: UBC?

2010-01-31 Thread Bret S. Lambert
On Sun, Jan 31, 2010 at 01:18:59PM +, Stuart Henderson wrote:
 On 2010/01/31 09:50, David Gwynne wrote:
  On Sun, Jan 31, 2010 at 09:41:14AM +1000, David Gwynne wrote:
   pmap_enter in this situation should fail, not panic. the error would be 
   handled properly as the stack unwinds up to ami_allocmem.
   
   i can change ami_allocmem to take a NOWAIT etc as an argument rather than 
   just assume it, so callers with process context (like the sensor refresh 
   shown below) can sleep waiting for mappings.
  
  here's such a change.
  
  i havent even compiled this, let alone tested it.
 
 no problems noticed here so far with
 
 ami0 at pci3 dev 14 function 0 Symbios Logic MegaRAID SATA 4x/8x rev 0x07: 
 apic 2 int 4 (irq 9)
 ami0: LSI 3008, 32b, FW 814B, BIOS vH431, 128MB RAM
 ami0: 1 channels, 0 FC loops, 1 logical drives
 scsibus0 at ami0: 40 targets
 scsibus1 at ami0: 16 targets

I know I'm kind of the peanut gallery in sys/dev, but we've already got
named flags which mean something to malloc(9), and it increases legibility
to use them, IMO, so I've modified dlg's diff to use M_{,NO}WAIT where
appropriate.

I have no ami hardware, so I can't test this, but it does compile :-\

Index: ami.c
===
RCS file: /cvs/src/sys/dev/ic/ami.c,v
retrieving revision 1.198
diff -u -p -r1.198 ami.c
--- ami.c   6 Dec 2009 12:31:10 -   1.198
+++ ami.c   31 Jan 2010 13:29:55 -
@@ -124,7 +124,7 @@ voidami_write(struct ami_softc *, bus_
 
 void   ami_copyhds(struct ami_softc *, const u_int32_t *,
const u_int8_t *, const u_int8_t *);
-struct ami_mem *ami_allocmem(struct ami_softc *, size_t);
+struct ami_mem *ami_allocmem(struct ami_softc *, size_t, int);
 void   ami_freemem(struct ami_softc *, struct ami_mem *);
 intami_alloc_ccbs(struct ami_softc *, int);
 
@@ -256,31 +256,32 @@ ami_write(struct ami_softc *sc, bus_size
 }
 
 struct ami_mem *
-ami_allocmem(struct ami_softc *sc, size_t size)
+ami_allocmem(struct ami_softc *sc, size_t size, int wait)
 {
struct ami_mem  *am;
int nsegs;
 
-   am = malloc(sizeof(struct ami_mem), M_DEVBUF, M_NOWAIT|M_ZERO);
+   am = malloc(sizeof(struct ami_mem), M_DEVBUF, M_ZERO | wait);
if (am == NULL)
return (NULL);
 
am-am_size = size;
+   wait = (wait == M_NOWAIT ? BUS_DMA_NOWAIT : 0);
 
if (bus_dmamap_create(sc-sc_dmat, size, 1, size, 0,
-   BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, am-am_map) != 0)
+   wait | BUS_DMA_ALLOCNOW, am-am_map) != 0)
goto amfree; 
 
if (bus_dmamem_alloc(sc-sc_dmat, size, PAGE_SIZE, 0, am-am_seg, 1,
-   nsegs, BUS_DMA_NOWAIT) != 0)
+   nsegs, wait) != 0)
goto destroy;
 
if (bus_dmamem_map(sc-sc_dmat, am-am_seg, nsegs, size, am-am_kva,
-   BUS_DMA_NOWAIT) != 0)
+   wait) != 0)
goto free;
 
if (bus_dmamap_load(sc-sc_dmat, am-am_map, am-am_kva, size, NULL,
-   BUS_DMA_NOWAIT) != 0)
+   wait) != 0)
goto unmap;
 
memset(am-am_kva, 0, size);
@@ -337,7 +338,8 @@ ami_alloc_ccbs(struct ami_softc *sc, int
return (1);
}
 
-   sc-sc_ccbmem_am = ami_allocmem(sc, sizeof(struct ami_ccbmem) * nccbs);
+   sc-sc_ccbmem_am = ami_allocmem(sc,
+   sizeof(struct ami_ccbmem) * nccbs, M_NOWAIT);
if (sc-sc_ccbmem_am == NULL) {
printf(: unable to allocate ccb dmamem\n);
goto free_ccbs;
@@ -413,14 +415,14 @@ ami_attach(struct ami_softc *sc)
paddr_t pa;
int s;
 
-   am = ami_allocmem(sc, NBPG);
+   am = ami_allocmem(sc, NBPG, M_NOWAIT);
if (am == NULL) {
printf(: unable to allocate init data\n);
return (1);
}
pa = htole32(AMIMEM_DVA(am));
 
-   sc-sc_mbox_am = ami_allocmem(sc, sizeof(struct ami_iocmd));
+   sc-sc_mbox_am = ami_allocmem(sc, sizeof(struct ami_iocmd), M_NOWAIT);
if (sc-sc_mbox_am == NULL) {
printf(: unable to allocate mbox\n);
goto free_idata;
@@ -1928,7 +1930,7 @@ ami_mgmt(struct ami_softc *sc, u_int8_t 
}
 
if (size) {
-   if ((am = ami_allocmem(sc, size)) == NULL) {
+   if ((am = ami_allocmem(sc, size, M_WAITOK)) == NULL) {
error = ENOMEM;
goto memerr;
}



Re: UBC?

2010-01-29 Thread Bret S. Lambert
On Fri, Jan 29, 2010 at 11:17:07AM -0500, Donald Allen wrote:
 Any developers care to comment on the state of the unified buffer
 cache? I did some searching and turned up threads here and there in
 the mail archives. It looks like an attempt was made to add it around
 8 years ago, some bad things happened, and the change got backed out.
 Looking at the code, it looks like there is some conditional ubc code
 that appears not to be included in the current release. Have I got
 this right -- OpenBSD doesn't provide a unified buffer cache at this
 time?

There is no spoon.

 
 /Don Allen



Re: UBC?

2010-01-29 Thread Bret S. Lambert
On Fri, Jan 29, 2010 at 01:47:09PM -0700, Bob Beck wrote:
  Well, to be fair, he was asking for $buzzword. So we could load
  him up with some Customer-Facing Enterprise Extranet Bundles,
  served over XML in a proactive win-win paradigm.
 
  Then his disk reads would be all like zoom!!!
 
  Or, for brevity's sake: there is no spoon.
 
 http://professionalsuperhero.com/

That link isn't in iCal format; I can't open it in my scheduler.



Re: Add support to AR5424

2010-01-25 Thread Bret S. Lambert
On Mon, Jan 25, 2010 at 06:18:33PM +, Luis Henriques wrote:
 On Mon, Jan 25, 2010 at 01:52:12PM +, Stuart Henderson wrote:
  On 2010/01/25 00:50, Luis Henriques wrote:
   Re-sending with cvs diff -upRN
  
  Wha, you mean I have to dismantle my AspireOne again and put the
  minipcie back in to test it? But last time I did that I had to drill
  out some of the screws :-) (only joking, I can do this on the eee 
  instead, it's far easier to dismantle! and this is great news).
  
  Please also send URLs of where the borrowed code is from so the
  licensing is totally clear.
 
 Here's the link to the Linux ath5k driver:
 
 http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.32.y.git;a=tree;f=drivers/net/wireless/ath/ath5k;h=d625eae851dd0c1e42e52a182d8408ebee855377;hb=a2febcd43d859a48672ad922990bd27e5628271f
 
 (not 100% sure if this is the exact same version I used, can check that
 later if it actually matters.)
 
 File main.c (in directory ..) sets the license to Dual BSD/GPL.

IIRC, it's more dependent upon the licence in the individual files,
not the note that satisfies the Linux kernel's just utterly weird
tell me your license so I can refuse to load you bits.

But, the files in there, where they aren't (c) reyk@, appear to be
either straight-up BSD-with-GPL option, or apparently cut-n-paset
from OpenBSD's license.template(!) (as in the file you say is dual-
licensed), so I think you're on solid footing here.

 
 And the link to the NetBSD driver:
 
 http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/external/isc/atheros_hal/dist/?only_with_tag=MAIN
  



Re: kernel hacking

2009-12-10 Thread Bret S. Lambert
On Thu, Dec 10, 2009 at 02:49:51PM -0300, Robert Yuri wrote:
 I'll learn just reading kernel code ?
 so, many night you need to understand it ?

Absolutely:

$ pwd
/usr/src/sys/kern
$ wc -l * | tail -n 1 
   64300 total

It's going to take you many nights just to *read* it.
Not to mention the namecache-induced vomit breaks.



Dynamic SysV Message Queue Implementation

2009-08-03 Thread Bret S. Lambert
The following diff makes SysV message queues and changes them from
static, boot-time to dynamic, run-time memory allocation. This makes
writing sysctls which modify the limits trivial, no longer requiring
you to recompile your kernel to do so. Also makes the code completely
machine-independent, instead of allocating the memory before the vm
system is up.

The only thing that I'm aware of that uses this (thanks to todd@) is
squid with diskd. If you have that setup, please give this a test
to make sure no edge cases have crept in.

- Bert

Index: kern/sysv_msg.c
===
RCS file: /cvs/src/sys/kern/sysv_msg.c,v
retrieving revision 1.21
diff -u -p -r1.21 sysv_msg.c
--- kern/sysv_msg.c 2 Jun 2009 12:11:16 -   1.21
+++ kern/sysv_msg.c 3 Aug 2009 06:54:28 -
@@ -1,6 +1,20 @@
 /* $OpenBSD: sysv_msg.c,v 1.21 2009/06/02 12:11:16 guenther Exp $  */
 /* $NetBSD: sysv_msg.c,v 1.19 1996/02/09 19:00:18 christos Exp $   */
-
+/*
+ * Copyright (c) 2009 Bret S. Lambert blamb...@openbsd.org
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
 /*
  * Implementation of SVID messages
  *
@@ -21,108 +35,69 @@
  */
 
 #include sys/param.h
-#include sys/systm.h
-#include sys/kernel.h
-#include sys/proc.h
-#include sys/msg.h
+#include sys/types.h
 #include sys/malloc.h
-
+#include sys/mbuf.h
 #include sys/mount.h
+#include sys/msg.h
+#include sys/pool.h
+#include sys/proc.h
+#include sys/queue.h
 #include sys/syscallargs.h
+#include sys/sysctl.h
+#include sys/systm.h
+#include sys/uio.h
 
-#ifdef MSG_DEBUG
-#defineDPRINTF(x)  printf x
-#else
-#defineDPRINTF(x)
-#endif
-
-int nfree_msgmaps; /* # of free map entries */
-short free_msgmaps;/* head of linked list of free map entries */
-struct msg *free_msghdrs;  /* list of free msg headers */
-char *msgpool; /* MSGMAX byte long msg buffer pool */
-struct msgmap *msgmaps;/* MSGSEG msgmap structures */
-struct msg *msghdrs;   /* MSGTQL msg headers */
-struct msqid_ds *msqids;   /* MSGMNI msqid_ds struct's */
-
-void msg_freehdr(struct msg *);
+struct que *que_create(key_t, struct ucred *, int);
+struct que *que_lookup(int);
+struct que *que_key_lookup(key_t);
+void que_wakeall(void);
+void que_free(struct que *);
+struct msg *msg_create(struct que *);
+void msg_free(struct msg *);
+void msg_enqueue(struct que *, struct msg *, struct proc *);
+void msg_dequeue(struct que *, struct msg *, struct proc *);
+struct msg *msg_lookup(struct que *, int);
+int msg_copyin(struct msg *, const char *, size_t, struct proc *);
+int msg_copyout(struct msg *, char *, size_t *, struct proc *);
+
+struct pool sysvmsgpl;
+struct msginfo msginfo;
+
+TAILQ_HEAD(, que) msg_queues;
+
+int num_ques;
+int num_msgs;
+int sequence;
+int maxmsgs;
 
 void
 msginit(void)
 {
-   int i;
-
-   /*
-* msginfo.msgssz should be a power of two for efficiency reasons.
-* It is also pretty silly if msginfo.msgssz is less than 8
-* or greater than about 256 so ...
-*/
-
-   i = 8;
-   while (i  1024  i != msginfo.msgssz)
-   i = 1;
-
-   if (i != msginfo.msgssz)
-   panic(msginfo.msgssz %d not a small power of 2, 
msginfo.msgssz);
-   if (msginfo.msgseg  32767)
-   panic(msginfo.msgseg %d  32767, msginfo.msgseg);
-
-   if (msgmaps == NULL)
-   panic(msgmaps is NULL);
-
-   for (i = 0; i  msginfo.msgseg; i++) {
-   if (i  0)
-   msgmaps[i-1].next = i;
-   msgmaps[i].next = -1;   /* implies entry is available */
-   }
-   free_msgmaps = 0;
-   nfree_msgmaps = msginfo.msgseg;
-
-   if (msghdrs == NULL)
-   panic(msghdrs is NULL);
-
-   for (i = 0; i  msginfo.msgtql; i++) {
-   msghdrs[i].msg_type = 0;
-   if (i  0)
-   msghdrs[i-1].msg_next = msghdrs[i];
-   msghdrs[i].msg_next = NULL;
-   }
-   free_msghdrs = msghdrs[0];
-
-   if (msqids == NULL)
-   panic(msqids is NULL);
-
-   for (i = 0; i  msginfo.msgmni; i++) {
-   msqids[i].msg_qbytes = 0;   /* implies entry is available

NFS server cleanup diffs, 2 of 3

2009-05-21 Thread Bret S. Lambert
Index: nfs/nfs_serv.c
===
RCS file: /cvs/src/sys/nfs/nfs_serv.c,v
retrieving revision 1.56
diff -u -p -r1.56 nfs_serv.c
--- nfs/nfs_serv.c  14 Jun 2008 22:44:07 -  1.56
+++ nfs/nfs_serv.c  15 Jun 2008 04:36:23 -
@@ -,7 +2211,7 @@ nfsrv_rmdir(nfsd, slp, procp, mrq)
int v3 = (nfsd-nd_flag  ND_NFSV3);
char *cp2;
struct mbuf *mb, *mreq;
-   struct vnode *vp, *dirp = (struct vnode *)0;
+   struct vnode *vp, *dirp = NULL;
struct vattr dirfor, diraft;
nfsfh_t nfh;
fhandle_t *fhp;
@@ -2241,7 +2230,7 @@ nfsrv_rmdir(nfsd, slp, procp, mrq)
procp);
else {
vrele(dirp);
-   dirp = (struct vnode *)0;
+   dirp = NULL;
}
}
if (error) {
@@ -2255,31 +2244,35 @@ nfsrv_rmdir(nfsd, slp, procp, mrq)
vp = nd.ni_vp;
if (vp-v_type != VDIR) {
error = ENOTDIR;
-   goto out;
+   goto abort;
}
/*
 * No rmdir . please.
 */
if (nd.ni_dvp == vp) {
error = EINVAL;
-   goto out;
+   goto abort;
}
/*
 * The root of a mounted filesystem cannot be deleted.
 */
-   if (vp-v_flag  VROOT)
+   if (vp-v_flag  VROOT) {
error = EBUSY;
-out:
-   if (!error) {
-   error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, nd.ni_cnd);
-   } else {
-   VOP_ABORTOP(nd.ni_dvp, nd.ni_cnd);
-   if (nd.ni_dvp == nd.ni_vp)
-   vrele(nd.ni_dvp);
-   else
-   vput(nd.ni_dvp);
-   vput(vp);
+   goto abort;
}
+
+   error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, nd.ni_cnd);
+   goto out;
+
+abort:
+   /* the following code is only reached via error cases above */
+   VOP_ABORTOP(nd.ni_dvp, nd.ni_cnd);
+   if (nd.ni_dvp == nd.ni_vp)
+   vrele(nd.ni_dvp);
+   else
+   vput(nd.ni_dvp);
+   vput(vp);
+out:
if (dirp) {
diraft_ret = VOP_GETATTR(dirp, diraft, cred, procp);
vrele(dirp);