On Sun, 18 Feb 2007, Lars Erik Gullerud wrote:
On Fri, 16 Feb 2007, Robert Watson wrote:
I can certainly investigate doing this -- since 6.2 is safely out the door
it's a good time to do so. I'll follow up by e-mail in a few days -- would
it be possible for you to help with testing?
We would of course be most happy to test any patches you come up with, and
run performance benchmarks on our systems.
It turns out this change comes in two parts:
(1) In the first part, the structure of the socket send routine, sosend(), is
simplified by breaking out the code that copies data from user space from
the code that transmits via the protocol.
(2) In the second part, a version of sosend() specific to datagram protocols
(where the socket send buffer isn't ever used) is added.
I'm going to attach two patches against RELENG_6 from today -- the first
performs only the first step (sosend_copyin.diff), and the second performs
both (sosend_dgram.diff) (so will have to be applied against a fresh version
of uipc_socket.c as opposed to the patched version). The first change
requires heavy stability testing, and the second requires both performance and
stability testing. Any assistance from you in helping to make this a reliable
MFC would be much appreciated.
For reference, the sosend_copyin.diff applies these changes:
src/sys/kern/uipc_socket.c:1.253, 1.254, 1.255
The sosend_dgram.diff patch additionally applies these changes on top of those
in sosend_copyin.diff:
src/sys/kern/uipc_socket.c:1.256
src/sys/netinet/udp_usrreq.c:1.188
I've CC'd the performance list as there is a relevant thread going on there
right now, and other people might also be interested in reviewing and testing
these changes. The short description is that this eliminates a large number
of socket buffer interactions in the UDP send path--one of the effects is to
avoid locking the socket buffer for an extended period, as it's largely unused
in the datagram transmit path. Per the commit comments, this idea was
suggested by Jinmei Tatsuya at ISC as a result of their performance analysis;
this change has been in 7-CURRENT since May of last year and has seen some bug
fixes but no substantial changes in that time, so has been moderately burned
in.
Robert N M Watson
Computer Laboratory
University of Cambridge
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.242.2.8
diff -u -r1.242.2.8 uipc_socket.c
--- kern/uipc_socket.c 3 Feb 2007 04:01:22 -0000 1.242.2.8
+++ kern/uipc_socket.c 1 Mar 2007 11:18:35 -0000
@@ -584,7 +584,149 @@
return (error);
}
+#ifdef ZERO_COPY_SOCKETS
+struct so_zerocopy_stats{	/* counters kept for ZERO_COPY_SOCKETS sends */
+ int size_ok;	/* incremented in sosend_copyin() when the PAGE_SIZE checks pass */
+ int align_ok;	/* incremented together with size_ok in sosend_copyin() */
+ int found_ifp;	/* not updated anywhere in the code shown in this patch */
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+#include <netinet/in.h>
+#include <net/route.h>
+#include <netinet/in_pcb.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#endif /*ZERO_COPY_SOCKETS*/
+
+/*
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio.  If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * Parameters:
+ *   uio    - source of the data; uio_resid shrinks as data is consumed.
+ *   retmp  - out: head of the mbuf chain built so far (NULL if none).
+ *   atomic - when non-zero, keep adding mbufs until the uio is drained or
+ *            *space is exhausted; when zero, at most one mbuf is filled.
+ *   space  - in/out: remaining room; reduced by each mbuf's length.
+ *   flags  - MSG_* flags; only MSG_EOR is examined here.
+ *
+ * Returns 0 on success, ENOBUFS when an mbuf or cluster cannot be
+ * allocated, or the error returned by uiomove().
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *retmp.  The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+    int flags)
+{
+	struct mbuf *m, **mp, *top;
+	long len, resid;
+	int error;
+#ifdef ZERO_COPY_SOCKETS
+	int cow_send;
+#endif
+
+	*retmp = top = NULL;
+	/* mp always points at the link where the next mbuf gets attached. */
+	mp = &top;
+	len = 0;
+	resid = uio->uio_resid;
+	error = 0;
+	do {
+#ifdef ZERO_COPY_SOCKETS
+		cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+		if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+			if (top == NULL) {
+				MGETHDR(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else {
+				MGET(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+			}
+			if (so_zero_copy_send &&
+			    resid>=PAGE_SIZE &&
+			    *space>=PAGE_SIZE &&
+			    uio->uio_iov->iov_len>=PAGE_SIZE) {
+				so_zerocp_stats.size_ok++;
+				so_zerocp_stats.align_ok++;
+				cow_send = socow_setup(m, uio);
+				len = cow_send;
+			}
+			if (!cow_send) {
+				MCLGET(m, M_TRYWAIT);
+				if ((m->m_flags & M_EXT) == 0) {
+					/* No cluster: fall into ENOBUFS below. */
+					m_free(m);
+					m = NULL;
+				} else {
+					len = min(min(MCLBYTES, resid),
+					    *space);
+				}
+			}
+#else /* ZERO_COPY_SOCKETS */
+			if (top == NULL) {
+				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+				/*
+				 * Defensive: the shared m == NULL check runs
+				 * further down, so do not touch the header of
+				 * a failed allocation here.
+				 */
+				if (m != NULL) {
+					m->m_pkthdr.len = 0;
+					m->m_pkthdr.rcvif = NULL;
+				}
+			} else
+				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+			len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+		} else {
+			if (top == NULL) {
+				m = m_gethdr(M_TRYWAIT, MT_DATA);
+				/* Same defensive guard as the cluster case. */
+				if (m != NULL) {
+					m->m_pkthdr.len = 0;
+					m->m_pkthdr.rcvif = NULL;
+				}
+
+				len = min(min(MHLEN, resid), *space);
+				/*
+				 * For datagram protocols, leave room
+				 * for protocol headers in first mbuf.
+				 */
+				if (atomic && m && len < MHLEN)
+					MH_ALIGN(m, len);
+			} else {
+				m = m_get(M_TRYWAIT, MT_DATA);
+				len = min(min(MLEN, resid), *space);
+			}
+		}
+		if (m == NULL) {
+			error = ENOBUFS;
+			goto out;
+		}
+
+		*space -= len;
+#ifdef ZERO_COPY_SOCKETS
+		if (cow_send)
+			error = 0;
+		else
+#endif /* ZERO_COPY_SOCKETS */
+		error = uiomove(mtod(m, void *), (int)len, uio);
+		resid = uio->uio_resid;
+		m->m_len = len;
+		/*
+		 * Link the mbuf into the chain before looking at the error so
+		 * that a failed copy still leaves it reachable through top.
+		 */
+		*mp = m;
+		top->m_pkthdr.len += len;
+		if (error)
+			goto out;
+		mp = &m->m_next;
+		if (resid <= 0) {
+			if (flags & MSG_EOR)
+				top->m_flags |= M_EOR;
+			break;
+		}
+	} while (*space > 0 && atomic);
+out:
+	*retmp = top;
+	return (error);
+}
+
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+#define snderr(errno) { error = (errno); goto release; }
+
/*
* Send on a socket.
* If send must go all at once and message is larger than
@@ -603,21 +745,6 @@
* Data and control buffers are freed on return.
*/
-#ifdef ZERO_COPY_SOCKETS
-struct so_zerocopy_stats{
- int size_ok;
- int align_ok;
- int found_ifp;
-};
-struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
-#include <netinet/in.h>
-#include <net/route.h>
-#include <netinet/in_pcb.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
-
int
sosend(so, addr, uio, top, control, flags, td)
struct socket *so;
@@ -628,14 +755,9 @@
int flags;
struct thread *td;
{
- struct mbuf **mp;
- struct mbuf *m;
- long space, len = 0, resid;
+ long space, resid;
int clen = 0, error, dontroute;
int atomic = sosendallatonce(so) || top;
-#ifdef ZERO_COPY_SOCKETS
- int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
if (uio != NULL)
resid = uio->uio_resid;
@@ -663,7 +785,6 @@
td->td_proc->p_stats->p_ru.ru_msgsnd++;
if (control != NULL)
clen = control->m_len;
-#define snderr(errno) { error = (errno); goto release; }
SOCKBUF_LOCK(&so->so_snd);
restart:
@@ -713,153 +834,61 @@
goto restart;
}
SOCKBUF_UNLOCK(&so->so_snd);
- mp = &top;
space -= clen;
do {
- if (uio == NULL) {
- /*
- * Data is prepackaged in "top".
- */
- resid = 0;
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- } else do {
-#ifdef ZERO_COPY_SOCKETS
- cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
- if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
- if (top == NULL) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- }
- if (so_zero_copy_send &&
- resid>=PAGE_SIZE &&
- space>=PAGE_SIZE &&
- uio->uio_iov->iov_len>=PAGE_SIZE) {
- so_zerocp_stats.size_ok++;
- so_zerocp_stats.align_ok++;
- cow_send = socow_setup(m, uio);
- len = cow_send;
- }
- if (!cow_send) {
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- m = NULL;
- } else {
- len = min(min(MCLBYTES, resid),
space);
- }
- }
-#else /* ZERO_COPY_SOCKETS */
- if (top == NULL) {
- m = m_getcl(M_TRYWAIT, MT_DATA,
M_PKTHDR);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else
- m = m_getcl(M_TRYWAIT, MT_DATA, 0);
- len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
} else {
- if (top == NULL) {
- m = m_gethdr(M_TRYWAIT, MT_DATA);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
-
- len = min(min(MHLEN, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && m && len < MHLEN)
- MH_ALIGN(m, len);
- } else {
- m = m_get(M_TRYWAIT, MT_DATA);
- len = min(min(MLEN, resid), space);
+ error = sosend_copyin(uio, &top, atomic,
+ &space, flags);
+ if (error != 0) {
+ SOCKBUF_LOCK(&so->so_snd);
+ goto release;
}
+ resid = uio->uio_resid;
}
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
}
-
- space -= len;
-#ifdef ZERO_COPY_SOCKETS
- if (cow_send)
- error = 0;
- else
-#endif /* ZERO_COPY_SOCKETS */
- error = uiomove(mtod(m, void *), (int)len, uio);
- resid = uio->uio_resid;
- m->m_len = len;
- *mp = m;
- top->m_pkthdr.len += len;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- mp = &m->m_next;
- if (resid <= 0) {
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- break;
- }
- } while (space > 0 && atomic);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options |= SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- /*
- * XXX all the SBS_CANTSENDMORE checks previously
- * done could be out of date. We could have recieved
- * a reset packet in an interrupt or maybe we slept
- * while doing page faults in uiomove() etc. We could
- * probably recheck again inside the locking protection
- * here, but there are probably other places that this
- * also happens. We must rethink this.
- */
- error = (*so->so_proto->pr_usrreqs->pru_send)(so,
- (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date. We could have recieved
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the locking protection
+ * here, but there are probably other places that this
+ * also happens. We must rethink this.
+ */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
/*
* If the user set MSG_EOF, the protocol
* understands this flag and nothing left to
* send then use PRU_SEND_EOF instead of PRU_SEND.
*/
- ((flags & MSG_EOF) &&
- (so->so_proto->pr_flags & PR_IMPLOPCL) &&
- (resid <= 0)) ?
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
PRUS_EOF :
/* If there is more to send set PRUS_MORETOCOME */
- (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
- top, addr, control, td);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options &= ~SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- clen = 0;
- control = NULL;
- top = NULL;
- mp = &top;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+ if (error) {
+ SOCKBUF_LOCK(&so->so_snd);
+ goto release;
+ }
} while (resid && space > 0);
SOCKBUF_LOCK(&so->so_snd);
} while (resid);
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.242.2.8
diff -u -r1.242.2.8 uipc_socket.c
--- kern/uipc_socket.c 3 Feb 2007 04:01:22 -0000 1.242.2.8
+++ kern/uipc_socket.c 1 Mar 2007 11:27:11 -0000
@@ -584,7 +584,301 @@
return (error);
}
+#ifdef ZERO_COPY_SOCKETS
+struct so_zerocopy_stats{	/* counters kept for ZERO_COPY_SOCKETS sends */
+ int size_ok;	/* incremented in sosend_copyin() when the PAGE_SIZE checks pass */
+ int align_ok;	/* incremented together with size_ok in sosend_copyin() */
+ int found_ifp;	/* not updated anywhere in the code shown in this patch */
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+#include <netinet/in.h>
+#include <net/route.h>
+#include <netinet/in_pcb.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#endif /*ZERO_COPY_SOCKETS*/
+
+/*
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio.  If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * Parameters:
+ *   uio    - source of the data; uio_resid shrinks as data is consumed.
+ *   retmp  - out: head of the mbuf chain built so far (NULL if none).
+ *   atomic - when non-zero, keep adding mbufs until the uio is drained or
+ *            *space is exhausted; when zero, at most one mbuf is filled.
+ *   space  - in/out: remaining room; reduced by each mbuf's length.
+ *   flags  - MSG_* flags; only MSG_EOR is examined here.
+ *
+ * Returns 0 on success, ENOBUFS when an mbuf or cluster cannot be
+ * allocated, or the error returned by uiomove().
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *retmp.  The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+    int flags)
+{
+	struct mbuf *m, **mp, *top;
+	long len, resid;
+	int error;
+#ifdef ZERO_COPY_SOCKETS
+	int cow_send;
+#endif
+
+	*retmp = top = NULL;
+	/* mp always points at the link where the next mbuf gets attached. */
+	mp = &top;
+	len = 0;
+	resid = uio->uio_resid;
+	error = 0;
+	do {
+#ifdef ZERO_COPY_SOCKETS
+		cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+		if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+			if (top == NULL) {
+				MGETHDR(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else {
+				MGET(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+			}
+			if (so_zero_copy_send &&
+			    resid>=PAGE_SIZE &&
+			    *space>=PAGE_SIZE &&
+			    uio->uio_iov->iov_len>=PAGE_SIZE) {
+				so_zerocp_stats.size_ok++;
+				so_zerocp_stats.align_ok++;
+				cow_send = socow_setup(m, uio);
+				len = cow_send;
+			}
+			if (!cow_send) {
+				MCLGET(m, M_TRYWAIT);
+				if ((m->m_flags & M_EXT) == 0) {
+					/* No cluster: fall into ENOBUFS below. */
+					m_free(m);
+					m = NULL;
+				} else {
+					len = min(min(MCLBYTES, resid),
+					    *space);
+				}
+			}
+#else /* ZERO_COPY_SOCKETS */
+			if (top == NULL) {
+				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+				/*
+				 * Defensive: the shared m == NULL check runs
+				 * further down, so do not touch the header of
+				 * a failed allocation here.
+				 */
+				if (m != NULL) {
+					m->m_pkthdr.len = 0;
+					m->m_pkthdr.rcvif = NULL;
+				}
+			} else
+				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+			len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+		} else {
+			if (top == NULL) {
+				m = m_gethdr(M_TRYWAIT, MT_DATA);
+				/* Same defensive guard as the cluster case. */
+				if (m != NULL) {
+					m->m_pkthdr.len = 0;
+					m->m_pkthdr.rcvif = NULL;
+				}
+
+				len = min(min(MHLEN, resid), *space);
+				/*
+				 * For datagram protocols, leave room
+				 * for protocol headers in first mbuf.
+				 */
+				if (atomic && m && len < MHLEN)
+					MH_ALIGN(m, len);
+			} else {
+				m = m_get(M_TRYWAIT, MT_DATA);
+				len = min(min(MLEN, resid), *space);
+			}
+		}
+		if (m == NULL) {
+			error = ENOBUFS;
+			goto out;
+		}
+
+		*space -= len;
+#ifdef ZERO_COPY_SOCKETS
+		if (cow_send)
+			error = 0;
+		else
+#endif /* ZERO_COPY_SOCKETS */
+		error = uiomove(mtod(m, void *), (int)len, uio);
+		resid = uio->uio_resid;
+		m->m_len = len;
+		/*
+		 * Link the mbuf into the chain before looking at the error so
+		 * that a failed copy still leaves it reachable through top.
+		 */
+		*mp = m;
+		top->m_pkthdr.len += len;
+		if (error)
+			goto out;
+		mp = &m->m_next;
+		if (resid <= 0) {
+			if (flags & MSG_EOR)
+				top->m_flags |= M_EOR;
+			break;
+		}
+	} while (*space > 0 && atomic);
+out:
+	*retmp = top;
+	return (error);
+}
+
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+
+/*
+ * sosend_dgram() is a send routine for datagram sockets.  It checks the
+ * socket state and the available send-buffer space while holding the send
+ * buffer lock, drops the lock, gathers the user data into an mbuf chain
+ * with sosend_copyin() (unless the caller supplied the chain in "top"), and
+ * passes the result to the protocol's pru_send() in a single call.
+ *
+ * Returns 0 on success, or:
+ *   EINVAL        resid is negative.
+ *   EPIPE         the send side is marked SBS_CANTSENDMORE.
+ *   so_error      a pending socket error (cleared when reported).
+ *   ENOTCONN /
+ *   EDESTADDRREQ  the socket is not connected and no usable address was
+ *                 supplied.
+ *   EMSGSIZE      the datagram does not fit in the send buffer space.
+ *   other         whatever sosend_copyin() or pru_send() returns.
+ *
+ * On every early-exit path "top" and "control" are freed here.  After
+ * pru_send() has been called the local pointers are cleared, so this
+ * function does not free them.
+ */
+int
+sosend_dgram(so, addr, uio, top, control, flags, td)
+	struct socket *so;
+	struct sockaddr *addr;
+	struct uio *uio;
+	struct mbuf *top;
+	struct mbuf *control;
+	int flags;
+	struct thread *td;
+{
+	long space, resid;
+	int clen = 0, error, dontroute;
+	int atomic = sosendallatonce(so) || top;
+
+	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
+	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+	    ("sosend_dgram: !PR_ATOMIC"));
+
+	if (uio != NULL)
+		resid = uio->uio_resid;
+	else
+		resid = top->m_pkthdr.len;
+	/*
+	 * In theory resid should be unsigned.
+	 * However, space must be signed, as it might be less than 0
+	 * if we over-committed, and we must use a signed comparison
+	 * of space and resid.  Reject a negative resid up front.
+	 */
+	if (resid < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	dontroute =
+	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+	if (td != NULL)
+		td->td_proc->p_stats->p_ru.ru_msgsnd++;
+	if (control != NULL)
+		clen = control->m_len;
+
+	/*
+	 * The "out" label does not touch the send buffer lock, so every
+	 * error exit below must drop it before jumping.
+	 */
+	SOCKBUF_LOCK(&so->so_snd);
+	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+		SOCKBUF_UNLOCK(&so->so_snd);
+		error = EPIPE;
+		goto out;
+	}
+	if (so->so_error) {
+		error = so->so_error;
+		so->so_error = 0;
+		SOCKBUF_UNLOCK(&so->so_snd);
+		goto out;
+	}
+	if ((so->so_state & SS_ISCONNECTED) == 0) {
+		/*
+		 * `sendto' and `sendmsg' is allowed on a connection-
+		 * based socket if it supports implied connect.
+		 * Return ENOTCONN if not connected and no address is
+		 * supplied.
+		 */
+		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+			    !(resid == 0 && clen != 0)) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = ENOTCONN;
+				goto out;
+			}
+		} else if (addr == NULL) {
+			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+				error = ENOTCONN;
+			else
+				error = EDESTADDRREQ;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto out;
+		}
+	}
+
+	/*
+	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
+	 * problem and need fixing.
+	 */
+	space = sbspace(&so->so_snd);
+	if (flags & MSG_OOB)
+		space += 1024;
+	space -= clen;
+	/*
+	 * Drop the send buffer lock before the size check: bailing out to
+	 * "out" with the lock still held would leak it.
+	 */
+	SOCKBUF_UNLOCK(&so->so_snd);
+	if (resid > space) {
+		error = EMSGSIZE;
+		goto out;
+	}
+	if (uio == NULL) {
+		/* Data is prepackaged in "top". */
+		resid = 0;
+		if (flags & MSG_EOR)
+			top->m_flags |= M_EOR;
+	} else {
+		error = sosend_copyin(uio, &top, atomic, &space, flags);
+		if (error)
+			goto out;
+		resid = uio->uio_resid;
+	}
+	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+	/*
+	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+	 * than with.
+	 */
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options |= SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	/*
+	 * XXX all the SBS_CANTSENDMORE checks previously
+	 * done could be out of date.  We could have received
+	 * a reset packet in an interrupt or maybe we slept
+	 * while doing page faults in uiomove() etc.  We could
+	 * probably recheck again inside the locking protection
+	 * here, but there are probably other places that this
+	 * also happens.  We must rethink this.
+	 */
+	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+	    (flags & MSG_OOB) ? PRUS_OOB :
+	/*
+	 * If the user set MSG_EOF, the protocol
+	 * understands this flag and nothing left to
+	 * send then use PRU_SEND_EOF instead of PRU_SEND.
+	 */
+	    ((flags & MSG_EOF) &&
+	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+	     (resid <= 0)) ?
+		PRUS_EOF :
+		/* If there is more to send set PRUS_MORETOCOME */
+		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+	    top, addr, control, td);
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options &= ~SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	/* Cleared so the cleanup at "out" does not free them. */
+	clen = 0;
+	control = NULL;
+	top = NULL;
+out:
+	if (top != NULL)
+		m_freem(top);
+	if (control != NULL)
+		m_freem(control);
+	return (error);
+}
+
/*
* Send on a socket.
* If send must go all at once and message is larger than
@@ -602,22 +896,7 @@
* must check for short counts if EINTR/ERESTART are returned.
* Data and control buffers are freed on return.
*/
-
-#ifdef ZERO_COPY_SOCKETS
-struct so_zerocopy_stats{
- int size_ok;
- int align_ok;
- int found_ifp;
-};
-struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
-#include <netinet/in.h>
-#include <net/route.h>
-#include <netinet/in_pcb.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
-
+#define snderr(errno) { error = (errno); goto release; }
int
sosend(so, addr, uio, top, control, flags, td)
struct socket *so;
@@ -628,14 +907,9 @@
int flags;
struct thread *td;
{
- struct mbuf **mp;
- struct mbuf *m;
- long space, len = 0, resid;
+ long space, resid;
int clen = 0, error, dontroute;
int atomic = sosendallatonce(so) || top;
-#ifdef ZERO_COPY_SOCKETS
- int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
if (uio != NULL)
resid = uio->uio_resid;
@@ -663,7 +937,6 @@
td->td_proc->p_stats->p_ru.ru_msgsnd++;
if (control != NULL)
clen = control->m_len;
-#define snderr(errno) { error = (errno); goto release; }
SOCKBUF_LOCK(&so->so_snd);
restart:
@@ -713,153 +986,61 @@
goto restart;
}
SOCKBUF_UNLOCK(&so->so_snd);
- mp = &top;
space -= clen;
do {
- if (uio == NULL) {
- /*
- * Data is prepackaged in "top".
- */
- resid = 0;
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- } else do {
-#ifdef ZERO_COPY_SOCKETS
- cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
- if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
- if (top == NULL) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- }
- if (so_zero_copy_send &&
- resid>=PAGE_SIZE &&
- space>=PAGE_SIZE &&
- uio->uio_iov->iov_len>=PAGE_SIZE) {
- so_zerocp_stats.size_ok++;
- so_zerocp_stats.align_ok++;
- cow_send = socow_setup(m, uio);
- len = cow_send;
- }
- if (!cow_send) {
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- m = NULL;
- } else {
- len = min(min(MCLBYTES, resid),
space);
- }
- }
-#else /* ZERO_COPY_SOCKETS */
- if (top == NULL) {
- m = m_getcl(M_TRYWAIT, MT_DATA,
M_PKTHDR);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else
- m = m_getcl(M_TRYWAIT, MT_DATA, 0);
- len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
} else {
- if (top == NULL) {
- m = m_gethdr(M_TRYWAIT, MT_DATA);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
-
- len = min(min(MHLEN, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && m && len < MHLEN)
- MH_ALIGN(m, len);
- } else {
- m = m_get(M_TRYWAIT, MT_DATA);
- len = min(min(MLEN, resid), space);
+ error = sosend_copyin(uio, &top, atomic,
+ &space, flags);
+ if (error != 0) {
+ SOCKBUF_LOCK(&so->so_snd);
+ goto release;
}
+ resid = uio->uio_resid;
}
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
-
- space -= len;
-#ifdef ZERO_COPY_SOCKETS
- if (cow_send)
- error = 0;
- else
-#endif /* ZERO_COPY_SOCKETS */
- error = uiomove(mtod(m, void *), (int)len, uio);
- resid = uio->uio_resid;
- m->m_len = len;
- *mp = m;
- top->m_pkthdr.len += len;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- mp = &m->m_next;
- if (resid <= 0) {
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- break;
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
}
- } while (space > 0 && atomic);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options |= SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- /*
- * XXX all the SBS_CANTSENDMORE checks previously
- * done could be out of date. We could have recieved
- * a reset packet in an interrupt or maybe we slept
- * while doing page faults in uiomove() etc. We could
- * probably recheck again inside the locking protection
- * here, but there are probably other places that this
- * also happens. We must rethink this.
- */
- error = (*so->so_proto->pr_usrreqs->pru_send)(so,
- (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date. We could have recieved
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the locking protection
+ * here, but there are probably other places that this
+ * also happens. We must rethink this.
+ */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
/*
* If the user set MSG_EOF, the protocol
* understands this flag and nothing left to
* send then use PRU_SEND_EOF instead of PRU_SEND.
*/
- ((flags & MSG_EOF) &&
- (so->so_proto->pr_flags & PR_IMPLOPCL) &&
- (resid <= 0)) ?
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
PRUS_EOF :
/* If there is more to send set PRUS_MORETOCOME */
- (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
- top, addr, control, td);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options &= ~SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- clen = 0;
- control = NULL;
- top = NULL;
- mp = &top;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+ if (error) {
+ SOCKBUF_LOCK(&so->so_snd);
+ goto release;
+ }
} while (resid && space > 0);
SOCKBUF_LOCK(&so->so_snd);
} while (resid);
@@ -877,6 +1058,7 @@
m_freem(control);
return (error);
}
+#undef snderr
/*
* The part of soreceive() that implements reading non-inline out-of-band
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.175.2.9
diff -u -r1.175.2.9 udp_usrreq.c
--- netinet/udp_usrreq.c 29 Dec 2006 19:25:49 -0000 1.175.2.9
+++ netinet/udp_usrreq.c 1 Mar 2007 11:27:34 -0000
@@ -1150,6 +1150,7 @@
.pru_disconnect = udp_disconnect,
.pru_peeraddr = udp_peeraddr,
.pru_send = udp_send,
+ .pru_sosend = sosend_dgram,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = udp_sockaddr,
.pru_sosetlabel = in_pcbsosetlabel
_______________________________________________
freebsd-performance@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-performance
To unsubscribe, send any mail to "[EMAIL PROTECTED]"