Module Name:    src
Committed By:   ad
Date:           Mon Dec 16 19:17:25 UTC 2019

Modified Files:
        src/sys/arch/x86/x86: x86_tlb.c

Log Message:
Align the TLB packet precisely on the stack, and do up to 7 INVLPGs, since
that's what fits in a single cache line.
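For illustration, here is a minimal userland sketch of the alignment trick
this change uses in pmap_tlb_shootnow(): reserve twice a cache line's worth
of stack, then round the pointer up to the next line boundary so the packet
never straddles two lines.  CACHE_LINE, ROUNDUP2 and struct packet below are
stand-ins for COHERENCY_UNIT, roundup2() and pmap_tlb_packet_t; this is only
a sketch of the idea under those assumptions, not the kernel code itself.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE      64      /* assumed x86 coherency unit */

/* Same idea as roundup2() from <sys/param.h>: round up to a power of two. */
#define ROUNDUP2(x, m)  (((x) + ((m) - 1)) & ~(uintptr_t)((m) - 1))

/* Stand-in for pmap_tlb_packet_t with the widened 7-entry VA array. */
struct packet {
        uintptr_t       va[7];          /* 28 bytes on i386, 56 on amd64 */
        uint8_t         count;
        uint8_t         userpmap;
        uint8_t         global;
        uint8_t         done;
};

int
main(void)
{
        uint8_t stackbuf[CACHE_LINE * 2];
        struct packet *p;

        /* The packet must fit in one line, or the trick buys nothing. */
        assert(sizeof(*p) <= CACHE_LINE);

        /* Round up inside the oversized buffer to the next line boundary. */
        p = (struct packet *)ROUNDUP2((uintptr_t)stackbuf, CACHE_LINE);

        assert(((uintptr_t)p & (CACHE_LINE - 1)) == 0);
        assert((uint8_t *)(p + 1) <= stackbuf + sizeof(stackbuf));

        printf("packet at %p, %zu bytes, cache line aligned\n",
            (void *)p, sizeof(*p));
        return 0;
}

The real code keeps the buffer at 128 bytes and aligns to half its size, so
the same two-line reservation works whether or not the stack slot happens to
start on a line boundary.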


To generate a diff of this commit:
cvs rdiff -u -r1.12 -r1.13 src/sys/arch/x86/x86/x86_tlb.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/x86/x86_tlb.c
diff -u src/sys/arch/x86/x86/x86_tlb.c:1.12 src/sys/arch/x86/x86/x86_tlb.c:1.13
--- src/sys/arch/x86/x86/x86_tlb.c:1.12	Mon Dec  2 20:59:56 2019
+++ src/sys/arch/x86/x86/x86_tlb.c	Mon Dec 16 19:17:25 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: x86_tlb.c,v 1.12 2019/12/02 20:59:56 pgoyette Exp $	*/
+/*	$NetBSD: x86_tlb.c,v 1.13 2019/12/16 19:17:25 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008-2019 The NetBSD Foundation, Inc.
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.12 2019/12/02 20:59:56 pgoyette Exp $");
+__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.13 2019/12/16 19:17:25 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -66,10 +66,10 @@ __KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 
  * until the request is completed.  This keeps the cache line in the shared
  * state, and bus traffic to a minimum.
  *
- * On i386 the packet is 28 bytes in size.  On amd64 it's 52 bytes.
+ * On i386 the packet is 32 bytes in size.  On amd64 it's 60 bytes.
  */
 typedef struct {
-	uintptr_t		tp_va[6];
+	uintptr_t		tp_va[7];
 	uint8_t			tp_count;
 	uint8_t			tp_userpmap;
 	uint8_t			tp_global;
@@ -77,23 +77,14 @@ typedef struct {
 } pmap_tlb_packet_t;
 
 /*
- * Padded packet stored on the initiator's stack.
- */
-typedef struct {
-	uint8_t			ts_pad1[COHERENCY_UNIT];
-	pmap_tlb_packet_t	ts_tp;
-	uint8_t			ts_pad2[COHERENCY_UNIT];
-} pmap_tlb_stackbuf_t;
-
-/*
  * No more than N separate invlpg.
  *
- * Statistically, a value of six is big enough to cover the requested number
+ * Statistically, a value of 7 is big enough to cover the requested number
  * of pages in ~ 95% of the TLB shootdowns we are getting. We therefore rarely
  * reach the limit, and increasing it can actually reduce the performance due
  * to the high cost of invlpg.
  */
-#define	TP_MAXVA		6	/* for individual mappings */
+#define	TP_MAXVA		7	/* for individual mappings */
 #define	TP_ALLVA		255	/* special: shoot all mappings */
 
 /*
@@ -355,8 +346,8 @@ pmap_tlb_processpacket(volatile pmap_tlb
 void
 pmap_tlb_shootnow(void)
 {
-	volatile pmap_tlb_packet_t *tp;
-	volatile pmap_tlb_stackbuf_t ts;
+	volatile pmap_tlb_packet_t *tp, *ts;
+	volatile uint8_t stackbuf[128];
 	struct cpu_info *ci;
 	kcpuset_t *target;
 	u_int local, rcpucount;
@@ -405,11 +396,13 @@ pmap_tlb_shootnow(void)
 	 * against an interrupt on the current CPU trying the same.
 	 */
 	KASSERT(rcpucount < ncpu);
-	ts.ts_tp = *tp;
-	KASSERT(!ts.ts_tp.tp_done);
+	KASSERT(sizeof(*ts) <= (sizeof(stackbuf) / 2));
+	ts = (void *)roundup2((uintptr_t)stackbuf, (sizeof(stackbuf) / 2));
+	*ts = *tp;
+	KASSERT(!ts->tp_done);
 	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
-	    __UNVOLATILE(&ts.ts_tp)) != NULL) {
-		KASSERT(pmap_tlb_packet != &ts.ts_tp);
+	    __UNVOLATILE(ts)) != NULL) {
+		KASSERT(pmap_tlb_packet != ts);
 		/*
 		 * Don't bother with exponential backoff, as the pointer
 		 * is in a dedicated cache line and only updated twice per
@@ -439,7 +432,7 @@ pmap_tlb_shootnow(void)
 	 */
 	pmap_tlb_pendcount = rcpucount;
 	pmap_tlb_evcnt.ev_count++;
-	pmap_tlb_processpacket(&ts.ts_tp, target);
+	pmap_tlb_processpacket(ts, target);
 
 	/*
 	 * Clear out the local CPU's buffer for the next user.  Once done,
@@ -461,7 +454,7 @@ pmap_tlb_shootnow(void)
 	 * perform local shootdown if needed, using our copy of the packet.
 	 */
 	if (local) {
-		pmap_tlb_invalidate(&ts.ts_tp);
+		pmap_tlb_invalidate(ts);
 	}
 
 	/*
@@ -470,7 +463,7 @@ pmap_tlb_shootnow(void)
 	 * CPU out will update it and only we are reading it).  No memory
 	 * barrier required due to prior stores - yay x86.
 	 */
-	while (!ts.ts_tp.tp_done) {
+	while (!ts->tp_done) {
 		x86_pause();
 	}
 }
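The size figures in the updated comment follow directly from the widened
array: seven pointer-sized VAs plus the four one-byte fields give
7*4 + 4 = 32 bytes on i386 and 7*8 + 4 = 60 bytes on amd64, so the packet
still occupies a single 64-byte cache line (the x86 COHERENCY_UNIT, assuming
the usual 64-byte line size) once it is aligned as above.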
