Module Name:	src
Committed By:	ad
Date:		Mon Dec 16 19:17:25 UTC 2019
Modified Files:
	src/sys/arch/x86/x86: x86_tlb.c

Log Message:
Align the TLB packet precisely on the stack, and do up to 7 INVLPGs,
since that is what fits in a single cache line.

To generate a diff of this commit:
cvs rdiff -u -r1.12 -r1.13 src/sys/arch/x86/x86/x86_tlb.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
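As a sanity check of the sizes quoted in the updated comment, here is an
illustrative sketch (not part of the commit; the position of tp_done is
inferred from the ts->tp_done references in the diff below, and 64 bytes
is the usual x86 cache-line size, COHERENCY_UNIT):

#include <stdint.h>

/* Mirror of the pmap_tlb_packet_t layout in the diff below. */
typedef struct {
	uintptr_t	tp_va[7];	/* 7 * 8 = 56 bytes on amd64, 7 * 4 = 28 on i386 */
	uint8_t		tp_count;
	uint8_t		tp_userpmap;
	uint8_t		tp_global;
	uint8_t		tp_done;	/* inferred from ts->tp_done in the diff */
} pmap_tlb_packet_t;

/*
 * The fields total 7*8 + 4 = 60 bytes on amd64 (sizeof() rounds up to
 * 64 for uintptr_t alignment) and 7*4 + 4 = 32 bytes on i386, matching
 * the updated comment: either way the packet fits in one 64-byte cache
 * line, while an 8th VA slot would spill the amd64 packet into a
 * second line.
 */
_Static_assert(sizeof(pmap_tlb_packet_t) <= 64,
    "packet spills into a second cache line");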
Modified files:

Index: src/sys/arch/x86/x86/x86_tlb.c
diff -u src/sys/arch/x86/x86/x86_tlb.c:1.12 src/sys/arch/x86/x86/x86_tlb.c:1.13
--- src/sys/arch/x86/x86/x86_tlb.c:1.12	Mon Dec  2 20:59:56 2019
+++ src/sys/arch/x86/x86/x86_tlb.c	Mon Dec 16 19:17:25 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: x86_tlb.c,v 1.12 2019/12/02 20:59:56 pgoyette Exp $	*/
+/*	$NetBSD: x86_tlb.c,v 1.13 2019/12/16 19:17:25 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008-2019 The NetBSD Foundation, Inc.
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.12 2019/12/02 20:59:56 pgoyette Exp $");
+__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.13 2019/12/16 19:17:25 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -66,10 +66,10 @@ __KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v
  * until the request is completed.  This keeps the cache line in the shared
  * state, and bus traffic to a minimum.
  *
- * On i386 the packet is 28 bytes in size.  On amd64 it's 52 bytes.
+ * On i386 the packet is 32 bytes in size.  On amd64 it's 60 bytes.
  */
 typedef struct {
-	uintptr_t		tp_va[6];
+	uintptr_t		tp_va[7];
 	uint8_t			tp_count;
 	uint8_t			tp_userpmap;
 	uint8_t			tp_global;
@@ -77,23 +77,14 @@ typedef struct {
 } pmap_tlb_packet_t;
 
 /*
- * Padded packet stored on the initiator's stack.
- */
-typedef struct {
-	uint8_t			ts_pad1[COHERENCY_UNIT];
-	pmap_tlb_packet_t	ts_tp;
-	uint8_t			ts_pad2[COHERENCY_UNIT];
-} pmap_tlb_stackbuf_t;
-
-/*
  * No more than N separate invlpg.
  *
- * Statistically, a value of six is big enough to cover the requested number
+ * Statistically, a value of 7 is big enough to cover the requested number
  * of pages in ~ 95% of the TLB shootdowns we are getting. We therefore rarely
  * reach the limit, and increasing it can actually reduce the performance due
  * to the high cost of invlpg.
  */
-#define	TP_MAXVA	6	/* for individual mappings */
+#define	TP_MAXVA	7	/* for individual mappings */
 #define	TP_ALLVA	255	/* special: shoot all mappings */
 
 /*
@@ -355,8 +346,8 @@ pmap_tlb_processpacket(volatile pmap_tlb
 void
 pmap_tlb_shootnow(void)
 {
-	volatile pmap_tlb_packet_t *tp;
-	volatile pmap_tlb_stackbuf_t ts;
+	volatile pmap_tlb_packet_t *tp, *ts;
+	volatile uint8_t stackbuf[128];
 	struct cpu_info *ci;
 	kcpuset_t *target;
 	u_int local, rcpucount;
@@ -405,11 +396,13 @@ pmap_tlb_shootnow(void)
 	 * against an interrupt on the current CPU trying the same.
 	 */
 	KASSERT(rcpucount < ncpu);
-	ts.ts_tp = *tp;
-	KASSERT(!ts.ts_tp.tp_done);
+	KASSERT(sizeof(*ts) <= (sizeof(stackbuf) / 2));
+	ts = (void *)roundup2((uintptr_t)stackbuf, (sizeof(stackbuf) / 2));
+	*ts = *tp;
+	KASSERT(!ts->tp_done);
 	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
-	    __UNVOLATILE(&ts.ts_tp)) != NULL) {
-		KASSERT(pmap_tlb_packet != &ts.ts_tp);
+	    __UNVOLATILE(ts)) != NULL) {
+		KASSERT(pmap_tlb_packet != ts);
 		/*
 		 * Don't bother with exponentional backoff, as the pointer
 		 * is in a dedicated cache line and only updated twice per
@@ -439,7 +432,7 @@ pmap_tlb_shootnow(void)
 	 */
 	pmap_tlb_pendcount = rcpucount;
 	pmap_tlb_evcnt.ev_count++;
-	pmap_tlb_processpacket(&ts.ts_tp, target);
+	pmap_tlb_processpacket(ts, target);
 
 	/*
 	 * Clear out the local CPU's buffer for the next user.  Once done,
@@ -461,7 +454,7 @@ pmap_tlb_shootnow(void)
 	 * perform local shootdown if needed, using our copy of the packet.
	 */
 	if (local) {
-		pmap_tlb_invalidate(&ts.ts_tp);
+		pmap_tlb_invalidate(ts);
 	}
 
 	/*
@@ -470,7 +463,7 @@ pmap_tlb_shootnow(void)
 	 * CPU out will update it and only we are reading it).  No memory
 	 * barrier required due to prior stores - yay x86.
 	 */
-	while (!ts.ts_tp.tp_done) {
+	while (!ts->tp_done) {
 		x86_pause();
 	}
 }
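The padded pmap_tlb_stackbuf_t wrapper is replaced by an open-coded
alignment: a stack buffer of twice the cache-line size is declared, and
roundup2() picks the first 64-byte-aligned address inside it, so
sizeof(stackbuf) / 2 serves as both the alignment and the bound checked
by the new KASSERT. A minimal userland sketch of the same trick
(roundup2() is spelled out here; in the kernel it comes from
<sys/param.h>):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Same shape as the kernel's roundup2(): round x up to the next
 * multiple of m, where m must be a power of two.
 */
#define roundup2(x, m)	(((x) + ((m) - 1)) & ~(((uintptr_t)(m)) - 1))

int
main(void)
{
	/*
	 * Twice the alignment: wherever the compiler places the buffer,
	 * a 64-byte-aligned span of at least 64 bytes lies inside it.
	 */
	uint8_t stackbuf[128];
	uint8_t *p;

	p = (uint8_t *)roundup2((uintptr_t)stackbuf, sizeof(stackbuf) / 2);

	assert(((uintptr_t)p & 63) == 0);		/* line-aligned */
	assert(p + 64 <= stackbuf + sizeof(stackbuf));	/* in bounds */
	printf("stackbuf=%p aligned=%p\n", (void *)stackbuf, (void *)p);
	return 0;
}

Doubling the buffer rather than relying on a compiler alignment
attribute keeps the guarantee independent of how the stack frame
happens to be laid out.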