Module Name:	src
Committed By:	bouyer
Date:		Sat Apr 25 13:16:48 UTC 2020
Modified Files:
	src/sys/arch/x86/conf [bouyer-xenpvh]: files.x86
	src/sys/arch/x86/x86 [bouyer-xenpvh]: cpu.c i8259.c
	src/sys/arch/xen/conf [bouyer-xenpvh]: files.xen
	src/sys/arch/xen/x86 [bouyer-xenpvh]: hypervisor_machdep.c xen_intr.c
	    xen_ipi.c
	src/sys/arch/xen/xen [bouyer-xenpvh]: xbdback_xenbus.c

Log Message:
sync with bouyer-xenpvh-base2 (HEAD)


To generate a diff of this commit:
cvs rdiff -u -r1.107.10.2 -r1.107.10.3 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.181.4.4 -r1.181.4.5 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.23.10.2 -r1.23.10.3 src/sys/arch/x86/x86/i8259.c
cvs rdiff -u -r1.180.2.7 -r1.180.2.8 src/sys/arch/xen/conf/files.xen
cvs rdiff -u -r1.36.8.6 -r1.36.8.7 src/sys/arch/xen/x86/hypervisor_machdep.c
cvs rdiff -u -r1.21.2.9 -r1.21.2.10 src/sys/arch/xen/x86/xen_intr.c
cvs rdiff -u -r1.35.6.6 -r1.35.6.7 src/sys/arch/xen/x86/xen_ipi.c
cvs rdiff -u -r1.77.2.3 -r1.77.2.4 src/sys/arch/xen/xen/xbdback_xenbus.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:

Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.107.10.2 src/sys/arch/x86/conf/files.x86:1.107.10.3
--- src/sys/arch/x86/conf/files.x86:1.107.10.2	Thu Apr 16 08:46:34 2020
+++ src/sys/arch/x86/conf/files.x86	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.x86,v 1.107.10.2 2020/04/16 08:46:34 bouyer Exp $
+#	$NetBSD: files.x86,v 1.107.10.3 2020/04/25 13:16:48 bouyer Exp $
 
 # options for MP configuration through the MP spec
 defflag	opt_mpbios.h		MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -59,7 +59,7 @@ device	odcm
 attach	odcm at cpufeaturebus
 file	arch/x86/x86/odcm.c		odcm
 
-device	padlock: opencrypto
+device	padlock: opencrypto, rijndael
 attach	padlock at cpufeaturebus
 file	arch/x86/x86/via_padlock.c	padlock
 
@@ -91,6 +91,7 @@ file	arch/x86/x86/efi.c		machdep
 file	arch/x86/x86/errata.c		machdep
 file	arch/x86/x86/genfb_machdep.c	machdep
 file	arch/x86/x86/identcpu.c		machdep
+file	arch/x86/x86/identcpu_subr.c	machdep
 file	arch/x86/x86/i8259.c		machdep & (!xenpv | dom0ops)
 file	arch/x86/x86/intr.c		machdep & !xenpv
 file	arch/x86/x86/x86_softintr.c	machdep

Index: src/sys/arch/x86/x86/cpu.c
diff -u src/sys/arch/x86/x86/cpu.c:1.181.4.4 src/sys/arch/x86/x86/cpu.c:1.181.4.5
--- src/sys/arch/x86/x86/cpu.c:1.181.4.4	Mon Apr 20 11:29:00 2020
+++ src/sys/arch/x86/x86/cpu.c	Sat Apr 25 13:16:48 2020
@@ -1,7 +1,7 @@
-/*	$NetBSD: cpu.c,v 1.181.4.4 2020/04/20 11:29:00 bouyer Exp $	*/
+/*	$NetBSD: cpu.c,v 1.181.4.5 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*
- * Copyright (c) 2000-2012 NetBSD Foundation, Inc.
+ * Copyright (c) 2000-2020 NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.181.4.4 2020/04/20 11:29:00 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.181.4.5 2020/04/25 13:16:48 bouyer Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mpbios.h"		/* for MPDEBUG */
@@ -73,6 +73,7 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.18
 #include "lapic.h"
 #include "ioapic.h"
 #include "acpica.h"
+#include "hpet.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
@@ -119,6 +120,7 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.18
 #endif
 
 #include <dev/ic/mc146818reg.h>
+#include <dev/ic/hpetvar.h>
 #include <i386/isa/nvram.h>
 #include <dev/isa/isareg.h>
 
@@ -202,6 +204,8 @@ static vaddr_t cmos_data_mapping;
 #endif
 struct cpu_info *cpu_starting;
 
+int (*cpu_nullop_ptr)(void *) = nullop;
+
 #ifdef MULTIPROCESSOR
 void		cpu_hatch(void *);
 static void	cpu_boot_secondary(struct cpu_info *ci);
@@ -433,8 +437,11 @@ cpu_attach(device_t parent, device_t sel
 	 * must be done to allow booting other processors.
 	 */
 	if (!again) {
-		atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
+		/* Make sure DELAY() (likely i8254_delay()) is initialized. */
+		DELAY(1);
+
+		/* Basic init. */
+		atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
 		cpu_intr_init(ci);
 		cpu_get_tsc_freq(ci);
 		cpu_init(ci);
@@ -451,8 +458,6 @@ cpu_attach(device_t parent, device_t sel
 			lapic_calibrate_timer(ci);
 		}
 #endif
-		/* Make sure DELAY() is initialized. */
-		DELAY(1);
 		kcsan_cpu_init(ci);
 		again = true;
 	}
@@ -718,7 +723,6 @@ cpu_init(struct cpu_info *ci)
 
 	if (ci != &cpu_info_primary) {
 		/* Synchronize TSC */
-		wbinvd();
 		atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
 		tsc_sync_ap(ci);
 	} else {
@@ -734,6 +738,14 @@ cpu_boot_secondary_processors(void)
 	kcpuset_t *cpus;
 	u_long i;
 
+#if NHPET > 0
+	/* Use HPET delay, and re-calibrate TSC on boot CPU using HPET. */
+	if (hpet_delay_p() && x86_delay == i8254_delay) {
+		delay_func = x86_delay = hpet_delay;
+		cpu_get_tsc_freq(curcpu());
+	}
+#endif
+
 	/* Now that we know the number of CPUs, patch the text segment. */
 	x86_patch(false);
 
@@ -842,7 +854,6 @@ cpu_start_secondary(struct cpu_info *ci)
 		 */
 		psl = x86_read_psl();
 		x86_disable_intr();
-		wbinvd();
 		tsc_sync_bp(ci);
 		x86_write_psl(psl);
 	}
@@ -873,7 +884,6 @@ cpu_boot_secondary(struct cpu_info *ci)
 		drift = ci->ci_data.cpu_cc_skew;
 		psl = x86_read_psl();
 		x86_disable_intr();
-		wbinvd();
 		tsc_sync_bp(ci);
 		x86_write_psl(psl);
 		drift -= ci->ci_data.cpu_cc_skew;
@@ -919,7 +929,6 @@ cpu_hatch(void *v)
 	 * Synchronize the TSC for the first time. Note that interrupts are
 	 * off at this point.
 	 */
-	wbinvd();
 	atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
 	tsc_sync_ap(ci);
 
@@ -1306,16 +1315,45 @@ cpu_shutdown(device_t dv, int how)
 	return cpu_stop(dv);
 }
 
+/* Get the TSC frequency and set it to ci->ci_data.cpu_cc_freq. */
 void
 cpu_get_tsc_freq(struct cpu_info *ci)
 {
-	uint64_t last_tsc;
+	uint64_t freq = 0, t0, t1;
+	int64_t overhead;
+
+	if (cpu_hascounter())
+		freq = cpu_tsc_freq_cpuid(ci);
+
+	if (freq != 0) {
+		/* Use TSC frequency taken from CPUID. */
+		ci->ci_data.cpu_cc_freq = freq;
+	} else {
+		/*
+		 * Work out the approximate overhead involved below.
+		 * Discard the result of the first go around the loop.
+		 */
+		overhead = 0;
+		for (int i = 0; i <= 8; i++) {
+			__insn_barrier();
+			t0 = cpu_counter_serializing();
+			(*cpu_nullop_ptr)(NULL);
+			t1 = cpu_counter_serializing();
+			__insn_barrier();
+			if (i > 0) {
+				overhead += (t1 - t0);
+			}
+		}
+		overhead >>= 3;
 
-	if (cpu_hascounter()) {
-		last_tsc = cpu_counter_serializing();
+		/* Now warm up x86_delay() and do the calibration. */
+		x86_delay(1);
+		__insn_barrier();
+		t0 = cpu_counter_serializing();
 		x86_delay(100000);
-		ci->ci_data.cpu_cc_freq =
-		    (cpu_counter_serializing() - last_tsc) * 10;
+		t1 = cpu_counter_serializing();
+		__insn_barrier();
+		ci->ci_data.cpu_cc_freq = (t1 - t0 - overhead) * 10;
 	}
 }
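The reworked cpu_get_tsc_freq() above prefers the CPUID-reported frequency
and only falls back to timed calibration, first measuring the cost of an
empty timed section and subtracting it. Below is a rough userland sketch of
that fallback, not part of the commit: rdtsc_serializing() and delay_us()
stand in for cpu_counter_serializing() and x86_delay(), and nanosleep()
replaces the kernel's calibrated busy-wait.

	#include <stdint.h>
	#include <time.h>

	/* Serializing TSC read: fence, then rdtsc (x86 only). */
	static inline uint64_t
	rdtsc_serializing(void)
	{
		uint32_t lo, hi;
		__asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));
		return ((uint64_t)hi << 32) | lo;
	}

	/* Userland stand-in for x86_delay(); the kernel busy-waits. */
	static void
	delay_us(unsigned int us)
	{
		struct timespec ts = { us / 1000000, (us % 1000000) * 1000L };
		nanosleep(&ts, NULL);
	}

	static int nullop(void *v) { (void)v; return 0; }
	static int (*nullop_ptr)(void *) = nullop;	/* defeat inlining */

	uint64_t
	estimate_tsc_freq(void)
	{
		uint64_t t0, t1;
		int64_t overhead = 0;

		/* Average the empty-section cost; drop the cold first run. */
		for (int i = 0; i <= 8; i++) {
			t0 = rdtsc_serializing();
			(*nullop_ptr)(NULL);
			t1 = rdtsc_serializing();
			if (i > 0)
				overhead += t1 - t0;
		}
		overhead >>= 3;		/* eight samples kept */

		/* Warm up the delay path, then time 100 ms. */
		delay_us(1);
		t0 = rdtsc_serializing();
		delay_us(100000);
		t1 = rdtsc_serializing();

		/* Cycles per 0.1 s, minus overhead, scaled to Hz. */
		return (t1 - t0 - overhead) * 10;
	}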
Index: src/sys/arch/x86/x86/i8259.c
diff -u src/sys/arch/x86/x86/i8259.c:1.23.10.2 src/sys/arch/x86/x86/i8259.c:1.23.10.3
--- src/sys/arch/x86/x86/i8259.c:1.23.10.2	Sun Apr 19 19:39:10 2020
+++ src/sys/arch/x86/x86/i8259.c	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: i8259.c,v 1.23.10.2 2020/04/19 19:39:10 bouyer Exp $	*/
+/*	$NetBSD: i8259.c,v 1.23.10.3 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*
  * Copyright 2002 (c) Wasabi Systems, Inc.
@@ -70,9 +70,9 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: i8259.c,v 1.23.10.2 2020/04/19 19:39:10 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: i8259.c,v 1.23.10.3 2020/04/25 13:16:48 bouyer Exp $");
 
-#include <sys/param.h> 
+#include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
@@ -83,7 +83,7 @@ __KERNEL_RCSID(0, "$NetBSD: i8259.c,v 1.
 
 #include <dev/ic/i8259reg.h>
 #include <machine/pio.h>
-#include <machine/cpufunc.h> 
+#include <machine/cpufunc.h>
 #include <machine/cpu.h>
 #include <machine/pic.h>
 #include <machine/i8259.h>
@@ -166,7 +166,7 @@ i8259_default_setup(void)
 		/* reset; program device, level-triggered, four bytes */
 		outb(IO_ICU2 + PIC_ICW1, ICW1_SELECT | ICW1_LTIM | ICW1_IC4);
 	else
-#endif 
+#endif
 		/* reset; program device, four bytes */
 		outb(IO_ICU2 + PIC_ICW1, ICW1_SELECT | ICW1_IC4);

Index: src/sys/arch/xen/conf/files.xen
diff -u src/sys/arch/xen/conf/files.xen:1.180.2.7 src/sys/arch/xen/conf/files.xen:1.180.2.8
--- src/sys/arch/xen/conf/files.xen:1.180.2.7	Sat Apr 25 10:52:26 2020
+++ src/sys/arch/xen/conf/files.xen	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.xen,v 1.180.2.7 2020/04/25 10:52:26 bouyer Exp $
+#	$NetBSD: files.xen,v 1.180.2.8 2020/04/25 13:16:48 bouyer Exp $
 
 defflag	opt_xen.h			XEN XENPVH XENPVHVM PAE
 
@@ -10,37 +10,3 @@ file	arch/xen/xen/xengnt.c		xen
 file	arch/xen/x86/xen_mainbus.c	xen
 file	arch/xen/xen/xen_clock.c	xen
 file	arch/xen/x86/xen_bus_dma.c	xen
-
-define hypervisorbus {}
-define xendevbus {}
-
-# Xen hypervisor
-device	hypervisor { [apid = -1]}: isabus, pcibus, sysmon_power, xendevbus, acpibus
-attach	hypervisor at hypervisorbus
-file	arch/xen/xen/hypervisor.c	hypervisor needs-flag
-file	arch/xen/xen/shutdown_xenbus.c	hypervisor
-
-# Xenbus
-device	xenbus {[id = -1]}
-attach	xenbus at xendevbus
-file	arch/xen/xenbus/xenbus_client.c	xenbus needs-flag
-file	arch/xen/xenbus/xenbus_comms.c	xenbus needs-flag
-file	arch/xen/xenbus/xenbus_dev.c	xenbus needs-flag
-file	arch/xen/xenbus/xenbus_probe.c	xenbus needs-flag
-file	arch/xen/xenbus/xenbus_xs.c	xenbus needs-flag
-
-# Xen console support
-device	xencons: tty
-attach	xencons at xendevbus
-file	arch/xen/xen/xencons.c		xencons needs-flag
-
-# Xen Network driver
-device	xennet: arp, ether, ifnet
-attach	xennet at xenbus
-file	arch/xen/xen/if_xennet_xenbus.c	xennet needs-flag
-file	arch/xen/xen/xennet_checksum.c	xvif | xennet
-
-# Xen Block device driver and wd/sd/cd identities
-device	xbd: disk
-attach	xbd at xenbus
-file	arch/xen/xen/xbd_xenbus.c	xbd

Index: src/sys/arch/xen/x86/hypervisor_machdep.c
diff -u src/sys/arch/xen/x86/hypervisor_machdep.c:1.36.8.6 src/sys/arch/xen/x86/hypervisor_machdep.c:1.36.8.7
--- src/sys/arch/xen/x86/hypervisor_machdep.c:1.36.8.6	Sat Apr 25 11:23:57 2020
+++ src/sys/arch/xen/x86/hypervisor_machdep.c	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: hypervisor_machdep.c,v 1.36.8.6 2020/04/25 11:23:57 bouyer Exp $	*/
+/*	$NetBSD: hypervisor_machdep.c,v 1.36.8.7 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*
  *
@@ -54,7 +54,7 @@
 
 #include <sys/cdefs.h>
 
-__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.36.8.6 2020/04/25 11:23:57 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.36.8.7 2020/04/25 13:16:48 bouyer Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -477,7 +477,7 @@ x86_cpu_idle_xen(void)
 	KASSERT(ci->ci_ilevel == IPL_NONE);
 
 	x86_disable_intr();
-	if (!__predict_false(ci->ci_want_resched)) {
+	if (__predict_false(!ci->ci_want_resched)) {
 		idle_block();
 	} else {
 		x86_enable_intr();

Index: src/sys/arch/xen/x86/xen_intr.c
diff -u src/sys/arch/xen/x86/xen_intr.c:1.21.2.9 src/sys/arch/xen/x86/xen_intr.c:1.21.2.10
--- src/sys/arch/xen/x86/xen_intr.c:1.21.2.9	Mon Apr 20 20:19:07 2020
+++ src/sys/arch/xen/x86/xen_intr.c	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: xen_intr.c,v 1.21.2.9 2020/04/20 20:19:07 bouyer Exp $	*/
+/*	$NetBSD: xen_intr.c,v 1.21.2.10 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2001 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xen_intr.c,v 1.21.2.9 2020/04/20 20:19:07 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xen_intr.c,v 1.21.2.10 2020/04/25 13:16:48 bouyer Exp $");
 
 #include "opt_multiprocessor.h"
 
@@ -68,6 +68,10 @@ __KERNEL_RCSID(0, "$NetBSD: xen_intr.c,v
 
 #if NPCI > 0
 #include <dev/pci/ppbreg.h>
+#ifdef __HAVE_PCI_MSI_MSIX
+#include <x86/pci/msipic.h>
+#include <x86/pci/pci_msi_machdep.h>
+#endif
 #endif
 
 #if defined(MULTIPROCESSOR)
@@ -481,6 +485,21 @@ xen_intr_create_intrid(int legacy_irq, s
 	return NULL; /* No pic found! */
 }
 
+static struct intrsource xen_dummy_intrsource;
+
+struct intrsource *
+xen_intr_allocate_io_intrsource(const char *intrid)
+{
+	/* Nothing to do, required by MSI code */
+	return &xen_dummy_intrsource;
+}
+
+void
+xen_intr_free_io_intrsource(const char *intrid)
+{
+	/* Nothing to do, required by MSI code */
+}
+
 #if defined(XENPV)
 __strong_alias(x86_read_psl, xen_read_psl);
 __strong_alias(x86_write_psl, xen_write_psl);
@@ -495,4 +514,6 @@ __strong_alias(intr_disestablish, xen_in
 __strong_alias(cpu_intr_redistribute, xen_cpu_intr_redistribute);
 __strong_alias(cpu_intr_count, xen_cpu_intr_count);
 __strong_alias(cpu_intr_init, xen_cpu_intr_init);
+__strong_alias(intr_allocate_io_intrsource, xen_intr_allocate_io_intrsource);
+__strong_alias(intr_free_io_intrsource, xen_intr_free_io_intrsource);
 #endif /* XENPV */

Index: src/sys/arch/xen/x86/xen_ipi.c
diff -u src/sys/arch/xen/x86/xen_ipi.c:1.35.6.6 src/sys/arch/xen/x86/xen_ipi.c:1.35.6.7
--- src/sys/arch/xen/x86/xen_ipi.c:1.35.6.6	Mon Apr 20 19:46:44 2020
+++ src/sys/arch/xen/x86/xen_ipi.c	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: xen_ipi.c,v 1.35.6.6 2020/04/20 19:46:44 bouyer Exp $	*/
+/*	$NetBSD: xen_ipi.c,v 1.35.6.7 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*-
  * Copyright (c) 2011, 2019 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
  * Based on: x86/ipi.c
  */
 
-__KERNEL_RCSID(0, "$NetBSD: xen_ipi.c,v 1.35.6.6 2020/04/20 19:46:44 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xen_ipi.c,v 1.35.6.7 2020/04/25 13:16:48 bouyer Exp $");
 
 #include "opt_ddb.h"
 
@@ -157,7 +157,7 @@ valid_ipimask(uint32_t ipimask)
 {
 	uint32_t masks = XEN_IPI_GENERIC | XEN_IPI_HVCB | XEN_IPI_XCALL |
 	    XEN_IPI_DDB | XEN_IPI_SYNCH_FPU |
-	    XEN_IPI_HALT | XEN_IPI_KICK | XEN_IPI_AST | XEN_IPI_KPREEMPT;
+	    XEN_IPI_HALT | XEN_IPI_AST | XEN_IPI_KPREEMPT;
 
 	if (ipimask & ~masks) {
 		return false;
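The xen_intr.c change above satisfies the MD interrupt-source interface
consumed by the MSI code with do-nothing stubs, published under the generic
names via __strong_alias(). A compiler-level sketch of that pattern follows;
the names are illustrative, and my_strong_alias() only approximates the
sys/cdefs.h macro:

	/* Bind a second global name to a symbol defined in this unit. */
	#define my_strong_alias(alias, sym) \
		extern __typeof__(sym) alias __attribute__((__alias__(#sym)))

	struct intrsource_stub { int unused; };	/* stand-in type */

	static struct intrsource_stub dummy_source;

	/* The consumer expects these to exist; nothing to track here. */
	struct intrsource_stub *
	backend_allocate_io_intrsource(const char *intrid)
	{
		(void)intrid;
		return &dummy_source;
	}

	void
	backend_free_io_intrsource(const char *intrid)
	{
		(void)intrid;
	}

	/* Publish the stubs under the generic interface names. */
	my_strong_alias(intr_allocate_io_intrsource,
	    backend_allocate_io_intrsource);
	my_strong_alias(intr_free_io_intrsource, backend_free_io_intrsource);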
Index: src/sys/arch/xen/xen/xbdback_xenbus.c
diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.77.2.3 src/sys/arch/xen/xen/xbdback_xenbus.c:1.77.2.4
--- src/sys/arch/xen/xen/xbdback_xenbus.c:1.77.2.3	Mon Apr 20 19:40:51 2020
+++ src/sys/arch/xen/xen/xbdback_xenbus.c	Sat Apr 25 13:16:48 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: xbdback_xenbus.c,v 1.77.2.3 2020/04/20 19:40:51 bouyer Exp $	*/
+/*	$NetBSD: xbdback_xenbus.c,v 1.77.2.4 2020/04/25 13:16:48 bouyer Exp $	*/
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -26,9 +26,8 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.77.2.3 2020/04/20 19:40:51 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.77.2.4 2020/04/25 13:16:48 bouyer Exp $");
 
-#include <sys/atomic.h>
 #include <sys/buf.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
@@ -73,8 +72,10 @@ __KERNEL_RCSID(0, "$NetBSD: xbdback_xenb
 
 /* Need to alloc one extra page to account for possible mapping offset */
 #define VBD_VA_SIZE	(MAXPHYS + PAGE_SIZE)
+#define VBD_MAX_INDIRECT_SEGMENTS	VBD_VA_SIZE >> PAGE_SHIFT
+
+CTASSERT(XENSHM_MAX_PAGES_PER_REQUEST >= VBD_MAX_INDIRECT_SEGMENTS);
 
-struct xbdback_io;
 struct xbdback_instance;
 
 /*
@@ -140,6 +141,41 @@ struct xbdback_va {
 	vaddr_t xv_vaddr;
 };
 
+/*
+ * For each I/O operation associated with one of those requests, an
+ * xbdback_io is allocated from a pool.  It may correspond to multiple
+ * Xen disk requests, or parts of them, if several arrive at once that
+ * can be coalesced.
+ */
+struct xbdback_io {
+	SLIST_ENTRY(xbdback_io) xio_next;
+	/* The instance pointer is duplicated for convenience. */
+	struct xbdback_instance *xio_xbdi; /* our xbd instance */
+	uint8_t xio_operation;
+	uint64_t xio_id;
+	union {
+		struct {
+			struct buf xio_buf; /* our I/O */
+			/* the virtual address to map the request at */
+			vaddr_t xio_vaddr;
+			struct xbdback_va *xio_xv;
+			vaddr_t xio_start_offset;	/* I/O start offset */
+			/* grants to map */
+			grant_ref_t xio_gref[VBD_MAX_INDIRECT_SEGMENTS];
+			/* grants release */
+			grant_handle_t xio_gh[VBD_MAX_INDIRECT_SEGMENTS];
+			uint16_t xio_nrma; /* number of guest pages */
+		} xio_rw;
+	} u;
+};
+#define xio_buf		u.xio_rw.xio_buf
+#define xio_vaddr	u.xio_rw.xio_vaddr
+#define xio_start_offset	u.xio_rw.xio_start_offset
+#define xio_xv		u.xio_rw.xio_xv
+#define xio_gref	u.xio_rw.xio_gref
+#define xio_gh		u.xio_rw.xio_gh
+#define xio_nrma	u.xio_rw.xio_nrma
+
 /* we keep the xbdback instances in a linked list */
 struct xbdback_instance {
 	SLIST_ENTRY(xbdback_instance) next;
@@ -152,7 +188,9 @@ struct xbdback_instance {
 	kmutex_t xbdi_lock;
 	kcondvar_t xbdi_cv; /* wait channel for thread work */
 	xbdback_state_t xbdi_status; /* thread's status */
-	/* KVA for mapping transfers */
+	/* context and KVA for mapping transfers */
+	struct xbdback_io xbdi_io[BLKIF_RING_SIZE];
+	SLIST_HEAD(, xbdback_io) xbdi_io_free;
 	struct xbdback_va xbdi_va[BLKIF_RING_SIZE];
 	SLIST_HEAD(, xbdback_va) xbdi_va_free;
 	/* backing device parameters */
@@ -179,75 +217,28 @@ struct xbdback_instance {
 	RING_IDX xbdi_req_prod; /* limit on request indices */
 	xbdback_cont_t xbdi_cont;
 	/* _request state: track requests fetched from ring */
-	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
 	blkif_request_t xbdi_xen_req;
-	/* _io state: I/O associated to this instance */
-	struct xbdback_io *xbdi_io;
+	struct blkif_request_segment xbdi_seg[VBD_MAX_INDIRECT_SEGMENTS];
+	bus_dmamap_t xbdi_seg_dmamap;
+	grant_ref_t xbdi_in_gntref;
 	/* other state */
-	int xbdi_same_page; /* are we merging two segments on the same page? */
 	uint xbdi_pendingreqs; /* number of I/O in fly */
 	struct timeval xbdi_lasterr_time;    /* error time tracking */
-#ifdef DEBUG
-	struct timeval xbdi_lastfragio_time; /* fragmented I/O tracking */
-#endif
 };
 /* Manipulation of the above reference count. */
-#define xbdi_get(xbdip) atomic_inc_uint(&(xbdip)->xbdi_refcnt)
-#define xbdi_put(xbdip)                                      \
-do {                                                         \
-	if (atomic_dec_uint_nv(&(xbdip)->xbdi_refcnt) == 0)  \
-               xbdback_finish_disconnect(xbdip);             \
+#define xbdi_get(xbdip) (xbdip)->xbdi_refcnt++
+#define xbdi_put(xbdip)                                      \
+do {                                                         \
+	if (--((xbdip)->xbdi_refcnt) == 0)                   \
+               xbdback_finish_disconnect(xbdip);             \
 } while (/* CONSTCOND */ 0)
 
 static SLIST_HEAD(, xbdback_instance) xbdback_instances;
 static kmutex_t xbdback_lock;
 
-/*
- * For each I/O operation associated with one of those requests, an
- * xbdback_io is allocated from a pool.  It may correspond to multiple
- * Xen disk requests, or parts of them, if several arrive at once that
- * can be coalesced.
- */
-struct xbdback_io {
-	/* The instance pointer is duplicated for convenience. */
-	struct xbdback_instance *xio_xbdi; /* our xbd instance */
-	uint8_t xio_operation;
-	uint64_t xio_id;
-	union {
-		struct {
-			struct buf xio_buf; /* our I/O */
-			/* the virtual address to map the request at */
-			vaddr_t xio_vaddr;
-			struct xbdback_va *xio_xv;
-			vaddr_t xio_start_offset; /* I/O start offset */
-			/* grants to map */
-			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
-			/* grants release */
-			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
-			uint16_t xio_nrma; /* number of guest pages */
-		} xio_rw;
-	} u;
-};
-#define xio_buf		u.xio_rw.xio_buf
-#define xio_vaddr	u.xio_rw.xio_vaddr
-#define xio_start_offset	u.xio_rw.xio_start_offset
-#define xio_xv		u.xio_rw.xio_xv
-#define xio_gref	u.xio_rw.xio_gref
-#define xio_gh		u.xio_rw.xio_gh
-#define xio_nrma	u.xio_rw.xio_nrma
-
-/*
- * Pools to manage the chain of block requests and I/Os fragments
- * submitted by frontend.
- */
-static struct pool_cache xbdback_io_pool;
-
 /* Interval between reports of I/O errors from frontend */
 static const struct timeval xbdback_err_intvl = { 1, 0 };
 
-#ifdef DEBUG
-static const struct timeval xbdback_fragio_intvl = { 60, 0 };
-#endif
 void xbdbackattach(int);
 static int  xbdback_xenbus_create(struct xenbus_device *);
 static int  xbdback_xenbus_destroy(void *);
@@ -277,14 +268,15 @@ static void *xbdback_co_do_io(struct xbd
 
 static void xbdback_io_error(struct xbdback_io *, int);
 static void xbdback_iodone(struct buf *);
+static void xbdback_iodone_locked(struct xbdback_instance *,
+	struct xbdback_io *, struct buf *);
 static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int);
 
 static void *xbdback_map_shm(struct xbdback_io *);
 static void xbdback_unmap_shm(struct xbdback_io *);
 
-static void *xbdback_pool_get(struct pool_cache *,
-			      struct xbdback_instance *);
-static void xbdback_pool_put(struct pool_cache *, void *);
+static struct xbdback_io *xbdback_io_get(struct xbdback_instance *);
+static void xbdback_io_put(struct xbdback_instance *, struct xbdback_io *);
 static void xbdback_thread(void *);
 static void xbdback_wakeup_thread(struct xbdback_instance *);
 static void xbdback_trampoline(struct xbdback_instance *, void *);
@@ -306,13 +298,6 @@ xbdbackattach(int n)
 	SLIST_INIT(&xbdback_instances);
 	mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE);
 
-	pool_cache_bootstrap(&xbdback_io_pool,
-	    sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL,
-	    IPL_SOFTBIO, NULL, NULL, NULL);
-
-	/* we allocate enough to handle a whole ring at once */
-	pool_prime(&xbdback_io_pool.pc_pool, BLKIF_RING_SIZE);
-
 	xenbus_backend_register(&xbd_backend_driver);
 }
 
@@ -351,10 +336,7 @@ xbdback_xenbus_create(struct xenbus_devi
 		    xbusd->xbusd_path);
 		return EFTYPE;
 	}
-
-	if (xbdif_lookup(domid, handle)) {
-		return EEXIST;
-	}
+
 	xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);
 
 	xbdi->xbdi_domid = domid;
@@ -362,21 +344,43 @@ xbdback_xenbus_create(struct xenbus_devi
 	snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
 	    xbdi->xbdi_domid, xbdi->xbdi_handle);
 
+	mutex_enter(&xbdback_lock);
+	if (xbdif_lookup(domid, handle)) {
+		mutex_exit(&xbdback_lock);
+		kmem_free(xbdi, sizeof(*xbdi));
+		return EEXIST;
+	}
+	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
+	mutex_exit(&xbdback_lock);
+
 	/* initialize status and reference counter */
 	xbdi->xbdi_status = DISCONNECTED;
 	xbdi_get(xbdi);
 
 	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
 	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
 
-	mutex_enter(&xbdback_lock);
-	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
-	mutex_exit(&xbdback_lock);
-
 	xbusd->xbusd_u.b.b_cookie = xbdi;
 	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
 	xbdi->xbdi_xbusd = xbusd;
 
+	if (bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat, PAGE_SIZE,
+	    1, PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
+	    &xbdi->xbdi_seg_dmamap) != 0) {
+		printf("%s: can't create dma map for indirect segments\n",
+		    xbdi->xbdi_name);
+		goto fail;
+	}
+	if (bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
+	    xbdi->xbdi_seg_dmamap, xbdi->xbdi_seg,
+	    sizeof(xbdi->xbdi_seg), NULL, BUS_DMA_WAITOK) != 0) {
+		printf("%s: can't load dma map for indirect segments\n",
+		    xbdi->xbdi_name);
+		goto fail;
+	}
+	KASSERT(xbdi->xbdi_seg_dmamap->dm_nsegs == 1);
+
 	SLIST_INIT(&xbdi->xbdi_va_free);
 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
 		xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
@@ -385,6 +389,12 @@ xbdback_xenbus_create(struct xenbus_devi
 		    xv_next);
 	}
 
+	SLIST_INIT(&xbdi->xbdi_io_free);
+	for (i = 0; i < BLKIF_RING_SIZE; i++) {
+		SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, &xbdi->xbdi_io[i],
+		    xio_next);
+	}
+
 	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path,
 	    "physical-device", &xbdi->xbdi_watch, xbdback_backend_changed);
 	if (error) {
@@ -460,6 +470,9 @@ xbdback_xenbus_destroy(void *arg)
 		}
 	}
 
+	bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
+	bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
+
 	mutex_destroy(&xbdi->xbdi_lock);
 	cv_destroy(&xbdi->xbdi_cv);
 	kmem_free(xbdi, sizeof(*xbdi));
@@ -807,6 +820,13 @@ again:
 		    xbusd->xbusd_path, err);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, xbusd->xbusd_path,
+	    "feature-max-indirect-segments", "%u", VBD_MAX_INDIRECT_SEGMENTS);
+	if (err) {
+		printf("xbdback: failed to write %s/feature-indirect: %d\n",
+		    xbusd->xbusd_path, err);
+		goto abort;
+	}
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == EAGAIN)
 		goto again;
@@ -835,7 +855,7 @@ xbdback_finish_disconnect(struct xbdback
 
 	xbdi->xbdi_status = DISCONNECTED;
 
-	cv_signal(&xbdi->xbdi_cv);
+	cv_broadcast(&xbdi->xbdi_cv);
 }
 
 static bool
@@ -844,14 +864,14 @@ xbdif_lookup(domid_t dom , uint32_t hand
 	struct xbdback_instance *xbdi;
 	bool found = false;
 
-	mutex_enter(&xbdback_lock);
+	KASSERT(mutex_owned(&xbdback_lock));
+
 	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
 		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) {
 			found = true;
 			break;
 		}
 	}
-	mutex_exit(&xbdback_lock);
 
 	return found;
 }
@@ -864,7 +884,9 @@ xbdback_evthandler(void *arg)
 	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
 	    xbdi->xbdi_domid, xbdi->xbdi_cont));
 
+	mutex_enter(&xbdi->xbdi_lock);
 	xbdback_wakeup_thread(xbdi);
+	mutex_exit(&xbdi->xbdi_lock);
 
 	return 1;
 }
@@ -878,16 +900,14 @@ xbdback_thread(void *arg)
 {
 	struct xbdback_instance *xbdi = arg;
 
+	mutex_enter(&xbdi->xbdi_lock);
 	for (;;) {
-		mutex_enter(&xbdi->xbdi_lock);
 		switch (xbdi->xbdi_status) {
 		case WAITING:
 			cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
-			mutex_exit(&xbdi->xbdi_lock);
 			break;
 		case RUN:
 			xbdi->xbdi_status = WAITING; /* reset state */
-			mutex_exit(&xbdi->xbdi_lock);
 
 			if (xbdi->xbdi_cont == NULL) {
 				xbdi->xbdi_cont = xbdback_co_main;
@@ -899,22 +919,24 @@ xbdback_thread(void *arg)
 			if (xbdi->xbdi_pendingreqs > 0) {
 				/* there are pending I/Os. Wait for them. */
 				cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
-				mutex_exit(&xbdi->xbdi_lock);
-				break;
+				continue;
 			}
 			
 			/* All I/Os should have been processed by now,
 			 * xbdi_refcnt should drop to 0 */
 			xbdi_put(xbdi);
 			KASSERT(xbdi->xbdi_refcnt == 0);
-			mutex_exit(&xbdi->xbdi_lock);
-			kthread_exit(0);
-			break;
+			goto out;
+			/* NOTREACHED */
 		default:
 			panic("%s: invalid state %d",
 			    xbdi->xbdi_name, xbdi->xbdi_status);
 		}
 	}
+out:
+	mutex_exit(&xbdi->xbdi_lock);
+
+	kthread_exit(0);
 }
 
 static void *
@@ -939,39 +961,35 @@ xbdback_co_main(struct xbdback_instance
  * the ring.
  */
 static void *
-xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
+xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj __unused)
 {
-	blkif_request_t *req;
+	blkif_request_t *req, *reqn;
 	blkif_x86_32_request_t *req32;
 	blkif_x86_64_request_t *req64;
+	blkif_request_indirect_t *rin;
 
-	(void)obj;
-	req = &xbdi->xbdi_xen_req;
 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
+		req = &xbdi->xbdi_xen_req;
+		memset(req, 0, sizeof(*req));
+
 		switch(xbdi->xbdi_proto) {
 		case XBDIP_NATIVE:
-			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
-			    xbdi->xbdi_ring.ring_n.req_cons),
-			    sizeof(blkif_request_t));
+			reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+			    xbdi->xbdi_ring.ring_n.req_cons);
+			req->operation = reqn->operation;
+			req->id = reqn->id;
 			break;
 		case XBDIP_32:
 			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
 			    xbdi->xbdi_ring.ring_n.req_cons);
 			req->operation = req32->operation;
-			req->nr_segments = req32->nr_segments;
-			req->handle = req32->handle;
 			req->id = req32->id;
-			req->sector_number = req32->sector_number;
 			break;
-
 		case XBDIP_64:
 			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
 			    xbdi->xbdi_ring.ring_n.req_cons);
 			req->operation = req64->operation;
-			req->nr_segments = req64->nr_segments;
-			req->handle = req64->handle;
 			req->id = req64->id;
-			req->sector_number = req64->sector_number;
 			break;
 		}
 		__insn_barrier();
@@ -981,7 +999,23 @@ xbdback_co_main_loop(struct xbdback_inst
 		    xbdi->xbdi_req_prod,
 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
 		    req->id));
-		switch(req->operation) {
+		switch (req->operation) {
+		case BLKIF_OP_INDIRECT:
+			/* just check indirect_op, rest is handled later */
+			rin = (blkif_request_indirect_t *)
+			    RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+				xbdi->xbdi_ring.ring_n.req_cons);
+			if (rin->indirect_op != BLKIF_OP_READ &&
+			    rin->indirect_op != BLKIF_OP_WRITE) {
+				if (ratecheck(&xbdi->xbdi_lasterr_time,
+				    &xbdback_err_intvl)) {
+					printf("%s: unknown ind operation %d\n",
+					    xbdi->xbdi_name,
+					    rin->indirect_op);
+				}
+				goto fail;
+			}
+			/* FALLTHROUGH */
 		case BLKIF_OP_READ:
 		case BLKIF_OP_WRITE:
 			xbdi->xbdi_cont = xbdback_co_io;
@@ -996,13 +1030,13 @@ xbdback_co_main_loop(struct xbdback_inst
 				printf("%s: unknown operation %d\n",
 				    xbdi->xbdi_name, req->operation);
 			}
+fail:
 			xbdback_send_reply(xbdi, req->id, req->operation,
 			    BLKIF_RSP_ERROR);
 			xbdi->xbdi_cont = xbdback_co_main_incr;
 			break;
 		}
 	} else {
-		KASSERT(xbdi->xbdi_io == NULL);
 		xbdi->xbdi_cont = xbdback_co_main_done2;
 	}
 	return xbdi;
@@ -1013,30 +1047,19 @@ xbdback_co_main_loop(struct xbdback_inst
 * we want to disconnect, leave continuation now.
 */
 static void *
-xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
+xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj __unused)
 {
-	(void)obj;
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+
 	blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;
 
 	ring->req_cons++;
 
-	/*
-	 * Do not bother with locking here when checking for xbdi_status: if
-	 * we get a transient state, we will get the right value at
-	 * the next increment.
-	 */
 	if (xbdi->xbdi_status == DISCONNECTING)
 		xbdi->xbdi_cont = NULL;
 	else
 		xbdi->xbdi_cont = xbdback_co_main_loop;
 
-	/*
-	 * Each time the thread processes a full ring of requests, give
-	 * a chance to other threads to process I/Os too
-	 */
-	if ((ring->req_cons % BLKIF_RING_SIZE) == 0)
-		yield();
-
 	return xbdi;
 }
 
@@ -1074,7 +1097,7 @@ xbdback_co_cache_flush(struct xbdback_in
 		return NULL;
 	}
 	xbdi->xbdi_cont = xbdback_co_cache_doflush;
-	return xbdback_pool_get(&xbdback_io_pool, xbdi);
+	return xbdback_io_get(xbdi);
 }
 
 /* Start the flush work */
@@ -1084,12 +1107,12 @@ xbdback_co_cache_doflush(struct xbdback_
 	struct xbdback_io *xbd_io;
 
 	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
-	xbd_io = xbdi->xbdi_io = obj;
+	xbd_io = obj;
 	xbd_io->xio_xbdi = xbdi;
 	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
 	xbd_io->xio_id = xbdi->xbdi_xen_req.id;
 	xbdi->xbdi_cont = xbdback_co_do_io;
-	return xbdi;
+	return xbd_io;
 }
 
 /*
@@ -1097,31 +1120,22 @@ xbdback_co_cache_doflush(struct xbdback_
 * then get the segment information directly from the ring request.
 */
 static void *
-xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
+xbdback_co_io(struct xbdback_instance *xbdi, void *obj __unused)
 {
 	int i, error;
-	blkif_request_t *req;
+	blkif_request_t *req, *reqn;
 	blkif_x86_32_request_t *req32;
 	blkif_x86_64_request_t *req64;
+	blkif_request_indirect_t *rinn;
+	blkif_x86_32_request_indirect_t *rin32;
+	blkif_x86_64_request_indirect_t *rin64;
 
-	(void)obj;
-
-	/* some sanity checks */
 	req = &xbdi->xbdi_xen_req;
-	if (req->nr_segments < 1 ||
-	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
-		if (ratecheck(&xbdi->xbdi_lasterr_time,
-		    &xbdback_err_intvl)) {
-			printf("%s: invalid number of segments: %d\n",
-			    xbdi->xbdi_name,
-			    xbdi->xbdi_xen_req.nr_segments);
-		}
-		error = EINVAL;
-		goto end;
-	}
 
+	/* some sanity checks */
 	KASSERT(req->operation == BLKIF_OP_READ ||
-	    req->operation == BLKIF_OP_WRITE);
+	    req->operation == BLKIF_OP_WRITE ||
+	    req->operation == BLKIF_OP_INDIRECT);
 	if (req->operation == BLKIF_OP_WRITE) {
 		if (xbdi->xbdi_ro) {
 			error = EROFS;
@@ -1130,27 +1144,88 @@ xbdback_co_io(struct xbdback_instance *x
 	}
 
 	/* copy request segments */
-	switch(xbdi->xbdi_proto) {
+	switch (xbdi->xbdi_proto) {
 	case XBDIP_NATIVE:
-		/* already copied in xbdback_co_main_loop */
+		reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+		    xbdi->xbdi_ring.ring_n.req_cons);
+		req->handle = reqn->handle;
+		req->sector_number = reqn->sector_number;
+		if (reqn->operation == BLKIF_OP_INDIRECT) {
+			rinn = (blkif_request_indirect_t *)reqn;
+			req->operation = rinn->indirect_op;
+			req->nr_segments = (uint8_t)rinn->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rinn->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = reqn->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = reqn->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	case XBDIP_32:
 		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
 		    xbdi->xbdi_ring.ring_n.req_cons);
-		for (i = 0; i < req->nr_segments; i++)
-			req->seg[i] = req32->seg[i];
+		req->handle = req32->handle;
+		req->sector_number = req32->sector_number;
+		if (req32->operation == BLKIF_OP_INDIRECT) {
+			rin32 = (blkif_x86_32_request_indirect_t *)req32;
+			req->operation = rin32->indirect_op;
+			req->nr_segments = (uint8_t)rin32->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rin32->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = req32->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = req32->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	case XBDIP_64:
 		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
 		    xbdi->xbdi_ring.ring_n.req_cons);
-		for (i = 0; i < req->nr_segments; i++)
-			req->seg[i] = req64->seg[i];
+		req->handle = req64->handle;
+		req->sector_number = req64->sector_number;
+		if (req64->operation == BLKIF_OP_INDIRECT) {
+			rin64 = (blkif_x86_64_request_indirect_t *)req64;
+			req->nr_segments = (uint8_t)rin64->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rin64->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = req64->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = req64->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	}
 
-	KASSERT(xbdi->xbdi_io == NULL);
+	/* Max value checked already earlier */
+	if (req->nr_segments < 1)
+		goto bad_nr_segments;
+
 	xbdi->xbdi_cont = xbdback_co_io_gotio;
-	return xbdback_pool_get(&xbdback_io_pool, xbdi);
+	return xbdback_io_get(xbdi);
+
+ bad_nr_segments:
+	if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
+		printf("%s: invalid number of segments: %d\n",
+		    xbdi->xbdi_name, req->nr_segments);
+	}
+	error = EINVAL;
+	/* FALLTHROUGH */
 
  end:
 	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
@@ -1169,31 +1244,63 @@ xbdback_co_io_gotio(struct xbdback_insta
 	size_t bcount;
 	blkif_request_t *req;
 
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+
 	xbdi_get(xbdi);
-	atomic_inc_uint(&xbdi->xbdi_pendingreqs);
+	xbdi->xbdi_pendingreqs++;
 	
 	req = &xbdi->xbdi_xen_req;
-	xbd_io = xbdi->xbdi_io = obj;
+	xbd_io = obj;
 	memset(xbd_io, 0, sizeof(*xbd_io));
 	buf_init(&xbd_io->xio_buf);
 	xbd_io->xio_xbdi = xbdi;
 	xbd_io->xio_operation = req->operation;
 	xbd_io->xio_id = req->id;
 
+	/* If segments are on an indirect page, copy them now */
+	if (xbdi->xbdi_in_gntref) {
+		gnttab_copy_t gop;
+		paddr_t ma;
+
+		gop.flags = GNTCOPY_source_gref;
+		gop.len = req->nr_segments *
+		    sizeof(struct blkif_request_segment);
+
+		gop.source.u.ref = xbdi->xbdi_in_gntref;
+		gop.source.offset = 0;
+		gop.source.domid = xbdi->xbdi_domid;
+
+		ma = xbdi->xbdi_seg_dmamap->dm_segs[0].ds_addr;
+		gop.dest.offset = ma & PAGE_MASK;
+		gop.dest.domid = DOMID_SELF;
+		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
+
+		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
+			printf("%s: GNTTABOP_copy failed\n", xbdi->xbdi_name);
+			xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
+			    xbdi->xbdi_xen_req.operation,
+			    BLKIF_RSP_ERROR);
+			xbdi->xbdi_cont = xbdback_co_main_incr;
+			return NULL;
+		}
+	}
+
+	/* Process segments */
 	bcount = 0;
 	for (int i = 0; i < req->nr_segments; i++) {
-		xbd_io->xio_gref[i] = req->seg[i].gref;
-		bcount += (req->seg[i].last_sect - req->seg[i].first_sect + 1)
+		struct blkif_request_segment *seg = &xbdi->xbdi_seg[i];
+		xbd_io->xio_gref[i] = seg->gref;
+		bcount += (seg->last_sect - seg->first_sect + 1)
 			* VBD_BSIZE;
 	}
-	KASSERT(bcount <= MAXPHYS);
 	xbd_io->xio_nrma = req->nr_segments;
+	xbd_io->xio_start_offset = xbdi->xbdi_seg[0].first_sect * VBD_BSIZE;
 
-	xbd_io->xio_start_offset = req->seg[0].first_sect * VBD_BSIZE;
+	KASSERT(bcount <= MAXPHYS);
 	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
 	KASSERT(bcount + xbd_io->xio_start_offset < VBD_VA_SIZE);
 
+	/* Fill-in the buf */
 	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
 		buf_flags = B_WRITE;
 	} else {
@@ -1214,14 +1321,18 @@ xbdback_co_io_gotio(struct xbdback_insta
 	xbd_io->xio_buf.b_private = xbd_io;
 
 	xbdi->xbdi_cont = xbdback_co_do_io;
-	return xbdback_map_shm(xbdi->xbdi_io);
+	return xbdback_map_shm(xbd_io);
 }
 
 static void
 xbdback_io_error(struct xbdback_io *xbd_io, int error)
 {
-	xbd_io->xio_buf.b_error = error;
-	xbdback_iodone(&xbd_io->xio_buf);
+	KASSERT(mutex_owned(&xbd_io->xio_xbdi->xbdi_lock));
+
+	struct buf *bp = &xbd_io->xio_buf;
+
+	bp->b_error = error;
+	xbdback_iodone_locked(xbd_io->xio_xbdi, xbd_io, bp);
 }
 
 /*
@@ -1231,7 +1342,7 @@ xbdback_io_error(struct xbdback_io *xbd_
 static void *
 xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
 {
-	struct xbdback_io *xbd_io = xbdi->xbdi_io;
+	struct xbdback_io *xbd_io = obj;
 
 	switch (xbd_io->xio_operation) {
 	case BLKIF_OP_FLUSH_DISKCACHE:
@@ -1239,8 +1350,11 @@ xbdback_co_do_io(struct xbdback_instance
 		int error;
 		int force = 1;
 
+		KASSERT(mutex_owned(&xbdi->xbdi_lock));
+		mutex_exit(&xbdi->xbdi_lock);
 		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
 		    kauth_cred_get());
+		mutex_enter(&xbdi->xbdi_lock);
 		if (error) {
 			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
 			    xbdi->xbdi_xbusd->xbusd_path, error);
@@ -1252,9 +1366,8 @@ xbdback_co_do_io(struct xbdback_instance
 			error = BLKIF_RSP_OKAY;
 		xbdback_send_reply(xbdi, xbd_io->xio_id,
 		    xbd_io->xio_operation, error);
-		xbdback_pool_put(&xbdback_io_pool, xbd_io);
+		xbdback_io_put(xbdi, xbd_io);
 		xbdi_put(xbdi);
-		xbdi->xbdi_io = NULL;
 		xbdi->xbdi_cont = xbdback_co_main_incr;
 		return xbdi;
 	}
@@ -1270,7 +1383,6 @@ xbdback_co_do_io(struct xbdback_instance
 		}
 		/* will call xbdback_iodone() asynchronously when done */
 		bdev_strategy(&xbd_io->xio_buf);
-		xbdi->xbdi_io = NULL;
 		xbdi->xbdi_cont = xbdback_co_main_incr;
 		return xbdi;
 	default:
@@ -1283,24 +1395,36 @@ xbdback_co_do_io(struct xbdback_instance
 /*
 * Called from softint(9) context when an I/O is done: for each request, send
 * back the associated reply to the domain.
- *
- * This gets reused by xbdback_io_error to report errors from other sources.
 */
 static void
 xbdback_iodone(struct buf *bp)
 {
 	struct xbdback_io *xbd_io;
 	struct xbdback_instance *xbdi;
-	int status;
-
-	KERNEL_LOCK(1, NULL);		/* XXXSMP */
 
 	xbd_io = bp->b_private;
+	KASSERT(bp == &xbd_io->xio_buf);
 	xbdi = xbd_io->xio_xbdi;
 
+	mutex_enter(&xbdi->xbdi_lock);
+	xbdback_iodone_locked(xbdi, xbd_io, bp);
+	mutex_exit(&xbdi->xbdi_lock);
+}
+
+/*
+ * This gets reused by xbdback_io_error to report errors from other sources.
+ */
+static void
+xbdback_iodone_locked(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io,
+    struct buf *bp)
+{
+	int status;
+
 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
 		   xbdi->xbdi_domid, (long)xbd_io));
 
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+
+	KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
 	if (xbd_io->xio_xv != NULL)
 		xbdback_unmap_shm(xbd_io);
 
@@ -1315,12 +1439,12 @@ xbdback_iodone(struct buf *bp)
 	xbdback_send_reply(xbdi, xbd_io->xio_id, xbd_io->xio_operation,
 	    status);
 	xbdi_put(xbdi);
-	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
+	KASSERT(xbdi->xbdi_pendingreqs > 0);
+	xbdi->xbdi_pendingreqs--;
 	buf_destroy(&xbd_io->xio_buf);
-	xbdback_pool_put(&xbdback_io_pool, xbd_io);
+	xbdback_io_put(xbdi, xbd_io);
 
 	xbdback_wakeup_thread(xbdi);
-	KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
 }
 
 /*
@@ -1329,13 +1453,12 @@ xbdback_iodone(struct buf *bp)
 static void
 xbdback_wakeup_thread(struct xbdback_instance *xbdi)
 {
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
 
-	mutex_enter(&xbdi->xbdi_lock);
 	/* only set RUN state when we are WAITING for work */
 	if (xbdi->xbdi_status == WAITING)
 	       xbdi->xbdi_status = RUN;
-	cv_broadcast(&xbdi->xbdi_cv);
-	mutex_exit(&xbdi->xbdi_lock);
+	cv_signal(&xbdi->xbdi_cv);
 }
 
 /*
@@ -1351,12 +1474,13 @@ xbdback_send_reply(struct xbdback_instan
 	blkif_x86_64_response_t *resp64;
 	int notify;
 
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+
 	/*
 	 * The ring can be accessed by the xbdback thread, xbdback_iodone()
 	 * handler, or any handler that triggered the shm callback. So
 	 * protect ring access via the xbdi_lock mutex.
 	 */
-	mutex_enter(&xbdi->xbdi_lock);
 	switch (xbdi->xbdi_proto) {
 	case XBDIP_NATIVE:
 		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
@@ -1382,7 +1506,6 @@ xbdback_send_reply(struct xbdback_instan
 	}
 	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
-	mutex_exit(&xbdi->xbdi_lock);
 
 	if (notify) {
 		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
@@ -1398,7 +1521,7 @@ static void *
 xbdback_map_shm(struct xbdback_io *xbd_io)
 {
 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
-	int error, s;
+	int error;
 
 #ifdef XENDEBUG_VBD
 	int i;
@@ -1408,12 +1531,12 @@ xbdback_map_shm(struct xbdback_io *xbd_i
 	}
 #endif
 
-	s = splvm(); /* XXXSMP */
+	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+
 	xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
 	KASSERT(xbd_io->xio_xv != NULL);
 	SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
 	xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr;
-	splx(s);
 
 	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
 	    xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,
@@ -1428,16 +1551,16 @@ xbdback_map_shm(struct xbdback_io *xbd_i
 		}
 		printf("\n");
 #endif
-		return xbdi;
+		return xbd_io;
 	default:
 		if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
 			printf("xbdback_map_shm: xen_shm error %d ", error);
 		}
-		xbdback_io_error(xbdi->xbdi_io, error);
+		/* this will also free xbd_io via xbdback_iodone() */
+		xbdback_io_error(xbd_io, error);
 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
 		xbd_io->xio_xv = NULL;
-		xbdi->xbdi_io = NULL;
-		// do not retry
+		/* do not retry */
 		xbdi->xbdi_cont = xbdback_co_main_incr;
 		return xbdi;
 	}
@@ -1467,18 +1590,21 @@ xbdback_unmap_shm(struct xbdback_io *xbd
 }
 
 /* Obtain memory from a pool */
-static void *
-xbdback_pool_get(struct pool_cache *pc,
-		 struct xbdback_instance *xbdi)
+static struct xbdback_io *
+xbdback_io_get(struct xbdback_instance *xbdi)
 {
-	return pool_cache_get(pc, PR_WAITOK);
+	struct xbdback_io *xbd_io = SLIST_FIRST(&xbdi->xbdi_io_free);
+	KASSERT(xbd_io != NULL);
+	SLIST_REMOVE_HEAD(&xbdi->xbdi_io_free, xio_next);
+	return xbd_io;
 }
 
 /* Restore memory to a pool */
 static void
-xbdback_pool_put(struct pool_cache *pc, void *item)
+xbdback_io_put(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io)
 {
-	pool_cache_put(pc, item);
+	KASSERT(xbd_io != NULL);
+	SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
 }
 
 /*
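The xbdback changes above drop the global xbdback_io pool_cache in favor of
BLKIF_RING_SIZE entries preallocated per instance and handed out through an
SLIST free list under xbdi_lock. A standalone userland sketch of that
allocation scheme, with illustrative types and names; because the ring bounds
the number of outstanding requests, the list can never underflow:

	#include <sys/queue.h>
	#include <assert.h>

	#define RING_SIZE 32		/* stand-in for BLKIF_RING_SIZE */

	struct io_slot {
		SLIST_ENTRY(io_slot) next;
		/* per-I/O state would live here */
	};

	struct instance {
		struct io_slot io[RING_SIZE];	/* backing storage */
		SLIST_HEAD(, io_slot) io_free;	/* free list */
	};

	static void
	instance_init(struct instance *xi)
	{
		SLIST_INIT(&xi->io_free);
		for (int i = 0; i < RING_SIZE; i++)
			SLIST_INSERT_HEAD(&xi->io_free, &xi->io[i], next);
	}

	static struct io_slot *
	io_get(struct instance *xi)
	{
		struct io_slot *s = SLIST_FIRST(&xi->io_free);

		/* The ring bounds requests, so the list is never empty. */
		assert(s != NULL);
		SLIST_REMOVE_HEAD(&xi->io_free, next);
		return s;
	}

	static void
	io_put(struct instance *xi, struct io_slot *s)
	{
		SLIST_INSERT_HEAD(&xi->io_free, s, next);
	}

Unlike pool_cache_get(PR_WAITOK), io_get() cannot sleep, which is what lets
the backend call it with the instance mutex held.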