[PATCH v2 2/4] block-backend: process I/O in the current AioContext
Switch blk_aio_*() APIs over to multi-queue by using qemu_get_current_aio_context() instead of blk_get_aio_context(). This change will allow devices to process I/O in multiple IOThreads in the future. I audited existing blk_aio_*() callers: - migration/block.c: blk_mig_lock() protects the data accessed by the completion callback. - The remaining emulated devices and exports run with qemu_get_aio_context() == blk_get_aio_context(). Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index a77295a198..4863be5691 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1530,7 +1530,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, acb->blk = blk; acb->ret = ret; -replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), +replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), error_callback_bh, acb); return &acb->common; } @@ -1584,11 +1584,11 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, acb->has_returned = false; co = qemu_coroutine_create(co_entry, acb); -aio_co_enter(blk_get_aio_context(blk), co); +aio_co_enter(qemu_get_current_aio_context(), co); acb->has_returned = true; if (acb->rwco.ret != NOT_DONE) { -replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), +replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), blk_aio_complete_bh, acb); } -- 2.41.0
[PATCH v2 1/4] block: remove AIOCBInfo->get_aio_context()
The synchronous bdrv_aio_cancel() function needs the acb's AioContext so it can call aio_poll() to wait for cancellation. It turns out that all users run under the BQL in the main AioContext, so this callback is not needed. Remove the callback, mark bdrv_aio_cancel() GLOBAL_STATE_CODE just like its blk_aio_cancel() caller, and poll the main loop AioContext. The purpose of this cleanup is to identify bdrv_aio_cancel() as an API that does not work with the multi-queue block layer. Signed-off-by: Stefan Hajnoczi --- include/block/aio.h| 1 - include/block/block-global-state.h | 2 ++ include/block/block-io.h | 1 - block/block-backend.c | 17 - block/io.c | 23 --- hw/nvme/ctrl.c | 7 --- softmmu/dma-helpers.c | 8 util/thread-pool.c | 8 8 files changed, 10 insertions(+), 57 deletions(-) diff --git a/include/block/aio.h b/include/block/aio.h index 32042e8905..bcc165c974 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -31,7 +31,6 @@ typedef void BlockCompletionFunc(void *opaque, int ret); typedef struct AIOCBInfo { void (*cancel_async)(BlockAIOCB *acb); -AioContext *(*get_aio_context)(BlockAIOCB *acb); size_t aiocb_size; } AIOCBInfo; diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index f347199bff..ac2a605ef5 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -185,6 +185,8 @@ void bdrv_drain_all_begin_nopoll(void); void bdrv_drain_all_end(void); void bdrv_drain_all(void); +void bdrv_aio_cancel(BlockAIOCB *acb); + int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); BlockDriverState *bdrv_find_node(const char *node_name); diff --git a/include/block/block-io.h b/include/block/block-io.h index 4415506e40..b078d17bf1 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -101,7 +101,6 @@ bdrv_co_delete_file_noerr(BlockDriverState *bs); /* async block I/O */ -void bdrv_aio_cancel(BlockAIOCB *acb); void 
bdrv_aio_cancel_async(BlockAIOCB *acb); /* sg packet commands */ diff --git a/block/block-backend.c b/block/block-backend.c index 4009ed5fed..a77295a198 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -33,8 +33,6 @@ #define NOT_DONE 0x7fff /* used while emulated sync operation in progress */ -static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb); - typedef struct BlockBackendAioNotifier { void (*attached_aio_context)(AioContext *new_context, void *opaque); void (*detach_aio_context)(void *opaque); @@ -103,7 +101,6 @@ typedef struct BlockBackendAIOCB { } BlockBackendAIOCB; static const AIOCBInfo block_backend_aiocb_info = { -.get_aio_context = blk_aiocb_get_aio_context, .aiocb_size = sizeof(BlockBackendAIOCB), }; @@ -1545,16 +1542,8 @@ typedef struct BlkAioEmAIOCB { bool has_returned; } BlkAioEmAIOCB; -static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_) -{ -BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common); - -return blk_get_aio_context(acb->rwco.blk); -} - static const AIOCBInfo blk_aio_em_aiocb_info = { .aiocb_size = sizeof(BlkAioEmAIOCB), -.get_aio_context= blk_aio_em_aiocb_get_aio_context, }; static void blk_aio_complete(BlkAioEmAIOCB *acb) @@ -2434,12 +2423,6 @@ AioContext *blk_get_aio_context(BlockBackend *blk) return blk->ctx; } -static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) -{ -BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb); -return blk_get_aio_context(blk_acb->blk); -} - int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, Error **errp) { diff --git a/block/io.c b/block/io.c index 055fcf7438..16245dc93a 100644 --- a/block/io.c +++ b/block/io.c @@ -2944,25 +2944,18 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, /**/ /* async I/Os */ +/** + * Synchronously cancels an acb. Must be called with the BQL held and the acb + * must be processed with the BQL held too (IOThreads are not allowed). 
+ * + * Use bdrv_aio_cancel_async() instead when possible. + */ void bdrv_aio_cancel(BlockAIOCB *acb) { -IO_CODE(); +GLOBAL_STATE_CODE(); qemu_aio_ref(acb); bdrv_aio_cancel_async(acb); -while (acb->refcnt > 1) { -if (acb->aiocb_info->get_aio_context) { -aio_poll(acb->aiocb_info->get_aio_context(acb), true); -} else if (acb->bs) { -/* qemu_aio_ref and qemu_aio_unref are not thread-safe, so - * assert that we're not using an I/O thread. Thread-safe - * code should use bdrv_aio_cancel_async exclusively. - */ -assert(bdrv_get_aio_context(acb->bs) ==
[PATCH v2 0/4] block-backend: process I/O in the current AioContext
v2 - Add patch to remove AIOCBInfo->get_aio_context() [Kevin] - Add patch to use qemu_get_current_aio_context() in block-coroutine-wrapper so that the wrappers use the current AioContext instead of bdrv_get_aio_context(). Switch blk_aio_*() APIs over to multi-queue by using qemu_get_current_aio_context() instead of blk_get_aio_context(). This change will allow devices to process I/O in multiple IOThreads in the future. The final patch requires my QIOChannel AioContext series to pass tests/qemu-iotests/check -qcow2 281 because the nbd block driver is now accessed from the main loop thread in addition to the IOThread: https://lore.kernel.org/qemu-devel/20230823234504.1387239-1-stefa...@redhat.com/T/#t Based-on: 20230823234504.1387239-1-stefa...@redhat.com Stefan Hajnoczi (4): block: remove AIOCBInfo->get_aio_context() block-backend: process I/O in the current AioContext block-backend: process zoned requests in the current AioContext block-coroutine-wrapper: use qemu_get_current_aio_context() include/block/aio.h| 1 - include/block/block-global-state.h | 2 ++ include/block/block-io.h | 1 - block/block-backend.c | 35 -- block/io.c | 23 +++- hw/nvme/ctrl.c | 7 -- softmmu/dma-helpers.c | 8 --- util/thread-pool.c | 8 --- scripts/block-coroutine-wrapper.py | 6 ++--- 9 files changed, 21 insertions(+), 70 deletions(-) -- 2.41.0
[PATCH v2 3/4] block-backend: process zoned requests in the current AioContext
Process zoned requests in the current thread's AioContext instead of in the BlockBackend's AioContext. There is no need to use the BlockBackend's AioContext thanks to CoMutex bs->wps->colock, which protects zone metadata. Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 4863be5691..427ebcc0e4 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1890,11 +1890,11 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, acb->has_returned = false; co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); -aio_co_enter(blk_get_aio_context(blk), co); +aio_co_enter(qemu_get_current_aio_context(), co); acb->has_returned = true; if (acb->rwco.ret != NOT_DONE) { -replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), +replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), blk_aio_complete_bh, acb); } @@ -1931,11 +1931,11 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, acb->has_returned = false; co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); -aio_co_enter(blk_get_aio_context(blk), co); +aio_co_enter(qemu_get_current_aio_context(), co); acb->has_returned = true; if (acb->rwco.ret != NOT_DONE) { -replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), +replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), blk_aio_complete_bh, acb); } @@ -1971,10 +1971,10 @@ BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, acb->has_returned = false; co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); -aio_co_enter(blk_get_aio_context(blk), co); +aio_co_enter(qemu_get_current_aio_context(), co); acb->has_returned = true; if (acb->rwco.ret != NOT_DONE) { -replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), +replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), blk_aio_complete_bh, acb); } -- 2.41.0
[PATCH v2 4/4] block-coroutine-wrapper: use qemu_get_current_aio_context()
Use qemu_get_current_aio_context() in mixed wrappers and coroutine wrappers so that code runs in the caller's AioContext instead of moving to the BlockDriverState's AioContext. This change is necessary for the multi-queue block layer where any thread can call into the block layer. Most wrappers are IO_CODE where it's safe to use the current AioContext nowadays. BlockDrivers and the core block layer use their own locks and no longer depend on the AioContext lock for thread-safety. The bdrv_create() wrapper invokes GLOBAL_STATE code. Using the current AioContext is safe because this code is only called with the BQL held from the main loop thread. The output of qemu-iotests 051 is sensitive to event loop activity. Update the output because the monitor BH runs at a different time, causing prompts to be printed differently in the output. Signed-off-by: Stefan Hajnoczi --- scripts/block-coroutine-wrapper.py | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py index d4a183db61..f93fe154c3 100644 --- a/scripts/block-coroutine-wrapper.py +++ b/scripts/block-coroutine-wrapper.py @@ -88,8 +88,6 @@ def __init__(self, wrapper_type: str, return_type: str, name: str, raise ValueError(f"no_co function can't be rdlock: {self.name}") self.target_name = f'{subsystem}_{subname}' -self.ctx = self.gen_ctx() - self.get_result = 's->ret = ' self.ret = 'return s.ret;' self.co_ret = 'return ' @@ -162,7 +160,7 @@ def create_mixed_wrapper(func: FuncDecl) -> str: {func.co_ret}{name}({ func.gen_list('{name}') }); }} else {{ {struct_name} s = {{ -.poll_state.ctx = {func.ctx}, +.poll_state.ctx = qemu_get_current_aio_context(), .poll_state.in_progress = true, { func.gen_block('.{name} = {name},') } @@ -186,7 +184,7 @@ def create_co_wrapper(func: FuncDecl) -> str: {func.return_type} {func.name}({ func.gen_list('{decl}') }) {{ {struct_name} s = {{ -.poll_state.ctx = {func.ctx}, +.poll_state.ctx = 
qemu_get_current_aio_context(), .poll_state.in_progress = true, { func.gen_block('.{name} = {name},') } -- 2.41.0
[PATCH 2/2] io: follow coroutine AioContext in qio_channel_yield()
The ongoing QEMU multi-queue block layer effort makes it possible for multiple threads to process I/O in parallel. The nbd block driver is not compatible with the multi-queue block layer yet because QIOChannel cannot be used easily from coroutines running in multiple threads. This series changes the QIOChannel API to make that possible. In the current API, calling qio_channel_attach_aio_context() sets the AioContext where qio_channel_yield() installs an fd handler prior to yielding: qio_channel_attach_aio_context(ioc, my_ctx); ... qio_channel_yield(ioc); // my_ctx is used here ... qio_channel_detach_aio_context(ioc); This API design has limitations: reading and writing must be done in the same AioContext and moving between AioContexts involves a cumbersome sequence of API calls that is not suitable for doing on a per-request basis. There is no fundamental reason why a QIOChannel needs to run within the same AioContext every time qio_channel_yield() is called. QIOChannel only uses the AioContext while inside qio_channel_yield(). The rest of the time, QIOChannel is independent of any AioContext. In the new API, qio_channel_yield() queries the AioContext from the current coroutine using qemu_coroutine_get_aio_context(). There is no need to explicitly attach/detach AioContexts anymore and qio_channel_attach_aio_context() and qio_channel_detach_aio_context() are gone. One coroutine can read from the QIOChannel while another coroutine writes from a different AioContext. This API change allows the nbd block driver to use QIOChannel from any thread. It's important to keep in mind that the block driver already synchronizes QIOChannel access and ensures that two coroutines never read simultaneously or write simultaneously. This patch updates all users of qio_channel_attach_aio_context() to the new API. Most conversions are simple, but vhost-user-server requires a new qemu_coroutine_yield() call to quiesce the vu_client_trip() coroutine when not attached to any AioContext. 
While the API has become simpler, there is one wart: QIOChannel has a special case for the iohandler AioContext (used for handlers that must not run in nested event loops). I didn't find an elegant way to preserve that behavior, so I added a new API called qio_channel_set_follow_coroutine_ctx(ioc, true|false) for opting in to the new AioContext model. By default QIOChannel uses the iohandler AioContext. Code that formerly called qio_channel_attach_aio_context() now calls qio_channel_set_follow_coroutine_ctx(ioc, true) once after the QIOChannel is created. Signed-off-by: Stefan Hajnoczi --- include/io/channel.h | 34 +++-- include/qemu/vhost-user-server.h | 1 + block/nbd.c | 11 +-- io/channel-command.c | 13 +++- io/channel-file.c| 18 - io/channel-null.c| 3 +- io/channel-socket.c | 18 - io/channel-tls.c | 6 +- io/channel.c | 120 ++- migration/channel-block.c| 3 +- nbd/client.c | 2 +- nbd/server.c | 14 +--- scsi/qemu-pr-helper.c| 4 +- util/vhost-user-server.c | 27 +-- 14 files changed, 191 insertions(+), 83 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index 229bf36910..dfbe6f2931 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -81,9 +81,11 @@ struct QIOChannel { Object parent; unsigned int features; /* bitmask of QIOChannelFeatures */ char *name; -AioContext *ctx; +AioContext *read_ctx; Coroutine *read_coroutine; +AioContext *write_ctx; Coroutine *write_coroutine; +bool follow_coroutine_ctx; #ifdef _WIN32 HANDLE event; /* For use with GSource on Win32 */ #endif @@ -140,8 +142,9 @@ struct QIOChannelClass { int whence, Error **errp); void (*io_set_aio_fd_handler)(QIOChannel *ioc, - AioContext *ctx, + AioContext *read_ctx, IOHandler *io_read, + AioContext *write_ctx, IOHandler *io_write, void *opaque); int (*io_flush)(QIOChannel *ioc, @@ -498,6 +501,21 @@ int qio_channel_set_blocking(QIOChannel *ioc, bool enabled, Error **errp); +/** + * qio_channel_set_follow_coroutine_ctx: + * @ioc: the channel object + * @enabled: whether or not 
to follow the coroutine's AioContext + * + * If @enabled is true, calls to qio_channel_yield() use the current + * coroutine's AioContext. Usually this is desirable. + * + * If @enabled is false, calls to qio_channel_yield() use the global iohandler + * AioContext. This may be used by coroutines that run in the main loop and + * do not
[PATCH 0/2] io: follow coroutine AioContext in qio_channel_yield()
The ongoing QEMU multi-queue block layer effort makes it possible for multiple threads to process I/O in parallel. The nbd block driver is not compatible with the multi-queue block layer yet because QIOChannel cannot be used easily from coroutines running in multiple threads. This series changes the QIOChannel API to make that possible. Stefan Hajnoczi (2): io: check there are no qio_channel_yield() coroutines during ->finalize() io: follow coroutine AioContext in qio_channel_yield() include/io/channel.h | 34 - include/qemu/vhost-user-server.h | 1 + block/nbd.c | 11 +-- io/channel-command.c | 13 +++- io/channel-file.c| 18 - io/channel-null.c| 3 +- io/channel-socket.c | 18 - io/channel-tls.c | 6 +- io/channel.c | 124 ++- migration/channel-block.c| 3 +- nbd/client.c | 2 +- nbd/server.c | 14 +--- scsi/qemu-pr-helper.c| 4 +- util/vhost-user-server.c | 27 +-- 14 files changed, 195 insertions(+), 83 deletions(-) -- 2.41.0
[PATCH 1/2] io: check there are no qio_channel_yield() coroutines during ->finalize()
Callers must clean up their coroutines before calling object_unref(OBJECT(ioc)) to prevent an fd handler leak. Add an assertion to check this. This patch is preparation for the fd handler changes that follow. Signed-off-by: Stefan Hajnoczi --- io/channel.c | 4 1 file changed, 4 insertions(+) diff --git a/io/channel.c b/io/channel.c index 72f0066af5..c415f3fc88 100644 --- a/io/channel.c +++ b/io/channel.c @@ -653,6 +653,10 @@ static void qio_channel_finalize(Object *obj) { QIOChannel *ioc = QIO_CHANNEL(obj); +/* Must not have coroutines in qio_channel_yield() */ +assert(!ioc->read_coroutine); +assert(!ioc->write_coroutine); + g_free(ioc->name); #ifdef _WIN32 -- 2.41.0
Re: [PATCH v2 0/3] block: align CoR requests to subclusters
On 8/22/23 22:58, John Snow wrote: > On Tue, Aug 22, 2023 at 1:33 PM Andrey Drobyshev > wrote: >> >> On 8/16/23 12:22, Andrey Drobyshev wrote: >>> On 7/31/23 17:51, Andrey Drobyshev wrote: On 7/24/23 16:11, Andrey Drobyshev wrote: > On 7/11/23 20:25, Andrey Drobyshev wrote: >> v1 --> v2: >> * Fixed line indentation; >> * Fixed wording in a comment; >> * Added R-b. >> >> v1: >> https://lists.nongnu.org/archive/html/qemu-block/2023-06/msg00606.html >> >> Andrey Drobyshev (3): >> block: add subcluster_size field to BlockDriverInfo >> block/io: align requests to subcluster_size >> tests/qemu-iotests/197: add testcase for CoR with subclusters >> >> block.c | 7 + >> block/io.c | 50 ++-- >> block/mirror.c | 8 +++--- >> block/qcow2.c| 1 + >> include/block/block-common.h | 5 >> include/block/block-io.h | 8 +++--- >> tests/qemu-iotests/197 | 29 + >> tests/qemu-iotests/197.out | 24 + >> 8 files changed, 99 insertions(+), 33 deletions(-) >> > > Ping Another ping >>> >>> Yet another friendly ping >> >> One more friendly ping > > Looks like Stefan gave you an R-B for the series; do we just need an > ACK by the block maintainers at this point or is there someone > specific you're hoping will review this? > > --js > Hi John, I figure a maintainer's R-b doesn't imply the patches being merged into the tree. Hence I'm waiting for the notice that they actually are merged. Please let me know if the process should be different. Andrey
[PATCH v4 3/3] hw/nvme: add nvme management interface model
From: Klaus Jensen Add the 'nmi-i2c' device that emulates an NVMe Management Interface controller. Initial support is very basic (Read NMI DS, Configuration Get). This is based on previously posted code by Padmakar Kalghatgi, Arun Kumar Agasar and Saurav Kumar. Signed-off-by: Klaus Jensen --- hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c| 418 +++ hw/nvme/trace-events | 6 + 4 files changed, 429 insertions(+) diff --git a/hw/nvme/Kconfig b/hw/nvme/Kconfig index 8ac90942e55e..1d89a4f4ecea 100644 --- a/hw/nvme/Kconfig +++ b/hw/nvme/Kconfig @@ -2,3 +2,7 @@ config NVME_PCI bool default y if PCI_DEVICES depends on PCI + +config NVME_NMI_I2C +bool +default y if I2C_MCTP diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7bc85f31c149 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1,2 @@ system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_NMI_I2C', if_true: files('nmi-i2c.c')) diff --git a/hw/nvme/nmi-i2c.c b/hw/nvme/nmi-i2c.c new file mode 100644 index ..9040ba28a87c --- /dev/null +++ b/hw/nvme/nmi-i2c.c @@ -0,0 +1,418 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. 
+ * + * SPDX-FileContributor: Padmakar Kalghatgi + * SPDX-FileContributor: Arun Kumar Agasar + * SPDX-FileContributor: Saurav Kumar + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/crc32c.h" +#include "hw/registerfields.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" +#include "trace.h" + +#define NMI_MAX_MESSAGE_LENGTH 4224 + +#define TYPE_NMI_I2C_DEVICE "nmi-i2c" +OBJECT_DECLARE_SIMPLE_TYPE(NMIDevice, NMI_I2C_DEVICE) + +typedef struct NMIDevice { +MCTPI2CEndpoint mctp; + +uint8_t buffer[NMI_MAX_MESSAGE_LENGTH]; +uint8_t scratch[NMI_MAX_MESSAGE_LENGTH]; + +size_t len; +int64_t pos; +} NMIDevice; + +FIELD(NMI_MCTPD, MT, 0, 7) +FIELD(NMI_MCTPD, IC, 7, 1) + +#define NMI_MCTPD_MT_NMI 0x4 +#define NMI_MCTPD_IC_ENABLED 0x1 + +FIELD(NMI_NMP, ROR, 7, 1) +FIELD(NMI_NMP, NMIMT, 3, 4) + +#define NMI_NMP_NMIMT_NVME_MI 0x1 +#define NMI_NMP_NMIMT_NVME_ADMIN 0x2 + +typedef struct NMIMessage { +uint8_t mctpd; +uint8_t nmp; +uint8_t rsvd2[2]; +uint8_t payload[]; /* includes the Message Integrity Check */ +} NMIMessage; + +typedef struct NMIRequest { + uint8_t opc; + uint8_t rsvd1[3]; + uint32_t dw0; + uint32_t dw1; + uint32_t mic; +} NMIRequest; + +typedef enum NMIReadDSType { +NMI_CMD_READ_NMI_DS_SUBSYSTEM = 0x0, +NMI_CMD_READ_NMI_DS_PORTS = 0x1, +NMI_CMD_READ_NMI_DS_CTRL_LIST = 0x2, +NMI_CMD_READ_NMI_DS_CTRL_INFO = 0x3, +NMI_CMD_READ_NMI_DS_OPT_CMD_SUPPORT = 0x4, +NMI_CMD_READ_NMI_DS_MEB_CMD_SUPPORT = 0x5, +} NMIReadDSType; + +#define NMI_STATUS_INVALID_PARAMETER 0x4 + +static void nmi_set_parameter_error(NMIDevice *nmi, uint8_t bit, uint16_t byte) +{ +/* NVM Express Management Interface 1.2c, Figure 30 */ +struct resp { +uint8_t status; +uint8_t bit; +uint16_t byte; +}; + +struct resp *buf = (struct resp *)(nmi->scratch + nmi->pos); + +buf->status = NMI_STATUS_INVALID_PARAMETER; +buf->bit = bit & 0x3; +buf->byte = byte; + +nmi->pos += sizeof(struct resp); +} + +static void nmi_set_error(NMIDevice *nmi, uint8_t 
status) +{ +uint8_t buf[4] = {}; + +buf[0] = status; + +memcpy(nmi->scratch + nmi->pos, buf, 4); +nmi->pos += 4; +} + +static void nmi_handle_mi_read_nmi_ds(NMIDevice *nmi, NMIRequest *request) +{ +I2CSlave *i2c = I2C_SLAVE(nmi); + +uint32_t dw0 = le32_to_cpu(request->dw0); +uint8_t dtyp = (dw0 >> 24) & 0xf; +uint8_t *buf; +size_t len; + +trace_nmi_handle_mi_read_nmi_ds(dtyp); + +static uint8_t nmi_ds_subsystem[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x00, /* number of ports */ +0x01, /* major version */ +0x01, /* minor version */ +}; + +/* cannot be static since we need to patch in the i2c address */ +uint8_t nmi_ds_ports[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x02, /* port type (smbus) */ +0x00, /* reserved */ +0x40, 0x00, /* maximum mctp transission unit size (64 bytes) */ +0x00, 0x00, 0x00, 0x00, /* management endpoint buffer size */ +0x00, /* vpd i2c address */ +0x00, /* vpd i2c frequency */ +0x00, /* management endpoint i2c address */ +0x01, /* management endpoint i2c frequency */ +0x00, /* nvme basic management
[PATCH v4 2/3] hw/i2c: add mctp core
From: Klaus Jensen Add an abstract MCTP over I2C endpoint model. This implements MCTP control message handling as well as handling the actual I2C transport (packetization). Devices are intended to derive from this and implement the class methods. Parts of this implementation are inspired by code[1] previously posted by Jonathan Cameron. Squashed a fix[2] from Matt Johnston. [1]: https://lore.kernel.org/qemu-devel/20220520170128.4436-1-jonathan.came...@huawei.com/ [2]: https://lore.kernel.org/qemu-devel/20221121080445.ga29...@codeconstruct.com.au/ Signed-off-by: Klaus Jensen --- MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 428 ++ hw/i2c/meson.build| 1 + hw/i2c/trace-events | 13 ++ include/hw/i2c/mctp.h | 127 +++ include/net/mctp.h| 35 + 8 files changed, 616 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6111b6b4d928..8ca71167607d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3395,6 +3395,13 @@ F: tests/qtest/adm1272-test.c F: tests/qtest/max34451-test.c F: tests/qtest/isl_pmbus_vr-test.c +MCTP I2C Transport +M: Klaus Jensen +S: Maintained +F: hw/i2c/mctp.c +F: include/hw/i2c/mctp.h +F: include/net/mctp.h + Firmware schema specifications M: Philippe Mathieu-Daudé R: Daniel P. 
Berrange diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 7e6834844051..5bcb1e0e8a6f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -541,6 +541,7 @@ config ASPEED_SOC select DS1338 select FTGMAC100 select I2C +select I2C_MCTP select DPS310 select PCA9552 select SERIAL diff --git a/hw/i2c/Kconfig b/hw/i2c/Kconfig index 14886b35dac2..2b2a50b83d1e 100644 --- a/hw/i2c/Kconfig +++ b/hw/i2c/Kconfig @@ -6,6 +6,10 @@ config I2C_DEVICES # to any board's i2c bus bool +config I2C_MCTP +bool +select I2C + config SMBUS bool select I2C diff --git a/hw/i2c/mctp.c b/hw/i2c/mctp.c new file mode 100644 index ..217073d62435 --- /dev/null +++ b/hw/i2c/mctp.c @@ -0,0 +1,428 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" + +#include "hw/qdev-properties.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/smbus_master.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" + +#include "trace.h" + +/* DSP0237 1.2.0, Figure 1 */ +typedef struct MCTPI2CPacketHeader { +uint8_t dest; +#define MCTP_I2C_COMMAND_CODE 0xf +uint8_t command_code; +uint8_t byte_count; +uint8_t source; +} MCTPI2CPacketHeader; + +typedef struct MCTPI2CPacket { +MCTPI2CPacketHeader i2c; +MCTPPacket mctp; +} MCTPI2CPacket; + +#define i2c_mctp_payload_offset offsetof(MCTPI2CPacket, mctp.payload) +#define i2c_mctp_payload(buf) (buf + i2c_mctp_payload_offset) + +/* DSP0236 1.3.0, Figure 20 */ +typedef struct MCTPControlMessage { +#define MCTP_MESSAGE_TYPE_CONTROL 0x0 +uint8_t type; +#define MCTP_CONTROL_FLAGS_RQ (1 << 7) +#define MCTP_CONTROL_FLAGS_D(1 << 6) +uint8_t flags; +uint8_t command_code; +uint8_t data[]; +} MCTPControlMessage; + +enum MCTPControlCommandCodes { +MCTP_CONTROL_SET_EID= 0x01, +MCTP_CONTROL_GET_EID= 0x02, +MCTP_CONTROL_GET_VERSION= 0x04, +MCTP_CONTROL_GET_MESSAGE_TYPE_SUPPORT = 0x05, +}; + +#define 
MCTP_CONTROL_ERROR_UNSUPPORTED_CMD 0x5 + +#define i2c_mctp_control_data_offset \ +(i2c_mctp_payload_offset + offsetof(MCTPControlMessage, data)) +#define i2c_mctp_control_data(buf) (buf + i2c_mctp_control_data_offset) + +/** + * The byte count field in the SMBUS Block Write containers the number of bytes + * *following* the field itself. + * + * This is at least 5. + * + * 1 byte for the MCTP/I2C piggy-backed I2C source address in addition to the + * size of the MCTP transport/packet header. + */ +#define MCTP_I2C_BYTE_COUNT_OFFSET (sizeof(MCTPPacketHeader) + 1) + +void i2c_mctp_schedule_send(MCTPI2CEndpoint *mctp) +{ +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(DEVICE(mctp))); + +mctp->tx.state = I2C_MCTP_STATE_TX_START_SEND; + +i2c_bus_master(i2c, mctp->tx.bh); +} + +static void i2c_mctp_tx(void *opaque) +{ +DeviceState *dev = DEVICE(opaque); +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(dev)); +I2CSlave *slave = I2C_SLAVE(dev); +MCTPI2CEndpoint *mctp = MCTP_I2C_ENDPOINT(dev); +MCTPI2CEndpointClass *mc = MCTP_I2C_ENDPOINT_GET_CLASS(mctp); +MCTPI2CPacket *pkt = (MCTPI2CPacket *)mctp->buffer; +uint8_t flags = 0; + +switch (mctp->tx.state) { +case I2C_MCTP_STATE_TX_SEND_BYTE: +if (mctp->pos < mctp->len) { +uint8_t byte = mctp->buffer[mctp->pos]; + +trace_i2c_mctp_tx_send_byte(mctp->pos, byte); + +
[PATCH v4 0/3] hw/{i2c,nvme}: mctp endpoint, nvme management interface model
This adds a generic MCTP endpoint model that other devices may derive from. Also included is a very basic implementation of an NVMe-MI device, supporting only a small subset of the required commands. Since this all relies on i2c target mode, this can currently only be used with an SoC that includes the Aspeed I2C controller. The easiest way to get up and running with this, is to grab my buildroot overlay[1] (aspeed_ast2600evb_nmi_defconfig). It includes a modified dts as well as a couple of required packages. QEMU can then be launched along these lines: qemu-system-arm \ -nographic \ -M ast2600-evb \ -kernel output/images/zImage \ -initrd output/images/rootfs.cpio \ -dtb output/images/aspeed-ast2600-evb-nmi.dtb \ -nic user,hostfwd=tcp::-:22 \ -device nmi-i2c,address=0x3a \ -serial mon:stdio From within the booted system, mctp addr add 8 dev mctpi2c15 mctp link set mctpi2c15 up mctp route add 9 via mctpi2c15 mctp neigh add 9 dev mctpi2c15 lladdr 0x3a mi-mctp 1 9 info Comments are very welcome! [1]: https://github.com/birkelund/hwtests/tree/main/br2-external Changes since v3 - Inlined the POLY define in the crc8 function (Philippe) - Changed a bunch of fields to use registerfields.h - From Jonathan: + Added references to specs defining the structures. - From Corey: + Reworked the buffer handling (again) ;) Derived devices can now never write into the mctp core buffers. Instead, the mctp core will request a buffer pointer and copy from there. Internally, within the mctp core, writes to internal buffers are also checked. Changes since v2 - Applied a bunch of feedback from Jonathan: + Moved a lot of internally used structs out of the include headers and into the source files. + Added spec references in various places + Split the patch for i2c_smbus_pec() into its own + Fix a compile error (and bug) in nmi-i2c.c. - From Corey: + Reworked the buffer handling. The deriving devices now return a pointer to their own buffer that the mctp core copies into. 
+ Added a couple of extra debugging trace events. Changes since v1 - Fix SPDX-License tag for hw/nvme/nmi-i2c.c (Philippe) - Add some asserts to verify buffer indices (by request from Corey). - Drop short packets that could result in underflow (Corey) - Move i2c_smbus_pec() to smbus common code (Corey) - A couple of logic fixes (patch from Jeremy squashed in) - Added a patch to handle messages with dest eid 0 (Matt) Maybe squash this as well. Signed-off-by: Klaus Jensen --- Klaus Jensen (3): hw/i2c: add smbus pec utility function hw/i2c: add mctp core hw/nvme: add nvme management interface model MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 428 ++ hw/i2c/meson.build| 1 + hw/i2c/smbus_master.c | 26 +++ hw/i2c/trace-events | 13 ++ hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c | 418 + hw/nvme/trace-events | 6 + include/hw/i2c/mctp.h | 127 + include/hw/i2c/smbus_master.h | 2 + include/net/mctp.h| 35 14 files changed, 1073 insertions(+) --- base-commit: b0dd9a7d6dd15a6898e9c585b521e6bec79b25aa change-id: 20230822-nmi-i2c-d804ed5be7e6 Best regards, -- Klaus Jensen
[PATCH v4 1/3] hw/i2c: add smbus pec utility function
From: Klaus Jensen Add i2c_smbus_pec() to calculate the SMBus Packet Error Code for a message. Signed-off-by: Klaus Jensen --- hw/i2c/smbus_master.c | 26 ++ include/hw/i2c/smbus_master.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/i2c/smbus_master.c b/hw/i2c/smbus_master.c index 6a53c34e70b7..01a8e4700222 100644 --- a/hw/i2c/smbus_master.c +++ b/hw/i2c/smbus_master.c @@ -15,6 +15,32 @@ #include "hw/i2c/i2c.h" #include "hw/i2c/smbus_master.h" +static uint8_t crc8(uint16_t data) +{ +int i; + +for (i = 0; i < 8; i++) { +if (data & 0x8000) { +data ^= 0x1070U << 3; +} + +data <<= 1; +} + +return (uint8_t)(data >> 8); +} + +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len) +{ +int i; + +for (i = 0; i < len; i++) { +crc = crc8((crc ^ buf[i]) << 8); +} + +return crc; +} + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read) { diff --git a/include/hw/i2c/smbus_master.h b/include/hw/i2c/smbus_master.h index bb13bc423c22..d90f81767d86 100644 --- a/include/hw/i2c/smbus_master.h +++ b/include/hw/i2c/smbus_master.h @@ -27,6 +27,8 @@ #include "hw/i2c/i2c.h" +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len); + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read); int smbus_receive_byte(I2CBus *bus, uint8_t addr); -- 2.42.0
Re: Lost partition tables on ide-hd + ahci drive
Am 23.08.23 um 10:47 schrieb Fiona Ebner: > Am 17.02.23 um 22:22 schrieb Mike Maslenkin: >> I can not tell anything about dma-reentrancy issues, but yes, I would >> start to look at check_cmd() function call sequence. >> The most interesting is why Sector Count = 1. I thought about race >> with IDE reset where registers initialized with >> value SATA_SIGNATURE_DISK = 0x0101, but this means LBA=1 as well... >> > > You got it! Since we got another report (after half a year of nothing) > and also because of Simon's mail, I gave it another shot too and was > finally able to reproduce the issue (with our patched QEMU 8.0, but > patches shouldn't affect IDE code). See below for the traces that > confirm your theory. The reason the write goes to sector 0 and not 1 is > because ide_dma_cb() uses sector_num = ide_get_sector(s); and that will > evaluate to 0 after a reset. > > So the issue is indeed that ide_dma_cb can get called with an IDEState > just after that state was reset. Can we somehow wait for pending > requests before proceeding with the reset, or can we force an error > return for callbacks that are still pending during reset? > I noticed that ide_bus_reset() does the reset first and then cancels the aiocb. Maybe it's already enough to switch those around? Best Regards, Fiona
Re: Lost partition tables on ide-hd + ahci drive
Am 17.02.23 um 22:22 schrieb Mike Maslenkin: > I can not tell anything about dma-reentrancy issues, but yes, I would > start to look at check_cmd() function call sequence. > The most interesting is why Sector Count = 1. I thought about race > with IDE reset where registers initialized with > value SATA_SIGNATURE_DISK = 0x0101, but this means LBA=1 as well... > You got it! Since we got another report (after half a year of nothing) and also because of Simon's mail, I gave it another shot too and was finally able to reproduce the issue (with our patched QEMU 8.0, but patches shouldn't affect IDE code). See below for the traces that confirm your theory. The reason the write goes to sector 0 and not 1 is because ide_dma_cb() uses sector_num = ide_get_sector(s); and that will evaluate to 0 after a reset. So the issue is indeed that ide_dma_cb can get called with an IDEState just after that state was reset. Can we somehow wait for pending requests before proceeding with the reset, or can we force an error return for callbacks that are still pending during reset?
Best Regards, Fiona QEMU trace log (-trace dma_*,file=/root/sata.log -trace ide_*,file=/root/sata.log -trace ahci_*,file=/root/sata.log -trace *ncq*,file=/root/sata.log -trace handle_cmd*,file=/root/sata.log) > ahci_port_write ahci(0x5595af6923f0)[0]: port write [reg:PxSCTL] @ 0x2c: > 0x0300 > ahci_reset_port ahci(0x5595af6923f0)[0]: reset port > ide_reset IDEstate 0x5595af6949d0 > ide_reset IDEstate 0x5595af694da8 > ide_bus_reset_aio aio_cancel > dma_aio_cancel dbs=0x7f64600089a0 > dma_blk_cb dbs=0x7f64600089a0 ret=0 > dma_complete dbs=0x7f64600089a0 ret=0 cb=0x5595acd40b30 > ahci_populate_sglist ahci(0x5595af6923f0)[0] > ahci_dma_prepare_buf ahci(0x5595af6923f0)[0]: prepare buf limit=512 > prepared=512 > ide_dma_cb IDEState 0x5595af6949d0; sector_num=0 n=1 cmd=DMA WRITE > dma_blk_io dbs=0x7f6420802010 bs=0x5595ae2c6c30 offset=0 to_dev=1 > dma_blk_cb dbs=0x7f6420802010 ret=0 Info from GDB: > (gdb) p *qiov > > $11 = {iov = 0x7f647c76d840, niov = 1, {{nalloc = 1, local_iov = {iov_base = > 0x0, > iov_len = 512}}, {__pad = > "\001\000\000\000\000\000\000\000\000\000\000", > size = 512}}} > (gdb) bt > > #0 blk_aio_pwritev (blk=0x5595ae2c6c30, offset=0, qiov=0x7f6420802070, > flags=0, > cb=0x5595ace6f0b0 , opaque=0x7f6420802010) > at ../block/block-backend.c:1682 > #1 0x5595ace6f185 in dma_blk_cb (opaque=0x7f6420802010, ret=<optimized out>) > at ../softmmu/dma-helpers.c:179 > #2 0x5595ace6f778 in dma_blk_io (ctx=0x5595ae0609f0, > sg=sg@entry=0x5595af694d00, offset=offset@entry=0, align=align@entry=512, > io_func=io_func@entry=0x5595ace6ee30 , > io_func_opaque=io_func_opaque@entry=0x5595ae2c6c30, > cb=0x5595acd40b30 , opaque=0x5595af6949d0, > dir=DMA_DIRECTION_TO_DEVICE) at ../softmmu/dma-helpers.c:244 > #3 0x5595ace6f90a in dma_blk_write (blk=0x5595ae2c6c30, > sg=sg@entry=0x5595af694d00, offset=offset@entry=0, align=align@entry=512, > cb=cb@entry=0x5595acd40b30 , > opaque=opaque@entry=0x5595af6949d0) > at ../softmmu/dma-helpers.c:280 > #4 0x5595acd40e18 in ide_dma_cb 
(opaque=0x5595af6949d0, ret= out>) > at ../hw/ide/core.c:953 > #5 0x5595ace6f319 in dma_complete (ret=0, dbs=0x7f64600089a0) > at ../softmmu/dma-helpers.c:107 > #6 dma_blk_cb (opaque=0x7f64600089a0, ret=0) at ../softmmu/dma-helpers.c:127 > #7 0x5595ad12227d in blk_aio_complete (acb=0x7f6460005b10) > at ../block/block-backend.c:1527 > #8 blk_aio_complete (acb=0x7f6460005b10) at ../block/block-backend.c:1524 > #9 blk_aio_write_entry (opaque=0x7f6460005b10) at > ../block/block-backend.c:1594 > #10 0x5595ad258cfb in coroutine_trampoline (i0=, > i1=) at ../util/coroutine-ucontext.c:177 > #11 0x7f64f2fcb8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6 > #12 0x7f64d0ff3290 in ?? () > #13 0x in ?? () This is of course not directly after the reset, since the break happened a bit later: > (gdb) p *((IDEState*)0x5595af6949d0) > > $12 = {bus = 0x5595af694948, unit = 0 '\000', drive_kind = IDE_HD, > drive_heads = 16, drive_sectors = 63, cylinders = 8740, heads = 16, > sectors = 63, chs_trans = 2, nb_sectors = 8810496, mult_sectors = 16, > identify_set = 1, > identify_data = > "@\000$\"\000\000\020\000\000~\000\002?\000\000\000\000\000\000\000MQ 5", > ' ' , "\003\000\000\002\004\000.2+5EQUMH RADDSI K", ' ' > , > "\020\200\001\000\000\v\000\000\000\002\000\002\a\000$\"\020\000?\000\300m\206\000\020\001\000p\206\000\a\000\a\000\003\000x\000x\000x\000x\000\000@\000\000\000\000\000\000\000\000\000\000\037\000\000\001\000\000\000\000\000\000\360\000\026\000!@\000t\000@!@\0004\000@?\020\000\000\000\000\000\000\000\000\001`", > '\000' ..., drive_serial = 5, > drive_serial_str = "QM5", '\000' , > drive_model_str = "QEMU HARDDISK", '\000' , wwn = 0, > feature = 0 '\000', error = 0 '\000', nsector = 1, sector = 1 '\001',