perform_cow() can satisfy a partial-cluster COW write by merging the
preserved head, the guest payload, and the preserved tail into a single
write request. Today that merged write is submitted as a multi-segment
qiov.

With a raw data file opened with O_DIRECT, that request can violate host
DMA alignment restrictions on some setups. The COW head and tail
buffers are blocked aligned, but the guest payload may still be provided
as a separately spliced iovec with an internal offset. On affected
hosts, the resulting direct write fails which can break some
partial-cluster COW workloads.

Fix this by assembling merged partial-cluster COW writes into one aligned
temporary buffer before submitting them to an O_DIRECT data file. Keep
the existing scatter-gather path for buffered I/O.

Closes: https://gitlab.com/qemu-project/qemu/-/work_items/3521
Reported-by: Farrah Chen <[email protected]>
Signed-off-by: Chenyi Qiang <[email protected]>

---

Hi block/qcow2 folks,

This patch is intened to fix a reported qcow2 COW issue seen when
launching a VM from a refreshed overlay image.

I am not very familiar with this qcow2 path, so I would appreciate
feedback on both the root-cause analysis and the proposed fix. I used AI
as a debugging aid while looking into the issue, and since AI-related
project policy is still being discussed in QEMU, I am posting this patch
mainly to get feedback from people who know this code better.

I tested this change against the reported case and it does resolve the
problem for me, but I am not yet confident that this is the right fix
for upstream.
---
 block/qcow2-cluster.c | 50 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8b1e80bd0b..7129b41d68 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -893,7 +893,7 @@ perform_cow(BlockDriverState *bs, QCowL2Meta *m)
     unsigned buffer_size;
     unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
     bool merge_reads;
-    uint8_t *start_buffer, *end_buffer;
+    uint8_t *start_buffer, *end_buffer, *write_buffer = NULL;
     QEMUIOVector qiov;
     int ret;
 
@@ -982,12 +982,45 @@ perform_cow(BlockDriverState *bs, QCowL2Meta *m)
      * can write everything in one single operation */
     if (m->data_qiov) {
         qemu_iovec_reset(&qiov);
-        if (start->nb_bytes) {
-            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
-        }
-        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, 
data_bytes);
-        if (end->nb_bytes) {
-            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
+        /*
+         * Direct I/O through the raw file backend can still hit host DMA
+         * alignment restrictions when we splice guest buffers between the
+         * COW head/tail regions. For those writes, assemble a single aligned
+         * buffer before submitting the COW write.
+         */
+        if (s->data_file->bs->open_flags & BDRV_O_NOCACHE) {
+            uint64_t write_size = start->nb_bytes + data_bytes + end->nb_bytes;
+
+            if (merge_reads) {
+                write_buffer = start_buffer;
+            } else {
+                write_buffer = qemu_try_blockalign(bs, write_size);
+                if (write_buffer == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
+
+                if (start->nb_bytes) {
+                    memcpy(write_buffer, start_buffer, start->nb_bytes);
+                }
+                if (end->nb_bytes) {
+                    memcpy(write_buffer + start->nb_bytes + data_bytes,
+                           end_buffer, end->nb_bytes);
+                }
+            }
+
+            qemu_iovec_to_buf(m->data_qiov, m->data_qiov_offset,
+                              write_buffer + start->nb_bytes, data_bytes);
+            qemu_iovec_add(&qiov, write_buffer, write_size);
+        } else {
+            if (start->nb_bytes) {
+                qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
+            }
+            qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset,
+                              data_bytes);
+            if (end->nb_bytes) {
+                qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
+            }
         }
         /* NOTE: we have a write_aio blkdebug event here followed by
          * a cow_write one in do_perform_cow_write(), but there's only
@@ -1020,6 +1053,9 @@ fail:
         qcow2_cache_depends_on_flush(s->l2_table_cache);
     }
 
+    if (write_buffer != start_buffer) {
+        qemu_vfree(write_buffer);
+    }
     qemu_vfree(start_buffer);
     qemu_iovec_destroy(&qiov);
     return ret;
-- 
2.43.5


Reply via email to