[PULL 15/18] qcow2: Fix corruption on discard during write with COW

Kevin Wolf Tue, 19 May 2026 10:06:01 -0700

Most code in qcow2 that accesses (and potentially modifies) L2 tables
does so while holding s->lock.


There is one exception, which is allocating writes. They hold the lock
initially while allocating clusters, but drop it for writing the guest
payload before taking the lock again for updating the L2 tables. This
allows concurrent requests that touch other parts of the image file to
continue in parallel and is an important performance optimisation.

However, this means that other requests that run while the lock is
dropped for writing guest data must synchronise with the list of
allocating requests in s->cluster_allocs and wait if they would overlap.
For writes, this is done in handle_dependencies(), but discard and write
zeros operations neglect to synchronise with s->cluster_allocs.

This means that discard can free a cluster whose L2 entry is still going
to be modified in qcow2_alloc_cluster_link_l2() by a previously started
write. In the case of a pre-allocated zero cluster that is in the
process of being overwritten, this means that discard can lead to a
situation where the cluster is still mapped (because the write will
restore the L2 entry just without the zero flag), but its refcount has
been decreased, resulting in a corrupted image.

Add the missing synchronisation to qcow2_cluster_discard() and
qcow2_subcluster_zeroize() to fix the problem.

Cc: [email protected]
Reported-by: Denis V. Lunev <[email protected]>
Signed-off-by: Kevin Wolf <[email protected]>
Message-ID: <[email protected]>
Reviewed-by: Denis V. Lunev <[email protected]>
Tested-by: Denis V. Lunev <[email protected]>
Signed-off-by: Kevin Wolf <[email protected]>
---
 block/qcow2-cluster.c | 52 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c655bf6df42..8b1e80bd0b3 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1392,6 +1392,9 @@ count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
  * the same cluster. In this case we need to wait until the previous
  * request has completed and updated the L2 table accordingly.
  *
+ * If allow_shortening == true, instead of waiting for a dependency, *cur_bytes
+ * can be shortened so that the cluster allocations don't overlap.
+ *
  * Returns:
  *   0       if there was no dependency. *cur_bytes indicates the number of
  *           bytes from guest_offset that can be read before the next
@@ -1403,7 +1406,9 @@ count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
  */
 static int coroutine_fn handle_dependencies(BlockDriverState *bs,
                                             uint64_t guest_offset,
-                                            uint64_t *cur_bytes, QCowL2Meta **m)
+                                            uint64_t *cur_bytes,
+                                            bool allow_shortening,
+                                            QCowL2Meta **m)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowL2Meta *old_alloc;
@@ -1434,7 +1439,7 @@ static int coroutine_fn handle_dependencies(BlockDriverState *bs,
 
         /* Conflict */
 
-        if (start < old_start) {
+        if (start < old_start && allow_shortening) {
             /* Stop at the start of a running allocation */
             bytes = old_start - start;
         } else {
@@ -1469,6 +1474,29 @@ static int coroutine_fn handle_dependencies(BlockDriverState *bs,
     return 0;
 }
 
+static void coroutine_mixed_fn wait_for_dependencies(BlockDriverState *bs,
+                                                     uint64_t guest_offset,
+                                                     uint64_t bytes)
+{
+    BDRVQcow2State *s = bs->opaque;
+    QCowL2Meta *m = NULL;
+    int ret;
+
+    /*
+     * Discard has some non-coroutine callers (creating internal snapshots and
+     * make empty). They are calling from qemu-img or in a drained section, so
+     * we know that no writes can be in progress.
+     */
+    if (!qemu_in_coroutine()) {
+        assert(QLIST_EMPTY(&s->cluster_allocs));
+        return;
+    }
+
+    do {
+        ret = handle_dependencies(bs, guest_offset, &bytes, false, &m);
+    } while (ret == -EAGAIN);
+}
+
 /*
  * Checks how many already allocated clusters that don't require a new
  * allocation there are at the given guest_offset (up to *bytes).
@@ -1840,7 +1868,7 @@ again:
          *         the right synchronisation between the in-flight request and
          *         the new one.
          */
-        ret = handle_dependencies(bs, start, &cur_bytes, m);
+        ret = handle_dependencies(bs, start, &cur_bytes, true, m);
         if (ret == -EAGAIN) {
             /* Currently handle_dependencies() doesn't yield if we already had
              * an allocation. If it did, we would have to clean up the L2Meta
@@ -2000,6 +2028,15 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
     int64_t cleared;
     int ret;
 
+    /*
+     * If we're touching a cluster for which allocating writes are in flight,
+     * wait for them to complete to avoid conflicting metadata updates.
+     *
+     * We don't need to allocate a QCowL2Meta for the discard operation because
+     * s->lock is held for the duration of the whole operation.
+     */
+    wait_for_dependencies(bs, offset, bytes);
+
     /* Caller must pass aligned values, except at image end */
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
@@ -2160,6 +2197,15 @@ int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
     int64_t cleared;
     int ret;
 
+    /*
+     * If we're touching a cluster for which allocating writes are in flight,
+     * wait for them to complete to avoid conflicting metadata updates.
+     *
+     * We don't need to allocate a QCowL2Meta for the zeroize operation because
+     * s->lock is held for the duration of the whole operation.
+     */
+    wait_for_dependencies(bs, offset, bytes);
+
     /* If we have to stay in sync with an external data file, zero out
      * s->data_file first. */
     if (data_file_is_raw(bs)) {
-- 
2.54.0

[PULL 15/18] qcow2: Fix corruption on discard during write with COW

Reply via email to