Re: [PATCH v4 15/21] migration/block-dirty-bitmap: relax error handling in incoming part

2020-07-27 Thread Eric Blake

On 7/27/20 2:42 PM, Vladimir Sementsov-Ogievskiy wrote:

Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.



I'm amending this to also add:

While touching this, tighten code that was previously blindly calling 
malloc on a size read from the migration stream, as a corrupted stream 
(perhaps from a malicious user) should not be able to convince us to 
allocate an inordinate amount of memory.



Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  migration/block-dirty-bitmap.c | 164 +
  1 file changed, 127 insertions(+), 37 deletions(-)




@@ -650,15 +695,46 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  
  if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {

  trace_dirty_bitmap_load_bits_zeroes();
-bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+if (!s->cancelled) {
+bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+}
  } else {
  size_t ret;
-uint8_t *buf;
+g_autofree uint8_t *buf = NULL;
  uint64_t buf_size = qemu_get_be64(f);
-uint64_t needed_size =
-bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+uint64_t needed_size;
+
+/*
+ * Actual check for buf_size is done a bit later. We can't do it in


s/Actual/The actual/


+ * cancelled mode as we don't have the bitmap to check the constraints
+ * (so, we do allocate buffer and read prior to the check). On the 
other
+ * hand, we shouldn't blindly g_malloc the number from the stream.
+ * Actually one chunk should not be larger thatn CHUNK_SIZE. Let's 
allow


than


+ * a bit larger (which means that bitmap migration will fail anyway and
+ * the whole migration will most probably fail soon due to broken
+ * stream).
+ */
+if (buf_size > 10 * CHUNK_SIZE) {
+error_report("Bitmap migration stream requests too large buffer "
+ "size to allocate");


Bitmap migration stream buffer allocation request is too large

I'll make those touchups.

Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




[PATCH v4 15/21] migration/block-dirty-bitmap: relax error handling in incoming part

2020-07-27 Thread Vladimir Sementsov-Ogievskiy
Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 migration/block-dirty-bitmap.c | 164 +
 1 file changed, 127 insertions(+), 37 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index eb4ffeac4d..4e45e79251 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -145,6 +145,15 @@ typedef struct DBMLoadState {
 
 bool before_vm_start_handled; /* set in dirty_bitmap_mig_before_vm_start */
 
+/*
+ * cancelled
+ * Incoming migration is cancelled for some reason. That means that we
+ * still should read our chunks from migration stream, to not affect other
+ * migration objects (like RAM), but just ignore them and do not touch any
+ * bitmaps or nodes.
+ */
+bool cancelled;
+
 GSList *bitmaps;
 QemuMutex lock; /* protect bitmaps */
 } DBMLoadState;
@@ -531,6 +540,10 @@ static int dirty_bitmap_load_start(QEMUFile *f, 
DBMLoadState *s)
 uint8_t flags = qemu_get_byte(f);
 LoadBitmapState *b;
 
+if (s->cancelled) {
+return 0;
+}
+
 if (s->bitmap) {
 error_report("Bitmap with the same name ('%s') already exists on "
  "destination", bdrv_dirty_bitmap_name(s->bitmap));
@@ -613,13 +626,47 @@ void dirty_bitmap_mig_before_vm_start(void)
 qemu_mutex_unlock(>lock);
 }
 
+static void cancel_incoming_locked(DBMLoadState *s)
+{
+GSList *item;
+
+if (s->cancelled) {
+return;
+}
+
+s->cancelled = true;
+s->bs = NULL;
+s->bitmap = NULL;
+
+/* Drop all unfinished bitmaps */
+for (item = s->bitmaps; item; item = g_slist_next(item)) {
+LoadBitmapState *b = item->data;
+
+/*
+ * Bitmap must be unfinished, as finished bitmaps should already be
+ * removed from the list.
+ */
+assert(!s->before_vm_start_handled || !b->migrated);
+if (bdrv_dirty_bitmap_has_successor(b->bitmap)) {
+bdrv_reclaim_dirty_bitmap(b->bitmap, _abort);
+}
+bdrv_release_dirty_bitmap(b->bitmap);
+}
+
+g_slist_free_full(s->bitmaps, g_free);
+s->bitmaps = NULL;
+}
+
 static void dirty_bitmap_load_complete(QEMUFile *f, DBMLoadState *s)
 {
 GSList *item;
 trace_dirty_bitmap_load_complete();
-bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
 
-qemu_mutex_lock(>lock);
+if (s->cancelled) {
+return;
+}
+
+bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
 
 if (bdrv_dirty_bitmap_has_successor(s->bitmap)) {
 bdrv_reclaim_dirty_bitmap(s->bitmap, _abort);
@@ -637,8 +684,6 @@ static void dirty_bitmap_load_complete(QEMUFile *f, 
DBMLoadState *s)
 break;
 }
 }
-
-qemu_mutex_unlock(>lock);
 }
 
 static int dirty_bitmap_load_bits(QEMUFile *f, DBMLoadState *s)
@@ -650,15 +695,46 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
 
 if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
 trace_dirty_bitmap_load_bits_zeroes();
-bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+if (!s->cancelled) {
+bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+}
 } else {
 size_t ret;
-uint8_t *buf;
+g_autofree uint8_t *buf = NULL;
 uint64_t buf_size = qemu_get_be64(f);
-uint64_t needed_size =
-bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+uint64_t needed_size;
+
+/*
+ * Actual check for buf_size is done a bit later. We can't do it in
+ * cancelled mode as we don't have the bitmap to check the constraints
+ * (so, we do allocate buffer and read prior to the check). On the 
other
+ * hand, we shouldn't blindly g_malloc the number from the stream.
+ * Actually one chunk should not be larger thatn CHUNK_SIZE. Let's 
allow
+ * a bit larger (which means that bitmap migration will fail anyway and
+ * the whole migration will most probably fail soon due to broken
+ * stream).
+ */
+if (buf_size > 10 * CHUNK_SIZE) {
+error_report("Bitmap migration stream requests too large buffer "
+ "size to allocate");
+return -EIO;
+}
+
+buf = g_malloc(buf_size);
+ret = qemu_get_buffer(f, buf, buf_size);
+if (ret != buf_size) {
+error_report("Failed to read bitmap bits");
+return -EIO;
+