Re: [PATCH v2 09/22] migration/block-dirty-bitmap: relax error handling in incoming part

2020-07-24 Thread Vladimir Sementsov-Ogievskiy

19.02.2020 18:34, Vladimir Sementsov-Ogievskiy wrote:

18.02.2020 21:54, Andrey Shinkevich wrote:



On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote:

Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  migration/block-dirty-bitmap.c | 148 +
  1 file changed, 113 insertions(+), 35 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 1329db8d7d..aea5326804 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -145,6 +145,15 @@ typedef struct DBMLoadState {
  bool before_vm_start_handled; /* set in dirty_bitmap_mig_before_vm_start 
*/
+    /*
+ * cancelled
+ * Incoming migration is cancelled for some reason. That means that we
+ * still should read our chunks from migration stream, to not affect other
+ * migration objects (like RAM), but just ignore them and do not touch any
+ * bitmaps or nodes.
+ */
+    bool cancelled;
+
  GSList *bitmaps;
  QemuMutex lock; /* protect bitmaps */
  } DBMLoadState;
@@ -545,13 +554,47 @@ void dirty_bitmap_mig_before_vm_start(void)
  qemu_mutex_unlock(>lock);
  }
+static void cancel_incoming_locked(DBMLoadState *s)
+{
+    GSList *item;
+
+    if (s->cancelled) {
+    return;
+    }
+
+    s->cancelled = true;
+    s->bs = NULL;
+    s->bitmap = NULL;
+
+    /* Drop all unfinished bitmaps */
+    for (item = s->bitmaps; item; item = g_slist_next(item)) {
+    LoadBitmapState *b = item->data;
+
+    /*
+ * Bitmap must be unfinished, as finished bitmaps should already be
+ * removed from the list.
+ */
+    assert(!s->before_vm_start_handled || !b->migrated);
+    if (bdrv_dirty_bitmap_has_successor(b->bitmap)) {
+    bdrv_reclaim_dirty_bitmap(b->bitmap, _abort);
+    }
+    bdrv_release_dirty_bitmap(b->bitmap);
+    }
+
+    g_slist_free_full(s->bitmaps, g_free);
+    s->bitmaps = NULL;
+}
+
  static void dirty_bitmap_load_complete(QEMUFile *f, DBMLoadState *s)
  {
  GSList *item;
  trace_dirty_bitmap_load_complete();
-    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
-    qemu_mutex_lock(>lock);


Why is it safe to remove the critical section?


It's not removed, it becomes wider in this patch.




+    if (s->cancelled) {
+    return;
+    }
+
+    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
  if (bdrv_dirty_bitmap_has_successor(s->bitmap)) {
  bdrv_reclaim_dirty_bitmap(s->bitmap, _abort);
@@ -569,8 +612,6 @@ static void dirty_bitmap_load_complete(QEMUFile *f, 
DBMLoadState *s)
  break;
  }
  }
-
-    qemu_mutex_unlock(>lock);
  }
  static int dirty_bitmap_load_bits(QEMUFile *f, DBMLoadState *s)
@@ -582,15 +623,32 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
  trace_dirty_bitmap_load_bits_zeroes();
-    bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+    if (!s->cancelled) {
+    bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+    }
  } else {
  size_t ret;
  uint8_t *buf;
  uint64_t buf_size = qemu_get_be64(f);
-    uint64_t needed_size =
-    bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+    uint64_t needed_size;
+
+    buf = g_malloc(buf_size);
+    ret = qemu_get_buffer(f, buf, buf_size);
+    if (ret != buf_size) {
+    error_report("Failed to read bitmap bits");
+    g_free(buf);
+    return -EIO;
+    }
+
+    if (s->cancelled) {
+    g_free(buf);
+    return 0;
+    }
+
+    needed_size = bdrv_dirty_bitmap_serialization_size(s->bitmap,
+   first_byte,
+   nr_bytes);
  if (needed_size > buf_size ||
  buf_size > QEMU_ALIGN_UP(needed_size, 4 * sizeof(long))
@@ -599,15 +657,8 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  error_report("Migrated bitmap granularity doesn't "
   "match the destination bitmap '%s' granularity",
   bdrv_dirty_bitmap_name(s->bitmap));
-    return -EINVAL;
-    }
-
-    buf = g_malloc(buf_size);
-    ret = qemu_get_buffer(f, buf, buf_size);
-    if (ret != buf_size) {
-    error_report("Failed to read bitmap 

Re: [PATCH v2 09/22] migration/block-dirty-bitmap: relax error handling in incoming part

2020-02-19 Thread Vladimir Sementsov-Ogievskiy

18.02.2020 21:54, Andrey Shinkevich wrote:



On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote:

Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  migration/block-dirty-bitmap.c | 148 +
  1 file changed, 113 insertions(+), 35 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 1329db8d7d..aea5326804 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -145,6 +145,15 @@ typedef struct DBMLoadState {
  bool before_vm_start_handled; /* set in dirty_bitmap_mig_before_vm_start 
*/
+    /*
+ * cancelled
+ * Incoming migration is cancelled for some reason. That means that we
+ * still should read our chunks from migration stream, to not affect other
+ * migration objects (like RAM), but just ignore them and do not touch any
+ * bitmaps or nodes.
+ */
+    bool cancelled;
+
  GSList *bitmaps;
  QemuMutex lock; /* protect bitmaps */
  } DBMLoadState;
@@ -545,13 +554,47 @@ void dirty_bitmap_mig_before_vm_start(void)
  qemu_mutex_unlock(>lock);
  }
+static void cancel_incoming_locked(DBMLoadState *s)
+{
+    GSList *item;
+
+    if (s->cancelled) {
+    return;
+    }
+
+    s->cancelled = true;
+    s->bs = NULL;
+    s->bitmap = NULL;
+
+    /* Drop all unfinished bitmaps */
+    for (item = s->bitmaps; item; item = g_slist_next(item)) {
+    LoadBitmapState *b = item->data;
+
+    /*
+ * Bitmap must be unfinished, as finished bitmaps should already be
+ * removed from the list.
+ */
+    assert(!s->before_vm_start_handled || !b->migrated);
+    if (bdrv_dirty_bitmap_has_successor(b->bitmap)) {
+    bdrv_reclaim_dirty_bitmap(b->bitmap, _abort);
+    }
+    bdrv_release_dirty_bitmap(b->bitmap);
+    }
+
+    g_slist_free_full(s->bitmaps, g_free);
+    s->bitmaps = NULL;
+}
+
  static void dirty_bitmap_load_complete(QEMUFile *f, DBMLoadState *s)
  {
  GSList *item;
  trace_dirty_bitmap_load_complete();
-    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
-    qemu_mutex_lock(>lock);


Why is it safe to remove the critical section?


It's not removed, it becomes wider in this patch.




+    if (s->cancelled) {
+    return;
+    }
+
+    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
  if (bdrv_dirty_bitmap_has_successor(s->bitmap)) {
  bdrv_reclaim_dirty_bitmap(s->bitmap, _abort);
@@ -569,8 +612,6 @@ static void dirty_bitmap_load_complete(QEMUFile *f, 
DBMLoadState *s)
  break;
  }
  }
-
-    qemu_mutex_unlock(>lock);
  }
  static int dirty_bitmap_load_bits(QEMUFile *f, DBMLoadState *s)
@@ -582,15 +623,32 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
  trace_dirty_bitmap_load_bits_zeroes();
-    bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+    if (!s->cancelled) {
+    bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+    }
  } else {
  size_t ret;
  uint8_t *buf;
  uint64_t buf_size = qemu_get_be64(f);
-    uint64_t needed_size =
-    bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+    uint64_t needed_size;
+
+    buf = g_malloc(buf_size);
+    ret = qemu_get_buffer(f, buf, buf_size);
+    if (ret != buf_size) {
+    error_report("Failed to read bitmap bits");
+    g_free(buf);
+    return -EIO;
+    }
+
+    if (s->cancelled) {
+    g_free(buf);
+    return 0;
+    }
+
+    needed_size = bdrv_dirty_bitmap_serialization_size(s->bitmap,
+   first_byte,
+   nr_bytes);
  if (needed_size > buf_size ||
  buf_size > QEMU_ALIGN_UP(needed_size, 4 * sizeof(long))
@@ -599,15 +657,8 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  error_report("Migrated bitmap granularity doesn't "
   "match the destination bitmap '%s' granularity",
   bdrv_dirty_bitmap_name(s->bitmap));
-    return -EINVAL;
-    }
-
-    buf = g_malloc(buf_size);
-    ret = qemu_get_buffer(f, buf, buf_size);
-    if (ret != buf_size) {
-    error_report("Failed to read bitmap bits");
-    g_free(buf);
-    return -EIO;
+ 

Re: [PATCH v2 09/22] migration/block-dirty-bitmap: relax error handling in incoming part

2020-02-18 Thread Andrey Shinkevich




On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote:

Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  migration/block-dirty-bitmap.c | 148 +
  1 file changed, 113 insertions(+), 35 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 1329db8d7d..aea5326804 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -145,6 +145,15 @@ typedef struct DBMLoadState {
  
  bool before_vm_start_handled; /* set in dirty_bitmap_mig_before_vm_start */
  
+/*

+ * cancelled
+ * Incoming migration is cancelled for some reason. That means that we
+ * still should read our chunks from migration stream, to not affect other
+ * migration objects (like RAM), but just ignore them and do not touch any
+ * bitmaps or nodes.
+ */
+bool cancelled;
+
  GSList *bitmaps;
  QemuMutex lock; /* protect bitmaps */
  } DBMLoadState;
@@ -545,13 +554,47 @@ void dirty_bitmap_mig_before_vm_start(void)
  qemu_mutex_unlock(>lock);
  }
  
+static void cancel_incoming_locked(DBMLoadState *s)

+{
+GSList *item;
+
+if (s->cancelled) {
+return;
+}
+
+s->cancelled = true;
+s->bs = NULL;
+s->bitmap = NULL;
+
+/* Drop all unfinished bitmaps */
+for (item = s->bitmaps; item; item = g_slist_next(item)) {
+LoadBitmapState *b = item->data;
+
+/*
+ * Bitmap must be unfinished, as finished bitmaps should already be
+ * removed from the list.
+ */
+assert(!s->before_vm_start_handled || !b->migrated);
+if (bdrv_dirty_bitmap_has_successor(b->bitmap)) {
+bdrv_reclaim_dirty_bitmap(b->bitmap, _abort);
+}
+bdrv_release_dirty_bitmap(b->bitmap);
+}
+
+g_slist_free_full(s->bitmaps, g_free);
+s->bitmaps = NULL;
+}
+
  static void dirty_bitmap_load_complete(QEMUFile *f, DBMLoadState *s)
  {
  GSList *item;
  trace_dirty_bitmap_load_complete();
-bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
  
-qemu_mutex_lock(>lock);


Why is it safe to remove the critical section?


+if (s->cancelled) {
+return;
+}
+
+bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
  
  if (bdrv_dirty_bitmap_has_successor(s->bitmap)) {

  bdrv_reclaim_dirty_bitmap(s->bitmap, _abort);
@@ -569,8 +612,6 @@ static void dirty_bitmap_load_complete(QEMUFile *f, 
DBMLoadState *s)
  break;
  }
  }
-
-qemu_mutex_unlock(>lock);
  }
  
  static int dirty_bitmap_load_bits(QEMUFile *f, DBMLoadState *s)

@@ -582,15 +623,32 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  
  if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {

  trace_dirty_bitmap_load_bits_zeroes();
-bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+if (!s->cancelled) {
+bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+}
  } else {
  size_t ret;
  uint8_t *buf;
  uint64_t buf_size = qemu_get_be64(f);
-uint64_t needed_size =
-bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+uint64_t needed_size;
+
+buf = g_malloc(buf_size);
+ret = qemu_get_buffer(f, buf, buf_size);
+if (ret != buf_size) {
+error_report("Failed to read bitmap bits");
+g_free(buf);
+return -EIO;
+}
+
+if (s->cancelled) {
+g_free(buf);
+return 0;
+}
+
+needed_size = bdrv_dirty_bitmap_serialization_size(s->bitmap,
+   first_byte,
+   nr_bytes);
  
  if (needed_size > buf_size ||

  buf_size > QEMU_ALIGN_UP(needed_size, 4 * sizeof(long))
@@ -599,15 +657,8 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
  error_report("Migrated bitmap granularity doesn't "
   "match the destination bitmap '%s' granularity",
   bdrv_dirty_bitmap_name(s->bitmap));
-return -EINVAL;
-}
-
-buf = g_malloc(buf_size);
-ret = qemu_get_buffer(f, buf, buf_size);
-if (ret != buf_size) {
-error_report("Failed to read bitmap bits");
-g_free(buf);
-return -EIO;
+cancel_incoming_locked(s);


   /* Continue 

[PATCH v2 09/22] migration/block-dirty-bitmap: relax error handling in incoming part

2020-02-17 Thread Vladimir Sementsov-Ogievskiy
Bitmaps data is not critical, and we should not fail the migration (or
use postcopy recovering) because of dirty-bitmaps migration failure.
Instead we should just lose unfinished bitmaps.

Still we have to report io stream violation errors, as they affect the
whole migration stream.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 migration/block-dirty-bitmap.c | 148 +
 1 file changed, 113 insertions(+), 35 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 1329db8d7d..aea5326804 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -145,6 +145,15 @@ typedef struct DBMLoadState {
 
 bool before_vm_start_handled; /* set in dirty_bitmap_mig_before_vm_start */
 
+/*
+ * cancelled
+ * Incoming migration is cancelled for some reason. That means that we
+ * still should read our chunks from migration stream, to not affect other
+ * migration objects (like RAM), but just ignore them and do not touch any
+ * bitmaps or nodes.
+ */
+bool cancelled;
+
 GSList *bitmaps;
 QemuMutex lock; /* protect bitmaps */
 } DBMLoadState;
@@ -545,13 +554,47 @@ void dirty_bitmap_mig_before_vm_start(void)
 qemu_mutex_unlock(>lock);
 }
 
+static void cancel_incoming_locked(DBMLoadState *s)
+{
+GSList *item;
+
+if (s->cancelled) {
+return;
+}
+
+s->cancelled = true;
+s->bs = NULL;
+s->bitmap = NULL;
+
+/* Drop all unfinished bitmaps */
+for (item = s->bitmaps; item; item = g_slist_next(item)) {
+LoadBitmapState *b = item->data;
+
+/*
+ * Bitmap must be unfinished, as finished bitmaps should already be
+ * removed from the list.
+ */
+assert(!s->before_vm_start_handled || !b->migrated);
+if (bdrv_dirty_bitmap_has_successor(b->bitmap)) {
+bdrv_reclaim_dirty_bitmap(b->bitmap, _abort);
+}
+bdrv_release_dirty_bitmap(b->bitmap);
+}
+
+g_slist_free_full(s->bitmaps, g_free);
+s->bitmaps = NULL;
+}
+
 static void dirty_bitmap_load_complete(QEMUFile *f, DBMLoadState *s)
 {
 GSList *item;
 trace_dirty_bitmap_load_complete();
-bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
 
-qemu_mutex_lock(>lock);
+if (s->cancelled) {
+return;
+}
+
+bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
 
 if (bdrv_dirty_bitmap_has_successor(s->bitmap)) {
 bdrv_reclaim_dirty_bitmap(s->bitmap, _abort);
@@ -569,8 +612,6 @@ static void dirty_bitmap_load_complete(QEMUFile *f, 
DBMLoadState *s)
 break;
 }
 }
-
-qemu_mutex_unlock(>lock);
 }
 
 static int dirty_bitmap_load_bits(QEMUFile *f, DBMLoadState *s)
@@ -582,15 +623,32 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
 
 if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
 trace_dirty_bitmap_load_bits_zeroes();
-bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte, nr_bytes,
- false);
+if (!s->cancelled) {
+bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_byte,
+ nr_bytes, false);
+}
 } else {
 size_t ret;
 uint8_t *buf;
 uint64_t buf_size = qemu_get_be64(f);
-uint64_t needed_size =
-bdrv_dirty_bitmap_serialization_size(s->bitmap,
- first_byte, nr_bytes);
+uint64_t needed_size;
+
+buf = g_malloc(buf_size);
+ret = qemu_get_buffer(f, buf, buf_size);
+if (ret != buf_size) {
+error_report("Failed to read bitmap bits");
+g_free(buf);
+return -EIO;
+}
+
+if (s->cancelled) {
+g_free(buf);
+return 0;
+}
+
+needed_size = bdrv_dirty_bitmap_serialization_size(s->bitmap,
+   first_byte,
+   nr_bytes);
 
 if (needed_size > buf_size ||
 buf_size > QEMU_ALIGN_UP(needed_size, 4 * sizeof(long))
@@ -599,15 +657,8 @@ static int dirty_bitmap_load_bits(QEMUFile *f, 
DBMLoadState *s)
 error_report("Migrated bitmap granularity doesn't "
  "match the destination bitmap '%s' granularity",
  bdrv_dirty_bitmap_name(s->bitmap));
-return -EINVAL;
-}
-
-buf = g_malloc(buf_size);
-ret = qemu_get_buffer(f, buf, buf_size);
-if (ret != buf_size) {
-error_report("Failed to read bitmap bits");
-g_free(buf);
-return -EIO;
+cancel_incoming_locked(s);
+return 0;
 }
 
 bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf, first_byte, 
nr_bytes,
@@ -632,14 +683,16 @@ static int dirty_bitmap_load_header(QEMUFile