When using libvirt for RDMA live migration, if the VM's memory is
large, deregistering it on the source side takes a long time,
resulting in a long downtime (for a 64G VM, deregistration takes
about 400ms).
    
Although the VM's memory is backed by 2M huge pages, the MLNX driver
still pins and unpins the memory in 4K pages. So, for RAM blocks
backed by huge pages, register the memory region with on-demand
paging (IBV_ACCESS_ON_DEMAND | IBV_ACCESS_HUGETLB) so that the pin
and unpin steps are skipped, reducing downtime.
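
For reference (not part of this patch): IBV_ACCESS_ON_DEMAND is only
usable when the device advertises on-demand paging support, so a
sanity check along the following lines could be considered. This is a
minimal sketch against the standard libibverbs API; the helper name
rdma_dev_supports_odp is made up for the example.

    #include <stdbool.h>
    #include <infiniband/verbs.h>

    /* Hypothetical helper: true if the device advertises ODP support. */
    static bool rdma_dev_supports_odp(struct ibv_context *ctx)
    {
        struct ibv_device_attr_ex attr = {};

        if (ibv_query_device_ex(ctx, NULL, &attr)) {
            return false;   /* query failed: assume no ODP support */
        }

        /* The general ODP capability bit lives in odp_caps.general_caps. */
        return attr.odp_caps.general_caps & IBV_ODP_SUPPORT;
    }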
    
---
v2
- Add page_size to struct RDMALocalBlock
- Use page_size to determine whether the VM's memory uses huge pages
---
    
Signed-off-by: lizhaoxin <lizhaox...@kingsoft.com>

diff --git a/migration/rdma.c b/migration/rdma.c
index 1cdb4561f3..703816ebc7 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -215,6 +215,7 @@ typedef struct RDMALocalBlock {
     uint64_t       remote_host_addr; /* remote virtual address */
     uint64_t       offset;
     uint64_t       length;
+    uint64_t       page_size;
     struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
     struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
     uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
@@ -565,7 +566,8 @@ static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
 
 static int rdma_add_block(RDMAContext *rdma, const char *block_name,
                          void *host_addr,
-                         ram_addr_t block_offset, uint64_t length)
+                         ram_addr_t block_offset, uint64_t length,
+                         uint64_t page_size)
 {
     RDMALocalBlocks *local = &rdma->local_ram_blocks;
     RDMALocalBlock *block;
@@ -595,6 +597,7 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name,
     block->local_host_addr = host_addr;
     block->offset = block_offset;
     block->length = length;
+    block->page_size = page_size;
     block->index = local->nb_blocks;
     block->src_index = ~0U; /* Filled in by the receipt of the block list */
     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
@@ -634,7 +637,8 @@ static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
     void *host_addr = qemu_ram_get_host_addr(rb);
     ram_addr_t block_offset = qemu_ram_get_offset(rb);
     ram_addr_t length = qemu_ram_get_used_length(rb);
-    return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
+    ram_addr_t page_size = qemu_ram_pagesize(rb);
+    return rdma_add_block(opaque, block_name, host_addr, block_offset, length, page_size);
 }
 
 /*
@@ -1123,13 +1127,25 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 
     for (i = 0; i < local->nb_blocks; i++) {
-        local->block[i].mr =
-            ibv_reg_mr(rdma->pd,
-                    local->block[i].local_host_addr,
-                    local->block[i].length,
-                    IBV_ACCESS_LOCAL_WRITE |
-                    IBV_ACCESS_REMOTE_WRITE
-                    );
+        if (local->block[i].page_size != qemu_real_host_page_size) {
+            local->block[i].mr =
+                ibv_reg_mr(rdma->pd,
+                        local->block[i].local_host_addr,
+                        local->block[i].length,
+                        IBV_ACCESS_LOCAL_WRITE |
+                        IBV_ACCESS_REMOTE_WRITE |
+                        IBV_ACCESS_ON_DEMAND |
+                        IBV_ACCESS_HUGETLB
+                        );
+        } else {
+            local->block[i].mr =
+                ibv_reg_mr(rdma->pd,
+                        local->block[i].local_host_addr,
+                        local->block[i].length,
+                        IBV_ACCESS_LOCAL_WRITE |
+                        IBV_ACCESS_REMOTE_WRITE
+                        );
+        }
         if (!local->block[i].mr) {
             perror("Failed to register local dest ram block!\n");
             break;
