On Mon, Feb 11, 2013 at 05:49:53PM -0500, Michael R. Hines wrote: > From: "Michael R. Hines" <mrhi...@us.ibm.com> > > > Signed-off-by: Michael R. Hines <mrhi...@us.ibm.com> > --- > include/qemu/rdma.h | 281 ++++++++++ > migration-rdma.c | 1444 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 1725 insertions(+) > create mode 100644 include/qemu/rdma.h > create mode 100644 migration-rdma.c
Could you document the protocol used, including sync using send, etc.? A good place to put it would be docs/

> diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h > new file mode 100644 > index 0000000..2dc2519 > --- /dev/null > +++ b/include/qemu/rdma.h > @@ -0,0 +1,281 @@ > +/* > + * Copyright (C) 2013 Michael R. Hines <mrhi...@us.ibm.com> > + * Copyright (C) 2013 Jiuxing Liu <j...@us.ibm.com> > + * > + * RDMA data structures and helper functions (for migration) > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; under version 2 of the License. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#ifndef _RDMA_H > +#define _RDMA_H > + > +#include "config-host.h" > +#ifdef CONFIG_RDMA > +#include <rdma/rdma_cma.h> > +#endif > +#include "monitor/monitor.h" > + > +extern int rdmaport; > +extern char rdmahost[64]; > + > +struct rdma_context { > + /* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in > + cm_id->verbs, cm_id->channel, and cm_id->qp. */ > + struct rdma_cm_id *cm_id; > + struct rdma_cm_id *listen_id; > + > + struct ibv_context *verbs; > + struct rdma_event_channel *channel; > + struct ibv_qp *qp; > + > + struct ibv_comp_channel *comp_channel; > + struct ibv_pd *pd; > + struct ibv_cq *cq; > +}; > + > +static inline void rdma_init_context(struct rdma_context *rdma_ctx) > +{ > + rdma_ctx->cm_id = NULL; > + rdma_ctx->listen_id = NULL; > + rdma_ctx->verbs = NULL; > + rdma_ctx->channel = NULL; > + rdma_ctx->qp = NULL; > + rdma_ctx->comp_channel = NULL; > + rdma_ctx->pd = NULL; > + rdma_ctx->cq = NULL; > +} > + > +void cpu_physical_memory_reset_dirty_all(void); > + > +int rdma_resolve_host(struct rdma_context *rdma_ctx, > + const char *host, int port); > +int rdma_alloc_pd_cq(struct rdma_context *rdma_ctx); > +int rdma_alloc_qp(struct rdma_context *rdma_ctx); > +int rdma_migrate_connect(struct rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len); > +int rdma_migrate_accept(struct rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len); > +void rdma_migrate_disconnect(struct rdma_context *rdma_ctx); > + > +/* Instead of registering whole ram blocks, we can register them in smaller > + * chunks.
This may be beneficial if the ram blocks have holes in them */ > +#define RDMA_CHUNK_REGISTRATION > + > +#define RDMA_LAZY_REGISTRATION > + > +#define RDMA_REG_CHUNK_SHIFT 20 > +#define RDMA_REG_CHUNK_SIZE (1UL << (RDMA_REG_CHUNK_SHIFT)) > +#define RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \ > + (((unsigned long)(host_addr) >> RDMA_REG_CHUNK_SHIFT) - \ > + ((unsigned long)(start_addr) >> RDMA_REG_CHUNK_SHIFT)) > +#define RDMA_REG_NUM_CHUNKS(rdma_ram_block) \ > + (RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\ > + (rdma_ram_block)->local_host_addr +\ > + (rdma_ram_block)->length) + 1) > +#define RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\ > + ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \ > + RDMA_REG_CHUNK_SHIFT) + (i)) << \ > + RDMA_REG_CHUNK_SHIFT)) > +#define RDMA_REG_CHUNK_END(rdma_ram_block, i) \ > + (RDMA_REG_CHUNK_START(rdma_ram_block, i) + \ > + RDMA_REG_CHUNK_SIZE) > + > +struct rdma_ram_block { > + uint8_t *local_host_addr; > + uint64_t remote_host_addr; > + uint64_t offset; > + uint64_t length; > + struct ibv_mr **pmr; > + struct ibv_mr *mr; > + uint32_t remote_rkey; > +}; > + > +struct rdma_remote_ram_block { > + uint64_t remote_host_addr; > + uint64_t offset; > + uint64_t length; > + uint32_t remote_rkey; > +}; > + > +#define RDMA_MAX_RAM_BLOCKS 64 > + > +struct rdma_ram_blocks { > + int num_blocks; > + struct rdma_ram_block block[RDMA_MAX_RAM_BLOCKS]; > +}; > + > +struct rdma_remote_ram_blocks { > + int num_blocks; > + struct rdma_remote_ram_block block[RDMA_MAX_RAM_BLOCKS]; > +}; > + > +int rdma_init_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks); > +int rdma_reg_chunk_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks); > +int rdma_reg_whole_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks); > +int rdma_server_reg_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks); > +int rdma_client_reg_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks); > +void rdma_dereg_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks); > + > +void rdma_copy_to_remote_ram_blocks(struct rdma_ram_blocks *local, > + struct rdma_remote_ram_blocks *remote); > +int rdma_process_remote_ram_blocks(struct rdma_ram_blocks *local, > + struct rdma_remote_ram_blocks *remote); > + > +int rdma_search_ram_block(uint64_t offset, uint64_t length, > + struct rdma_ram_blocks *blocks, > + int *block_index, int *chunk_index); > + > +struct rdma_data { > + char *host; > + int port; > + int enabled; > + int gidx; > + union ibv_gid gid; > + > + struct rdma_context rdma_ctx; > + struct rdma_ram_blocks rdma_ram_blocks; > + > + /* This is used for synchronization: We use > + IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs > + are done. When the receiver gets it, it can be certain > + that all the RDMAs are completed. */ > + int sync; > + struct ibv_mr *sync_mr; > + > + /* This is used for the server to write the remote > + ram blocks info. */ > + struct rdma_remote_ram_blocks remote_info; > + struct ibv_mr *remote_info_mr; > + > + /* The rest is only for the initiator of the migration.
*/ > + int client_init_done; > + > + /* number of outstanding unsignaled send */ > + int num_unsignaled_send; > + > + /* number of outstanding signaled send */ > + int num_signaled_send; > + > + /* store info about current buffer so that we can > + merge it with future sends */ > + uint64_t current_offset; > + uint64_t current_length; > + /* index of ram block the current buffer belongs to */ > + int current_index; > + /* index of the chunk in the current ram block */ > + int current_chunk; > + > + uint64_t total_bytes; > + > +}; > + > +extern struct rdma_data rdma_mdata; > + > +#define rdma_update_capability(state) \ > + state->enabled_capabilities[MIGRATION_CAPABILITY_RDMA] = \ > + migration_use_rdma() ? true : false > + > +static inline int migration_use_rdma(void) > +{ > + /* port will be non-zero if user wants to use RDMA. */ > + return rdma_mdata.port != -1 && rdma_mdata.host; > +} > + > +static inline int migrate_rdma_enabled(void) > +{ > + return rdma_mdata.enabled; > +} > + > +void rdma_disable(void); > + > +#define RDMA_BLOCKING > +#define RDMA_EXTRA_SYNC > + > +enum { > + RDMA_WRID_NONE = 0, > + RDMA_WRID_RDMA, > + RDMA_WRID_SEND_SYNC, > + RDMA_WRID_RECV_SYNC, > + RDMA_WRID_SEND_REMOTE_INFO, > + RDMA_WRID_RECV_REMOTE_INFO, > + RDMA_WRID_SEND_EXTRA_SYNC, > + RDMA_WRID_RECV_EXTRA_SYNC, > +}; > + > +int rdma_migrate_listen(struct rdma_data *mdata, char *host, > + int port); > +int rdma_reg_sync(struct rdma_data *mdata); > +int rdma_dereg_sync(struct rdma_data *mdata); > +int rdma_post_recv_sync(struct rdma_data *mdata, > + int wr_id); > + > +int rdma_reg_remote_info( > + struct rdma_data *mdata); > +int rdma_dereg_remote_info( > + struct rdma_data *mdata); > +int rdma_post_send_remote_info( > + struct rdma_data *mdata); > +int rdma_post_recv_remote_info( > + struct rdma_data *mdata); > + > +int rdma_poll_for_wrid( > + struct rdma_data *mdata, > + int wrid); > +int rdma_block_for_wrid( > + struct rdma_data *mdata, > + int wrid); > + > + > +#ifdef CONFIG_RDMA > +int rdma_wait_for_connect(int fd, void * opaque); > +int rdma_start_incoming_migration(int s); > +void rdma_cleanup(struct rdma_data *mdata); > +int rdma_client_init(struct rdma_data *mdata); > +int rdma_client_connect(struct rdma_data *mdata); > +void rdma_data_init(struct rdma_data *mdata); > +int rdma_server_init(struct rdma_data *mdata); > +int rdma_server_prepare(struct rdma_data *mdata); > +int rdma_accept_incoming_migration( > + struct rdma_data *mdata); > +int rdma_write(struct rdma_data *mdata, > + uint64_t addr, uint64_t len); > +int rdma_write_flush(struct rdma_data *mdata); > +int rdma_poll(struct rdma_data *mdata); > +int rdma_post_send_sync(struct rdma_data *mdata, > + int wr_id); > +int rdma_wait_for_wrid( > + struct rdma_data *mdata, > + int wrid); > +#else > +#define rdma_cleanup(...) do { printf("WARN: rdma not enabled\n"); } while(0) > +#define rdma_data_init(...) do { printf("WARN: rdma not enabled\n"); } > while(0) > +#define rdma_wait_for_connect(...) 0 > +#define rdma_start_incoming_migration(...) 0 > +#define rdma_client_init(...) 0 > +#define rdma_client_connect(...) 0 > +#define rdma_server_init(...) 0 > +#define rdma_server_prepare(...) 0 > +#define rdma_accept_incoming_migration(...) 0 > +#define rdma_write(...) 0 > +#define rdma_write_flush(...) 0 > +#define rdma_poll(...) 0 > +#define rdma_post_send_sync(...) 0 > +#define rdma_wait_for_wrid(...) 
0 > +#endif > + > +#endif > diff --git a/migration-rdma.c b/migration-rdma.c > new file mode 100644 > index 0000000..a64f350 > --- /dev/null > +++ b/migration-rdma.c > @@ -0,0 +1,1444 @@ > +/* > + * Copyright (C) 2013 Michael R. Hines <mrhi...@us.ibm.com> > + * Copyright (C) 2013 Jiuxing Liu <j...@us.ibm.com> > + * > + * RDMA data structures and helper functions (for migration) > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; under version 2 of the License. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > +#include "qemu/rdma.h" > +#include "qemu-common.h" > +#include "migration/migration.h" > +#include <stdio.h> > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <netdb.h> > +#include <arpa/inet.h> > +#include <string.h> > + > +#define RDMA_RESOLVE_TIMEOUT_MS 10000 > +#define RDMA_CQ_SIZE 2000 > +#define RDMA_QP_SIZE 1000 > + > +//#define DEBUG_MIGRATION_RDMA > + > +#ifdef DEBUG_MIGRATION_RDMA > +#define DPRINTF(fmt, ...) \ > + do { printf("migration-rdma: " fmt, ## __VA_ARGS__); } while (0) > +#else > +#define DPRINTF(fmt, ...) \ > + do { } while (0) > +#endif > + > +static void *rdma_mallocz(size_t size) > +{ > + void *ptr; > + ptr = malloc(size); > + memset(ptr, 0, size); > + return ptr; > +} > + > +static void rdma_dump_id(const char * who, struct ibv_context * verbs) > +{ > + printf("%s RDMA verbs Device opened: kernel name %s uverbs device name > %s, infiniband_verbs class device path %s, infiniband class device path > %s\n", who, verbs->device->name, verbs->device->dev_name, > verbs->device->dev_path, verbs->device->ibdev_path); > +} > + > +static void rdma_dump_gid(const char * who, struct rdma_cm_id * id) > +{ > + char sgid[33]; > + char dgid[33]; > + inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid); > + inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid); > + printf("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid); > +} > + > +int rdma_resolve_host(struct rdma_context *rdma_ctx, > + const char *host, int port) > +{ > + int ret; > + struct addrinfo *res, *src; > + char port_str[16]; > + struct rdma_cm_event *cm_event; > + const char * srchost = "r9"; > + > + > + if (host == NULL || !strcmp(host, "")) { > + printf("RDMA hostname has not been set\n"); > + return -1; > + } > + > + /* create CM channel */ > + rdma_ctx->channel = rdma_create_event_channel(); > + if (!rdma_ctx->channel) { > + printf("could not create CM channel\n"); > + return -1; > + } > + > + /* create CM id */ > + ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL, > + RDMA_PS_TCP); > + if (ret) { > + printf("could not create channel id\n"); > + goto err_resolve_create_id; > + } > + > + snprintf(port_str, 16, "%d", port); > + port_str[15] = '\0'; > + > + ret = getaddrinfo(host, port_str, NULL, &res); > + if (ret < 0) { > + printf("could not getaddrinfo destination address %s\n", host); > + goto err_resolve_get_addr; > + } > + > + ret = getaddrinfo(srchost, port_str, NULL, &src); > + if (ret < 0) { > + printf("could not getaddrinfo 
source address %s\n", srchost); > + goto err_resolve_get_addr; > + } > + > + /* There may be multiple RDMA devices. Choose the right one. > + * (We may need to select specific ports in the future, too.) > + */ > + ret = rdma_bind_addr(rdma_ctx->cm_id, src->ai_addr); > + if(ret < 0) { > + printf("could not bind source client to local rdma device with src > %s\n", srchost); > + goto err_resolve_get_addr; > + } > + > + rdma_dump_id("client_bind", rdma_ctx->cm_id->verbs); > + rdma_dump_gid("client_bind", rdma_ctx->cm_id); > + > + /* resolve the first address */ > + ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr, > + RDMA_RESOLVE_TIMEOUT_MS); > + if (ret) { > + printf("could not resolve address %s\n", host); > + goto err_resolve_get_addr; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + printf("could not perform event_addr_resolved\n"); > + goto err_resolve_get_addr; > + } > + > + if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { > + printf("result not equal to event_addr_resolved %s\n", > rdma_event_str(cm_event->event)); > + perror("rdma_resolve_addr"); > + rdma_ack_cm_event(cm_event); > + goto err_resolve_get_addr; > + } > + rdma_ack_cm_event(cm_event); > + rdma_dump_gid("client_resolved", rdma_ctx->cm_id); > + > + /* resolve route */ > + ret = rdma_resolve_route(rdma_ctx->cm_id, RDMA_RESOLVE_TIMEOUT_MS); > + if (ret) { > + printf("could not resolve rdma route\n"); > + goto err_resolve_get_addr; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + printf("could not perform event_route_resolved\n"); > + goto err_resolve_get_addr; > + } > + if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { > + printf("result not equal to event_route_resolved: %s\n", > rdma_event_str(cm_event->event)); > + rdma_ack_cm_event(cm_event); > + goto err_resolve_get_addr; > + } > + rdma_ack_cm_event(cm_event); > + rdma_ctx->verbs = rdma_ctx->cm_id->verbs; > + return 0; > + > +err_resolve_get_addr: > + rdma_destroy_id(rdma_ctx->cm_id); > +err_resolve_create_id: > + rdma_destroy_event_channel(rdma_ctx->channel); > + rdma_ctx->channel = NULL; > + > + return -1; > +} > + > +int rdma_alloc_pd_cq(struct rdma_context *rdma_ctx) > +{ > + > + /* allocate pd */ > + rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs); > + if (!rdma_ctx->pd) { > + return -1; > + } > + > +#ifdef RDMA_BLOCKING > + /* create completion channel */ > + rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs); > + if (!rdma_ctx->comp_channel) { > + goto err_alloc_pd_cq; > + } > +#endif > + > + /* create cq */ > + rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, RDMA_CQ_SIZE, > + NULL, rdma_ctx->comp_channel, 0); > + if (!rdma_ctx->cq) { > + goto err_alloc_pd_cq; > + } > + > + return 0; > + > +err_alloc_pd_cq: > + if (rdma_ctx->pd) { > + ibv_dealloc_pd(rdma_ctx->pd); > + } > + if (rdma_ctx->comp_channel) { > + ibv_destroy_comp_channel(rdma_ctx->comp_channel); > + } > + rdma_ctx->pd = NULL; > + rdma_ctx->comp_channel = NULL; > + return -1; > + > +} > + > +int rdma_alloc_qp(struct rdma_context *rdma_ctx) > +{ > + struct ibv_qp_init_attr attr = { 0 }; > + int ret; > + > + attr.cap.max_send_wr = RDMA_QP_SIZE; > + attr.cap.max_recv_wr = 2; > + attr.cap.max_send_sge = 1; > + attr.cap.max_recv_sge = 1; > + attr.send_cq = rdma_ctx->cq; > + attr.recv_cq = rdma_ctx->cq; > + attr.qp_type = IBV_QPT_RC; > + > + ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr); > + if (ret) { > + return -1; > + } > + > + rdma_ctx->qp = rdma_ctx->cm_id->qp; > + return 0; > +} > + > 
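
Two small things in rdma_resolve_host() above: the source address is hard-coded (srchost = "r9"), which looks like a debugging leftover - presumably the bind address should be user-configurable, like the destination address is. Also, the two getaddrinfo() results are never freed. Something along these lines (untested sketch) once the route has been resolved would plug the leak:

    /* the addrinfo lists are only needed for address/route resolution */
    freeaddrinfo(res);
    freeaddrinfo(src);
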
+int rdma_wait_for_connect(int fd, void * opaque) > +{ > + if (rdma_client_init(&rdma_mdata)) { > + return -1; > + } > + > + if (rdma_client_connect(&rdma_mdata)) { > + return -1; > + } > + > + return 0; > +} > +int rdma_migrate_connect(struct rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len) > +{ > + int ret; > + struct rdma_conn_param conn_param = { 0 }; > + struct rdma_cm_event *cm_event; > + > + conn_param.initiator_depth = 2; > + conn_param.retry_count = 5; > + conn_param.private_data = out_data; > + conn_param.private_data_len = out_len; > + > + ret = rdma_connect(rdma_ctx->cm_id, &conn_param); > + if (ret) { > + return -1; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return -1; > + } > + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { > + return -1; > + } > + > + if (in_len) { > + if (*in_len > cm_event->param.conn.private_data_len) { > + *in_len = cm_event->param.conn.private_data_len; > + } > + if (*in_len) { > + memcpy(in_data, cm_event->param.conn.private_data, *in_len); > + } > + } > + > + rdma_ack_cm_event(cm_event); > + > + return 0; > +} > + > +int rdma_migrate_listen(struct rdma_data *mdata, char *host, > + int port) > +{ > + int ret; > + struct rdma_cm_event *cm_event; > + struct rdma_context *rdma_ctx = &mdata->rdma_ctx; > + struct ibv_context *verbs; > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + goto err_listen; > + } > + > + if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { > + rdma_ack_cm_event(cm_event); > + goto err_listen; > + } > + > + rdma_ctx->cm_id = cm_event->id; > + verbs = cm_event->id->verbs; > + printf("verbs context after listen: %p\n", verbs); > + rdma_ack_cm_event(cm_event); > + > + if (!rdma_ctx->verbs) { > + rdma_ctx->verbs = verbs; > + ret = rdma_server_prepare(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error preparing server!\n"); > + goto err_listen; > + } > + } else if (rdma_ctx->verbs != verbs) { > + fprintf(stderr, "ibv context not matching %p, %p!\n", > + rdma_ctx->verbs, verbs); > + goto err_listen; > + } > + /* xxx destroy listen_id ??? 
*/ > + > + return 0; > + > +err_listen: > + > + return -1; > + > +} > + > +int rdma_migrate_accept(struct rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len) > +{ > + int ret; > + struct rdma_conn_param conn_param = { 0 }; > + struct rdma_cm_event *cm_event; > + > + conn_param.responder_resources = 2; > + conn_param.private_data = out_data; > + conn_param.private_data_len = out_len; > + > + ret = rdma_accept(rdma_ctx->cm_id, &conn_param); > + if (ret) { > + fprintf(stderr, "rdma_accept returns %d!\n", ret); > + return -1; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return -1; > + } > + > + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { > + rdma_ack_cm_event(cm_event); > + return -1; > + } > + > + if (in_len) { > + if (*in_len > cm_event->param.conn.private_data_len) { > + *in_len = cm_event->param.conn.private_data_len; > + } > + if (*in_len) { > + memcpy(in_data, cm_event->param.conn.private_data, *in_len); > + } > + } > + > + rdma_ack_cm_event(cm_event); > + > + return 0; > +} > + > +void rdma_migrate_disconnect(struct rdma_context *rdma_ctx) > +{ > + int ret; > + struct rdma_cm_event *cm_event; > + > + ret = rdma_disconnect(rdma_ctx->cm_id); > + if (ret) { > + return; > + } > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return; > + } > + rdma_ack_cm_event(cm_event); > +} > + > +int rdma_reg_chunk_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i, j; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + struct rdma_ram_block *block = &(rdma_ram_blocks->block[i]); > + int num_chunks = RDMA_REG_NUM_CHUNKS(block); > + /* allocate memory to store chunk MRs */ > + rdma_ram_blocks->block[i].pmr = rdma_mallocz( > + num_chunks * sizeof(struct ibv_mr *)); > + > + if (!block->pmr) { > + goto err_reg_chunk_ram_blocks; > + } > + > + for (j = 0; j < num_chunks; j++) { > + uint8_t *start_addr = RDMA_REG_CHUNK_START(block, j); > + uint8_t *end_addr = RDMA_REG_CHUNK_END(block, j); > + if (start_addr < block->local_host_addr) { > + start_addr = block->local_host_addr; > + } > + if (end_addr > block->local_host_addr + block->length) { > + end_addr = block->local_host_addr + block->length; > + } > + block->pmr[j] = ibv_reg_mr(rdma_ctx->pd, > + start_addr, > + end_addr - start_addr, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!block->pmr[j]) { > + break; > + } > + } > + if (j < num_chunks) { > + for (j--; j >= 0; j--) { > + ibv_dereg_mr(block->pmr[j]); > + } > + block->pmr[i] = NULL; > + goto err_reg_chunk_ram_blocks; > + } > + } > + > + return 0; > + > +err_reg_chunk_ram_blocks: > + for (i--; i >= 0; i--) { > + int num_chunks = > + RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); > + for (j = 0; j < num_chunks; j++) { > + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); > + } > + free(rdma_ram_blocks->block[i].pmr); > + rdma_ram_blocks->block[i].pmr = NULL; > + } > + > + return -1; > + > +} > + > +int rdma_reg_whole_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + rdma_ram_blocks->block[i].mr = > + ibv_reg_mr(rdma_ctx->pd, > + rdma_ram_blocks->block[i].local_host_addr, > + rdma_ram_blocks->block[i].length, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!rdma_ram_blocks->block[i].mr) { > + break; > + } > + } > + > + if (i >= 
rdma_ram_blocks->num_blocks) { > + return 0; > + } > + > + for (i--; i >= 0; i--) { > + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); > + } > + > + return -1; > + > +} > + > +int rdma_client_reg_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks) > +{ > +#ifdef RDMA_CHUNK_REGISTRATION > +#ifdef RDMA_LAZY_REGISTRATION > + return 0; > +#else > + return rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks); > +#endif > +#else > + return rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); > +#endif > +} > + > +int rdma_server_reg_ram_blocks(struct rdma_context *rdma_ctx, > + struct rdma_ram_blocks *rdma_ram_blocks) > +{ > + return rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); > +} > + > +void rdma_dereg_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i, j; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + int num_chunks; > + if (!rdma_ram_blocks->block[i].pmr) { > + continue; > + } > + num_chunks = RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); > + for (j = 0; j < num_chunks; j++) { > + if (!rdma_ram_blocks->block[i].pmr[j]) { > + continue; > + } > + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); > + } > + free(rdma_ram_blocks->block[i].pmr); > + rdma_ram_blocks->block[i].pmr = NULL; > + } > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + if (!rdma_ram_blocks->block[i].mr) { > + continue; > + } > + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); > + rdma_ram_blocks->block[i].mr = NULL; > + } > +} > + > +void rdma_copy_to_remote_ram_blocks(struct rdma_ram_blocks *local, > + struct rdma_remote_ram_blocks *remote) > +{ > + int i; > + remote->num_blocks = local->num_blocks; > + for (i = 0; i < local->num_blocks; i++) { > + remote->block[i].remote_host_addr = > + (uint64_t)(local->block[i].local_host_addr); > + remote->block[i].remote_rkey = local->block[i].mr->rkey; > + remote->block[i].offset = local->block[i].offset; > + remote->block[i].length = local->block[i].length; > + } > +} > + > +int rdma_process_remote_ram_blocks(struct rdma_ram_blocks *local, > + struct rdma_remote_ram_blocks *remote) > +{ > + int i, j; > + > + if (local->num_blocks != remote->num_blocks) { > + return -1; > + } > + > + for (i = 0; i < remote->num_blocks; i++) { > + /* search local ram blocks */ > + for (j = 0; j < local->num_blocks; j++) { > + if (remote->block[i].offset != local->block[j].offset) { > + continue; > + } > + if (remote->block[i].length != local->block[j].length) { > + return -1; > + } > + local->block[j].remote_host_addr = > + remote->block[i].remote_host_addr; > + local->block[j].remote_rkey = remote->block[i].remote_rkey; > + break; > + } > + if (j >= local->num_blocks) { > + return -1; > + } > + } > + > + return 0; > +} > + > +int rdma_search_ram_block(uint64_t offset, uint64_t length, > + struct rdma_ram_blocks *blocks, > + int *block_index, int *chunk_index) > +{ > + int i; > + for (i = 0; i < blocks->num_blocks; i++) { > + if (offset < blocks->block[i].offset) { > + continue; > + } > + if (offset + length > > + blocks->block[i].offset + blocks->block[i].length) { > + continue; > + } > + *block_index = i; > + if (chunk_index) { > + uint8_t *host_addr = blocks->block[i].local_host_addr + > + (offset - blocks->block[i].offset); > + *chunk_index = RDMA_REG_CHUNK_INDEX( > + blocks->block[i].local_host_addr, host_addr); > + } > + return 0; > + } > + return -1; > +} > + > +static int rdma_get_lkey(struct rdma_context *rdma_ctx, > + struct rdma_ram_block *block, uint64_t host_addr, > + uint32_t *lkey) > +{ > + int chunk; > + if 
(block->mr) { > + *lkey = block->mr->lkey; > + return 0; > + } > + if (!block->pmr) { > + int num_chunks = RDMA_REG_NUM_CHUNKS(block); > + /* allocate memory to store chunk MRs */ > + block->pmr = rdma_mallocz(num_chunks * > + sizeof(struct ibv_mr *)); > + if (!block->pmr) { > + return -1; > + } > + } > + chunk = RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr); > + if (!block->pmr[chunk]) { > + uint8_t *start_addr = RDMA_REG_CHUNK_START(block, chunk); > + uint8_t *end_addr = RDMA_REG_CHUNK_END(block, chunk); > + if (start_addr < block->local_host_addr) { > + start_addr = block->local_host_addr; > + } > + if (end_addr > block->local_host_addr + block->length) { > + end_addr = block->local_host_addr + block->length; > + } > + block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd, > + start_addr, > + end_addr - start_addr, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!block->pmr[chunk]) { > + return -1; > + } > + } > + *lkey = block->pmr[chunk]->lkey; > + return 0; > +} > + > +/* Do not merge data if larger than this. */ > +#define RDMA_MERGE_MAX (4 * 1024 * 1024) > + > +#define RDMA_UNSIGNALED_SEND_MAX 64 > + > +int rdma_reg_sync(struct rdma_data *mdata) > +{ > + mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd, > + &mdata->sync, > + sizeof mdata->sync, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (mdata->sync_mr) { > + return 0; > + } > + return -1; > +} > + > +int rdma_dereg_sync(struct rdma_data *mdata) > +{ > + return ibv_dereg_mr(mdata->sync_mr); > +} > + > +int rdma_reg_remote_info( > + struct rdma_data *mdata) > +{ > + mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd, > + &mdata->remote_info, > + sizeof mdata->remote_info, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (mdata->remote_info_mr) { > + return 0; > + } > + return -1; > +} > + > +int rdma_dereg_remote_info( > + struct rdma_data *mdata) > +{ > + return ibv_dereg_mr(mdata->remote_info_mr); > +} > + > + > +int rdma_post_send_sync(struct rdma_data *mdata, > + int wr_id) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + mdata->sync = 1; > + > + sge.addr = (uint64_t)(&mdata->sync); > + sge.length = sizeof mdata->sync; > + sge.lkey = mdata->sync_mr->lkey; > + > + send_wr.wr_id = wr_id; > + send_wr.opcode = IBV_WR_SEND; > + send_wr.send_flags = IBV_SEND_SIGNALED; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + > + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > +int rdma_post_recv_sync(struct rdma_data *mdata, > + int wr_id) > +{ > + struct ibv_sge sge; > + struct ibv_recv_wr recv_wr = { 0 }; > + struct ibv_recv_wr *bad_wr; > + > + mdata->sync = 1; > + > + sge.addr = (uint64_t)(&mdata->sync); > + sge.length = sizeof mdata->sync; > + sge.lkey = mdata->sync_mr->lkey; > + > + recv_wr.wr_id = wr_id; > + recv_wr.sg_list = &sge; > + recv_wr.num_sge = 1; > + > + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > + > +} > + > +int rdma_post_send_remote_info( > + struct rdma_data *mdata) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + sge.addr = (uint64_t)(&mdata->remote_info); > + sge.length = sizeof mdata->remote_info; > + sge.lkey = mdata->remote_info_mr->lkey; > + > + send_wr.wr_id = RDMA_WRID_SEND_REMOTE_INFO; > + send_wr.opcode = IBV_WR_SEND; > + 
send_wr.send_flags = IBV_SEND_SIGNALED; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + > + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + mdata->num_signaled_send--; > + return 0; > +} > + > +int rdma_post_recv_remote_info( > + struct rdma_data *mdata) > +{ > + struct ibv_sge sge; > + struct ibv_recv_wr recv_wr = { 0 }; > + struct ibv_recv_wr *bad_wr; > + > + sge.addr = (uint64_t)(&mdata->remote_info); > + sge.length = sizeof mdata->remote_info; > + sge.lkey = mdata->remote_info_mr->lkey; > + > + recv_wr.wr_id = RDMA_WRID_RECV_REMOTE_INFO; > + recv_wr.sg_list = &sge; > + recv_wr.num_sge = 1; > + > + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > +static int rdma_write_actual(struct rdma_context *rdma_ctx, > + struct rdma_ram_block *block, > + uint64_t offset, uint64_t length, > + uint64_t wr_id, enum ibv_send_flags flag) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset)); > + sge.length = length; > + if (rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) { > + fprintf(stderr, "cannot get lkey!\n"); > + return -1; > + } > + send_wr.wr_id = wr_id; > + send_wr.opcode = IBV_WR_RDMA_WRITE; > + send_wr.send_flags = flag; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + send_wr.wr.rdma.rkey = block->remote_rkey; > + send_wr.wr.rdma.remote_addr = block->remote_host_addr + > + (offset - block->offset); > + > + if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > +int rdma_write_flush(struct rdma_data *mdata) > +{ > + int ret; > + enum ibv_send_flags flags = 0; > + > + if (!mdata->current_length) { > + return 0; > + } > + if (mdata->num_unsignaled_send >= > + RDMA_UNSIGNALED_SEND_MAX) { > + flags = IBV_SEND_SIGNALED; > + } > + ret = rdma_write_actual(&mdata->rdma_ctx, > + &(mdata->rdma_ram_blocks.block[mdata->current_index]), > + mdata->current_offset, > + mdata->current_length, > + RDMA_WRID_RDMA, flags); > + > + if (ret) { > + if (ret < 0) { > + fprintf(stderr, "rdma migration: write flush error!\n"); > + } > + return ret; > + } > + > + if (mdata->num_unsignaled_send >= > + RDMA_UNSIGNALED_SEND_MAX) { > + mdata->num_unsignaled_send = 0; > + mdata->num_signaled_send++; > + } else { > + mdata->num_unsignaled_send++; > + } > + > + mdata->total_bytes += mdata->current_length; > + mdata->current_length = 0; > + mdata->current_offset = 0; > + > + return 0; > +} > + > +static inline int rdma_in_current_block( > + struct rdma_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + struct rdma_ram_block *block = > + &(mdata->rdma_ram_blocks.block[mdata->current_index]); > + if (mdata->current_index < 0) { > + return 0; > + } > + if (offset < block->offset) { > + return 0; > + } > + if (offset + len > block->offset + block->length) { > + return 0; > + } > + return 1; > +} > + > +static inline int rdma_in_current_chunk( > + struct rdma_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + struct rdma_ram_block *block = > + &(mdata->rdma_ram_blocks.block[mdata->current_index]); > + uint8_t *chunk_start, *chunk_end, *host_addr; > + if (mdata->current_chunk < 0) { > + return 0; > + } > + host_addr = block->local_host_addr + (offset - block->offset); > + chunk_start = RDMA_REG_CHUNK_START(block, mdata->current_chunk); > + if (chunk_start < block->local_host_addr) { > + chunk_start = block->local_host_addr; 
> + } > + if (host_addr < chunk_start) { > + return 0; > + } > + chunk_end = RDMA_REG_CHUNK_END(block, mdata->current_chunk); > + if (chunk_end > chunk_start + block->length) { > + chunk_end = chunk_start + block->length; > + } > + if (host_addr + len > chunk_end) { > + return 0; > + } > + return 1; > +} > + > +static inline int rdma_buffer_mergable( > + struct rdma_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + if (mdata->current_length == 0) { > + return 0; > + } > + if (offset != mdata->current_offset + mdata->current_length) { > + return 0; > + } > + if (!rdma_in_current_block(mdata, offset, len)) { > + return 0; > + } > +#ifdef RDMA_CHUNK_REGISTRATION > + if (!rdma_in_current_chunk(mdata, offset, len)) { > + return 0; > + } > +#endif > + return 1; > +} > + > +/* Note that buffer must be within a single block/chunk. */ > +int rdma_write(struct rdma_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + int index = mdata->current_index; > + int chunk_index = mdata->current_chunk; > + int ret; > + > + /* If we cannot merge it, we flush the current buffer first. */ > + if (!rdma_buffer_mergable(mdata, offset, len)) { > + ret = rdma_write_flush(mdata); > + if (ret) { > + return ret; > + } > + mdata->current_length = 0; > + mdata->current_offset = offset; > + > + if (rdma_search_ram_block(offset, len, > + &mdata->rdma_ram_blocks, &index, &chunk_index)) { > + return -1; > + } > + mdata->current_index = index; > + mdata->current_chunk = chunk_index; > + } > + > + /* merge it */ > + mdata->current_length += len; > + > + /* flush it if buffer is too large */ > + if (mdata->current_length >= RDMA_MERGE_MAX) { > + return rdma_write_flush(mdata); > + } > + > + return 0; > +} > + > +int rdma_poll(struct rdma_data *mdata) > +{ > + int ret; > + struct ibv_wc wc; > + > + ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc); > + if (!ret) { > + return RDMA_WRID_NONE; > + } > + if (ret < 0) { > + fprintf(stderr, "ibv_poll_cq return %d!\n", ret); > + return ret; > + } > + if (wc.status != IBV_WC_SUCCESS) { > + fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", > + wc.status, ibv_wc_status_str(wc.status)); > + fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id); > + > + return -1; > + } > + > + if (!(wc.opcode & IBV_WC_RECV)) { > + mdata->num_signaled_send--; > + } > + > + return (int)wc.wr_id; > +} > + > +int rdma_wait_for_wrid( > + struct rdma_data *mdata, > + int wrid) > +{ > +#ifdef RDMA_BLOCKING > + return rdma_block_for_wrid(mdata, wrid); > +#else > + return rdma_poll_for_wrid(mdata, wrid); > +#endif > +} > + > +int rdma_poll_for_wrid( > + struct rdma_data *mdata, > + int wrid) > +{ > + int r = RDMA_WRID_NONE; > + while (r != wrid) { > + r = rdma_poll(mdata); > + if (r < 0) { > + return r; > + } > + } > + return 0; > +} > + > +int rdma_block_for_wrid( > + struct rdma_data *mdata, > + int wrid) > +{ > + int num_cq_events = 0; > + int r = RDMA_WRID_NONE; > + struct ibv_cq *cq; > + void *cq_ctx; > + > + if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) { > + return -1; > + } > + /* poll cq first */ > + while (r != wrid) { > + r = rdma_poll(mdata); > + if (r < 0) { > + return r; > + } > + if (r == RDMA_WRID_NONE) { > + break; > + } > + } > + if (r == wrid) { > + return 0; > + } > + > + while (1) { > + if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel, > + &cq, &cq_ctx)) { > + goto err_block_for_wrid; > + } > + num_cq_events++; > + if (ibv_req_notify_cq(cq, 0)) { > + goto err_block_for_wrid; > + } > + /* poll cq */ > + while (r != wrid) { > + r = rdma_poll(mdata); > + if (r < 0) { > + goto 
err_block_for_wrid; > + } > + if (r == RDMA_WRID_NONE) { > + break; > + } > + } > + if (r == wrid) { > + goto success_block_for_wrid; > + } > + } > + > +success_block_for_wrid: > + if (num_cq_events) { > + ibv_ack_cq_events(cq, num_cq_events); > + } > + return 0; > + > +err_block_for_wrid: > + if (num_cq_events) { > + ibv_ack_cq_events(cq, num_cq_events); > + } > + return -1; > +} > + > +void rdma_cleanup(struct rdma_data *mdata) > +{ > + struct rdma_context *rdma_ctx = &mdata->rdma_ctx; > + > + mdata->enabled = 0; > + if (mdata->sync_mr) { > + rdma_dereg_sync(mdata); > + } > + if (mdata->remote_info_mr) { > + rdma_dereg_remote_info(mdata); > + } > + mdata->sync_mr = NULL; > + mdata->remote_info_mr = NULL; > + rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks); > + mdata->rdma_ram_blocks.num_blocks = 0; > + > + if (rdma_ctx->qp) { > + ibv_destroy_qp(rdma_ctx->qp); > + } > + if (rdma_ctx->cq) { > + ibv_destroy_cq(rdma_ctx->cq); > + } > + if (rdma_ctx->comp_channel) { > + ibv_destroy_comp_channel(rdma_ctx->comp_channel); > + } > + if (rdma_ctx->pd) { > + ibv_dealloc_pd(rdma_ctx->pd); > + } > + if (rdma_ctx->listen_id) { > + rdma_destroy_id(rdma_ctx->listen_id); > + } > + if (rdma_ctx->cm_id) { > + rdma_destroy_id(rdma_ctx->cm_id); > + } > + if (rdma_ctx->channel) { > + rdma_destroy_event_channel(rdma_ctx->channel); > + } > + > + rdma_data_init(mdata); > +} > + > +int rdma_client_init(struct rdma_data *mdata) > +{ > + int ret; > + > + if (mdata->client_init_done) { > + return 0; > + } > + > + ret = rdma_resolve_host(&mdata->rdma_ctx, > + mdata->host, mdata->port); > + if (ret) { > + fprintf(stderr, "rdma migration: error resolving host!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_alloc_pd_cq(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_alloc_qp(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating qp!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_init_ram_blocks(&mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_client_reg_ram_blocks(&mdata->rdma_ctx, > + &mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_reg_sync(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering sync data!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_reg_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering remote info!\n"); > + goto err_rdma_client_init; > + } > + > + ret = rdma_post_recv_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting remote info recv!\n"); > + goto err_rdma_client_init; > + } > + > + mdata->client_init_done = 1; > + DPRINTF("rdma_client_init success\n"); > + return 0; > + > +err_rdma_client_init: > + rdma_cleanup(mdata); > + return -1; > +} > + > +int rdma_client_connect(struct rdma_data *mdata) > +{ > + int ret; > + > + ret = rdma_migrate_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0); > + if (ret) { > + fprintf(stderr, "rdma migration: error connecting!\n"); > + goto err_rdma_client_connect; > + } > + > + /* wait for remote info */ > + ret = rdma_wait_for_wrid(&rdma_mdata, > + RDMA_WRID_RECV_REMOTE_INFO); > + if (ret < 0) { > + fprintf(stderr, "rdma migration: polling remote info 
error!\n"); > + goto err_rdma_client_connect; > + } > + > + ret = rdma_process_remote_ram_blocks( > + &mdata->rdma_ram_blocks, &mdata->remote_info); > + if (ret) { > + fprintf(stderr, > + "rdma migration: error processing remote ram blocks!\n"); > + goto err_rdma_client_connect; > + } > + > + rdma_mdata.total_bytes = 0; > + rdma_mdata.enabled = 1; > + DPRINTF("rdma_client_connect success\n"); > + return 0; > + > +err_rdma_client_connect: > + rdma_cleanup(mdata); > + return -1; > +} > + > +int rdma_start_incoming_migration(int s) > +{ > + int ret = rdma_server_init(&rdma_mdata); > + if (ret) { > + return -1; > + } > + > + ret = rdma_server_prepare(&rdma_mdata); > + if (ret) { > + return -1; > + } > + return 0; > +} > + > +int rdma_server_init(struct rdma_data *mdata) > +{ > + > + int ret; > + struct sockaddr_in sin; > + struct rdma_cm_id *listen_id; > + struct rdma_context *rdma_ctx = &mdata->rdma_ctx; > + > + if(mdata->host == NULL) { > + printf("Error: RDMA host is not set! port: %d\n", mdata->port); > + return -1; > + } > + /* create CM channel */ > + rdma_ctx->channel = rdma_create_event_channel(); > + if (!rdma_ctx->channel) { > + return -1; > + } > + > + /* create CM id */ > + ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL, > + RDMA_PS_TCP); > + if (ret) { > + goto err_server_init_create_listen_id; > + } > + > + memset(&sin, 0, sizeof(sin)); > + sin.sin_family = AF_INET; > + sin.sin_port = htons(mdata->port); > + > + if (mdata->host && strcmp("", mdata->host)) { > + struct hostent *server_addr; > + server_addr = gethostbyname(mdata->host); > + if (!server_addr) { > + goto err_server_init_bind_addr; > + } > + memcpy(&sin.sin_addr.s_addr, server_addr->h_addr, > + server_addr->h_length); > + } else { > + sin.sin_addr.s_addr = INADDR_ANY; > + } > + > + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin); > + if (ret) { > + goto err_server_init_bind_addr; > + } > + > + rdma_ctx->listen_id = listen_id; > + if (listen_id->verbs) { > + rdma_ctx->verbs = listen_id->verbs; > + } > + rdma_dump_id("server_init", rdma_ctx->verbs); > + rdma_dump_gid("server_init", listen_id); > + return 0; > + > +err_server_init_bind_addr: > + rdma_destroy_id(listen_id); > +err_server_init_create_listen_id: > + rdma_destroy_event_channel(rdma_ctx->channel); > + rdma_ctx->channel = NULL; > + fprintf(stderr, "rdma migration: error init server!\n"); > + > + return -1; > + > +} > + > +int rdma_server_prepare(struct rdma_data *mdata) > +{ > + int ret; > + struct rdma_context *rdma_ctx = &mdata->rdma_ctx; > + > + if (!rdma_ctx->verbs) { > + return 0; > + } > + > + ret = rdma_alloc_pd_cq(rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = rdma_init_ram_blocks(&mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = rdma_server_reg_ram_blocks(rdma_ctx, > + &mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = rdma_reg_sync(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering sync data!\n"); > + goto err_rdma_server_prepare; > + } > + > + rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks, > + &mdata->remote_info); > + > + ret = rdma_reg_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering remote info!\n"); > + goto 
err_rdma_server_prepare; > + } > + > + ret = rdma_listen(rdma_ctx->listen_id, 5); > + if (ret) { > + fprintf(stderr, "rdma migration: error listening on socket!\n"); > + goto err_rdma_server_prepare; > + } > + > + return 0; > + > +err_rdma_server_prepare: > + rdma_cleanup(mdata); > + fprintf(stderr, "rdma migration: error preparing server!\n"); > + return -1; > +} > + > +int rdma_accept_incoming_migration( > + struct rdma_data *mdata) > +{ > + > + int ret; > + > + ret = rdma_migrate_listen(mdata, mdata->host, mdata->port); > + if (ret) { > + fprintf(stderr, "rdma migration: error listening!\n"); > + goto err_rdma_server_wait; > + } > + > + ret = rdma_alloc_qp(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating qp!\n"); > + goto err_rdma_server_wait; > + } > + > +#ifdef RDMA_EXTRA_SYNC > + ret = rdma_post_recv_sync(mdata, > + RDMA_WRID_RECV_EXTRA_SYNC); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting extra sync > receive!\n"); > + goto err_rdma_server_wait; > + } > +#endif > + > + ret = rdma_post_recv_sync(mdata, > + RDMA_WRID_RECV_SYNC); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting sync receive!\n"); > + goto err_rdma_server_wait; > + } > + > + ret = rdma_migrate_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0); > + if (ret) { > + fprintf(stderr, "rdma migration: error accepting connection!\n"); > + goto err_rdma_server_wait; > + } > + > + /* send remote info */ > + ret = rdma_post_send_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error sending remote info!\n"); > + goto err_rdma_server_wait; > + } > + > + /* wait for completion */ > + ret = rdma_wait_for_wrid(&rdma_mdata, > + RDMA_WRID_SEND_REMOTE_INFO); > + if (ret < 0) { > + fprintf(stderr, "rdma migration: polling remote info error!\n"); > + goto err_rdma_server_wait; > + } > + > + rdma_mdata.total_bytes = 0; > + rdma_mdata.enabled = 1; > + rdma_dump_gid("server_connect", mdata->rdma_ctx.cm_id); > + return 0; > + > +err_rdma_server_wait: > + fprintf(stderr, "rdma migration: error waiting for client!\n"); > + rdma_cleanup(mdata); > + return -1; > + > +} > + > +void rdma_data_init(struct rdma_data *mdata) > +{ > + rdma_init_context(&mdata->rdma_ctx); > + printf("rdma port: %d\n", rdmaport); > + printf("rdma host: %s\n", rdmahost); > + mdata->port = rdmaport; > + mdata->host = rdmahost; > + mdata->enabled = 0; > + mdata->rdma_ram_blocks.num_blocks = 0; > + mdata->client_init_done = 0; > + mdata->num_unsignaled_send = 0; > + mdata->num_signaled_send = 0; > + mdata->current_offset = 0; > + mdata->current_length = 0; > + mdata->current_index = -1; > + mdata->current_chunk = -1; > + mdata->sync = 0; > + mdata->sync_mr = NULL; > + mdata->remote_info_mr = NULL; > + rdma_update_capability(migrate_get_current()); > +} > + > +void rdma_disable(void) > +{ > + rdma_mdata.port = -1; > + rdma_mdata.enabled = 0; > +} > -- > 1.7.10.4 >
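
To expand on the documentation request at the top, here is the protocol as far as I can reconstruct it from the patch itself - please correct this when writing it up under docs/:

1. The destination runs rdma_server_init() + rdma_server_prepare(): it binds and listens on an rdma_cm channel, registers all of its ram blocks up front, and fills an rdma_remote_ram_blocks table with their addresses, lengths and rkeys.

2. The source runs rdma_client_init() + rdma_client_connect(): it resolves the address and route, creates pd/cq/qp, pre-posts a receive for the remote block table, then calls rdma_connect().

3. After RDMA_CM_EVENT_ESTABLISHED, the destination pushes the block table with an IBV_WR_SEND (RDMA_WRID_SEND_REMOTE_INFO); the source matches it against its local ram blocks by offset/length in rdma_process_remote_ram_blocks().

4. RAM pages then flow as IBV_WR_RDMA_WRITEs. Contiguous writes within one 1MB chunk (RDMA_REG_CHUNK_SIZE) are merged up to RDMA_MERGE_MAX, source-side chunks are registered lazily on first use in rdma_get_lkey(), and only every RDMA_UNSIGNALED_SEND_MAX-th write request is signaled, to limit completion traffic.

5. Synchronization is the one-byte sync field: the source posts an IBV_WR_SEND of it after flushing all outstanding writes; because the queue pair is reliable-connected (IBV_QPT_RC) and therefore ordered, completion of the matching receive on the destination implies that every preceding RDMA write has landed in guest memory.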
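
In the docs it would also be worth spelling out the ordering invariant that makes the sync byte safe. A sketch of the intended call sequence, as I understand it (mine, not from the patch - the destination-side wait presumably lives in the migration code proper):

    /* source, at the end of each iteration */
    rdma_write_flush(&rdma_mdata);           /* push the merged buffer */
    rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC);
    rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);

    /* destination (the receive was pre-posted in
     * rdma_accept_incoming_migration()) */
    rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
    /* on an RC queue pair the send cannot pass the earlier RDMA
     * writes, so all pages from this iteration are now visible */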