asynchronous operation with poll()

2010-11-09 Thread Jonathan Rosser
I have a client and server test program, written as close to a conventional 
sockets application as possible, to explore fully asynchronous 
communication, and I am encountering difficulty.


Both programs run the same code in a thread, sending buffers to each 
other as fast as possible. On the client side only, my poll() call never 
blocks and cm_id->send_cq_channel->fd always seems to be readable. This 
causes the program to loop wildly and consume 100% CPU.


Any ideas? I have ensured that O_NONBLOCK is set on the underlying file 
descriptors. I'm not sure why the server side runs with almost no 
CPU usage while the client does not.


Here is the client/server loop:


  struct ibv_mr *mr;
  int ret;
  int send_buf_num = 0;
  int recv_buf_num = 0;

  #define NUM_BUFFERS 20
  #define SIZE (1024*1024)
  uint8_t *buffer = (uint8_t*)malloc(SIZE * NUM_BUFFERS * 2);
  uint8_t *send_msg[NUM_BUFFERS];
  uint8_t *recv_msg[NUM_BUFFERS];

  for(int i=0; i<NUM_BUFFERS; i++) {
    send_msg[i] = buffer + (i*SIZE);
    recv_msg[i] = buffer + ((i+NUM_BUFFERS) * SIZE);
  }

  //
  // setup
  fprintf(stderr, "rdma_reg_msgs\n");
  mr = rdma_reg_msgs(cm_id, buffer, SIZE*NUM_BUFFERS*2);
  if (!mr) {
    perror("rdma_reg_msgs");
  }

  // prepare for the first receives before connecting
  for(int i=0; i<10; i++) {
    fprintf(stderr, "rdma_post_recv\n");
    ret = rdma_post_recv(cm_id, NULL, recv_msg[recv_buf_num++], SIZE, mr);
    recv_buf_num %= NUM_BUFFERS;
    if (ret) {
      perror("rdma_post_recv");
    }
  }

  //connect
  fprintf(stderr, "rdma_connect\n");
  ret = rdma_connect(cm_id, NULL);
  if (ret) {
    perror("rdma_connect");
  }

  const int NUM_FDS = 4;

  const int POLL_CM = 0;
  const int POLL_RECV_CQ = 1;
  const int POLL_SEND_CQ = 2;
  const int POLL_WAKE = 3;
  struct pollfd fds[NUM_FDS];

  //prime notification of events on the recv completion queue
  ibv_req_notify_cq(cm_id->recv_cq, 0);
  //

  //
  // main loop
  while(ret == 0)
  {
    memset(fds, 0, sizeof(struct pollfd) * NUM_FDS);
    fds[POLL_CM].fd = cm_channel->fd;
    fds[POLL_CM].events = POLLIN;

    fds[POLL_RECV_CQ].fd = cm_id->recv_cq_channel->fd;
    fds[POLL_RECV_CQ].events = POLLIN;

    fds[POLL_SEND_CQ].fd = cm_id->send_cq_channel->fd;
    fds[POLL_SEND_CQ].events = POLLIN;

    fds[POLL_WAKE].fd = wake_fds[0];
    fds[POLL_WAKE].events = POLLIN;

    int nready = poll(fds, NUM_FDS, -1);
    if(nready < 0) {
      perror("poll");
    }

    if(fds[POLL_CM].revents & POLLIN) {
      struct rdma_cm_event *cm_event;
      ret = rdma_get_cm_event(cm_channel, &cm_event);
      if(ret) {
        perror("client connection rdma_get_cm_event");
      }
      fprintf(stderr, "Got cm event %s\n", rdma_event_str(cm_event->event));

      if(cm_event->event == RDMA_CM_EVENT_ESTABLISHED) {
        //send as soon as we are connected
        ibv_req_notify_cq(cm_id->send_cq, 0);
        ret = rdma_post_send(cm_id, NULL, send_msg[send_buf_num], SIZE, mr, 0);
        send_buf_num++;
        send_buf_num %= NUM_BUFFERS;
        if (ret) {
          perror("rdma_post_send");
        }
      }

      int finish = 0;
      if(cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
        finish = 1;

      rdma_ack_cm_event(cm_event);
      if(finish) {
        goto out;
      }
    }

    //if the send completed
    if(fds[POLL_SEND_CQ].revents & POLLIN) {
      struct ibv_cq *cq;
      struct ibv_wc wc[10];
      void *context;
      int num_send = ibv_poll_cq(cm_id->send_cq, 10, &wc[0]);

      if(num_send == 0) fprintf(stderr, ".");

      for(int i=0; i<num_send; i++) {
        fprintf(stderr, "Got SEND CQ event : %d of %d %s\n", i, num_send,
                ibv_wc_status_str(wc[i].status));
        ibv_get_cq_event(cm_id->send_cq_channel, &cq, &context);
        assert(cq == cm_id->send_cq);

        //our send completed, send some more right away
        fprintf(stderr, "rdma_post_send\n");
        ret = rdma_post_send(cm_id, NULL, send_msg[send_buf_num++], SIZE, mr,
                             0);
        send_buf_num %= NUM_BUFFERS;
        if (ret) {
          perror("rdma_post_send");
        }
      }

      //expensive call, ack all received events together
      ibv_ack_cq_events(cm_id->send_cq, num_send);
      ibv_req_notify_cq(cm_id->send_cq, 0);
    }

    //if the receive completed, prepare to receive more
    if(fds[POLL_RECV_CQ].revents & POLLIN) {
      struct ibv_cq *cq;
      struct ibv_wc wc[10];
      void *context;
      int num_recv = ibv_poll_cq(cm_id->recv_cq, 10, &wc[0]);

      for(int i=0; i<num_recv; i++) {
        fprintf(stderr, "Got RECV CQ event : %d of %d %s\n", i, num_recv,
                ibv_wc_status_str(wc[i].status));
        ibv_get_cq_event(cm_id->recv_cq_channel, &cq, &context);
        assert(cq == cm_id->recv_cq);

        //we received some payload, prepare to receive more
        fprintf(stderr, "rdma_post_recv\n");
        ret = rdma_post_recv(cm_id, 

ib receive completion error

2010-11-09 Thread Usha Srinivasan
Hello,
Can someone from Mellanox tell me what the vendor error 0x32 means?  I am 
getting this error for wc.opcode 128 (IB_WC_RECV) with wc.status 4 
(IB_WC_LOC_PROT_ERR).  I am running OFED 1.5.2 and am getting it on both RHEL5 
and SLES11.

Thanks in advance!
Usha

___
Usha Srinivasan
Software Engineer
QLogic Corporation
780 5th Ave, Suite A
King of Prussia, PA 19406
(610) 233-4844
(610) 233-4777 (Fax)
(610) 233-4838 (Main Desk)



Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps

2010-11-09 Thread Sasha Khapyorsky
On 19:43 Thu 04 Feb , Eli Dorfman (Voltaire) wrote:
 
> Subject: [PATCH] Wrong handling of MC create and delete traps
>
> For these traps the GID in the data details is the MGID and
> not the source port gid.
> So the SM should check that the subscriber port has the pkey of the MC group.
> There was also an error in comparing the subnet prefix and guid due to
> host/network order mismatch.
>
> Signed-off-by: Eli Dorfman e...@voltaire.com

Rebased and applied. Thanks.

Sasha


Re: asynchronous operation with poll()

2010-11-09 Thread Jason Gunthorpe
On Tue, Nov 09, 2010 at 03:58:27PM +, Jonathan Rosser wrote:
> I have a client and server test program, written as close to a
> conventional sockets application as possible, to explore fully
> asynchronous communication, and I am encountering difficulty.

Broadly it looks to me like your actions are in the wrong order.
A poll-based RDMA loop should look like this (a rough sketch in code
follows the list):

- exit poll
- Check poll bit
- call ibv_get_cq_event
- call ibv_req_notify_cq
- repeatedly call ibv_poll_cq (while rc == num requested)
- Issue new work
- return to poll
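
A minimal sketch of that shape, assuming a completion channel "channel",
its CQ "cq" and a hypothetical per-completion handler process_wc() (these
names are illustrative, not from your code):

/* needs <infiniband/verbs.h> */
static int drain_cq(struct ibv_comp_channel *channel, struct ibv_cq *cq)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    struct ibv_wc wc[10];
    int n;

    /* one ibv_get_cq_event() per poll() wakeup */
    if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))
        return -1;
    ibv_ack_cq_events(ev_cq, 1);   /* or batch the acks, see below */

    /* re-arm before draining so a completion that arrives while we
       drain still produces a new channel event */
    if (ibv_req_notify_cq(cq, 0))
        return -1;

    /* drain until ibv_poll_cq returns fewer entries than requested */
    do {
        n = ibv_poll_cq(cq, 10, wc);
        for (int i = 0; i < n; i++)
            process_wc(&wc[i]);
    } while (n == 10);

    return n < 0 ? -1 : 0;
}

Issuing new work then happens in the caller, after this returns.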

Generally, for your own sanity, I recommend splitting into 3 functions
- Do the stuff with ibv_get_cq_event
- Drain and process WC's
- Issue new work

Most real use cases will also want to call the latter two functions
from other waiters in the poll loop (i.e. whatever your wake_fds is for).

Some random mild comments for you:

>   const int NUM_FDS = 4;
>
>   const int POLL_CM = 0;
>   const int POLL_RECV_CQ = 1;
>   const int POLL_SEND_CQ = 2;
>   const int POLL_WAKE = 3;

You can use an enum for these constants
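
For example:

enum {
    POLL_CM = 0,
    POLL_RECV_CQ,
    POLL_SEND_CQ,
    POLL_WAKE,
    NUM_FDS
};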

>   //prime notification of events on the recv completion queue
>   ibv_req_notify_cq(cm_id->recv_cq, 0);

Do this earlier, before posting recvs, otherwise you could race
getting your first recv.
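
In other words, move that call ahead of the rdma_post_recv() loop in your
setup, roughly (same names as your code):

/* arm recv notifications before any recv is posted */
if (ibv_req_notify_cq(cm_id->recv_cq, 0))
    fprintf(stderr, "ibv_req_notify_cq failed\n");

for (int i = 0; i < 10; i++) {
    ret = rdma_post_recv(cm_id, NULL, recv_msg[recv_buf_num++], SIZE, mr);
    recv_buf_num %= NUM_BUFFERS;
    if (ret)
        perror("rdma_post_recv");
}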


>   while(ret == 0)
>   {
>     memset(fds, 0, sizeof(struct pollfd) * NUM_FDS);
>     fds[POLL_CM].fd = cm_channel->fd;
>     fds[POLL_CM].events = POLLIN;
>
>     fds[POLL_RECV_CQ].fd = cm_id->recv_cq_channel->fd;
>     fds[POLL_RECV_CQ].events = POLLIN;
>
>     fds[POLL_SEND_CQ].fd = cm_id->send_cq_channel->fd;
>     fds[POLL_SEND_CQ].events = POLLIN;
>
>     fds[POLL_WAKE].fd = wake_fds[0];
>     fds[POLL_WAKE].events = POLLIN;

The efficient use of poll does not put these inside the main loop. You
only need to initialize fd and events once at the start.
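
Roughly (same names as your code; poll() only rewrites revents, so fd and
events can be filled in once outside the loop):

/* fill in the pollfd table once, before entering the loop */
struct pollfd fds[NUM_FDS];
memset(fds, 0, sizeof(fds));
fds[POLL_CM].fd          = cm_channel->fd;
fds[POLL_CM].events      = POLLIN;
fds[POLL_RECV_CQ].fd     = cm_id->recv_cq_channel->fd;
fds[POLL_RECV_CQ].events = POLLIN;
fds[POLL_SEND_CQ].fd     = cm_id->send_cq_channel->fd;
fds[POLL_SEND_CQ].events = POLLIN;
fds[POLL_WAKE].fd        = wake_fds[0];
fds[POLL_WAKE].events    = POLLIN;

while (ret == 0) {
    int nready = poll(fds, NUM_FDS, -1);
    /* ... handle revents as before ... */
}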

>   if(fds[POLL_CM].revents & POLLIN) {
>     struct rdma_cm_event *cm_event;
>     ret = rdma_get_cm_event(cm_channel, &cm_event);
>     if(ret) {
>       perror("client connection rdma_get_cm_event");
>     }
>     fprintf(stderr, "Got cm event %s\n", rdma_event_str(cm_event->event));
>
>     if(cm_event->event == RDMA_CM_EVENT_ESTABLISHED) {
>       //send as soon as we are connected
>       ibv_req_notify_cq(cm_id->send_cq, 0);

Again, this should be done once, right after the cq is created.

>   //if the send completed
>   if(fds[POLL_SEND_CQ].revents & POLLIN) {
>     struct ibv_cq *cq;
>     struct ibv_wc wc[10];
>     void *context;
>     int num_send = ibv_poll_cq(cm_id->send_cq, 10, &wc[0]);
>     if(num_send == 0) fprintf(stderr, ".");

Check that num_send == 10 and loop again.

>     for(int i=0; i<num_send; i++) {
>       fprintf(stderr, "Got SEND CQ event : %d of %d %s\n", i, num_send,
>               ibv_wc_status_str(wc[i].status));
>       ibv_get_cq_event(cm_id->send_cq_channel, &cq, &context);

cq_events are not tied to send WC's, this should be done
exactly once, prior to calling ibv_poll_cq

>     //expensive call, ack all received events together
>     ibv_ack_cq_events(cm_id->send_cq, num_send);

You don't have to do this at all in the loop unless you are
doing multithreaded things. Using num_send is wrong; I use this:

bool checkCQPoll(struct pollfd p)
{
    if ((p.revents & POLLIN) == 0 ||
        ibv_get_cq_event(comp, &jnk1, &jnk2) != 0)
        return false;

    compEvents++;
    if (compEvents >= INT_MAX)
    {
        ibv_ack_cq_events(cq, compEvents);
        compEvents = 0;
    }
    int rc;
    if ((rc = ibv_req_notify_cq(cq, 0)) == -1)
    {
        errno = rc;
[..]

And then call ibv_ack_cq_events(cq, compEvents) before trying to
destroy the CQ. All it is used for is synchronizing exits between threads.
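
Something like this at teardown, assuming the same compEvents counter:

/* ack anything still outstanding, then it is safe to destroy the CQ */
if (compEvents) {
    ibv_ack_cq_events(cq, compEvents);
    compEvents = 0;
}
/* ... CQ / connection teardown follows ... */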

>     ibv_req_notify_cq(cm_id->send_cq, 0);

Do this right after calling ibv_get_cq_event.

>   //if the receive completed, prepare to receive more
>   if(fds[POLL_RECV_CQ].revents & POLLIN) {
>     struct ibv_cq *cq;
>     struct ibv_wc wc[10];
>     void *context;
>     int num_recv = ibv_poll_cq(cm_id->recv_cq, 10, &wc[0]);

Same problems as for send, they should be the same. Implement a
function like my checkCQPoll example and call it for both cases.

Continually posting sends and recvs will get you into trouble: you
will run out of recvs and get RNRs. These days the wisdom for
implementing RDMA is that you should have explicit message flow
control. I.e. for something simple like this you could say that getting
a recv means another send is OK, but you still need a mechanism to
wait for a send buffer to be returned on the send CQ - there is no
ordering guarantee.
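
A rough sketch of that kind of accounting (the counters and the
one-recv-equals-one-send-credit convention here are illustrative, not
something your code already has):

/* needs <rdma/rdma_verbs.h> for rdma_post_send() */
static int send_credits   = 10;          /* recvs the peer pre-posted for us */
static int free_send_bufs = NUM_BUFFERS; /* local send buffers not in flight */

/* send-CQ completion: the local buffer is reusable again */
static void on_send_complete(void) { free_send_bufs++; }

/* recv-CQ completion: by convention the peer has re-posted a recv,
   so one more message may be put on the wire */
static void on_recv_complete(void) { send_credits++; }

/* only post a send when both a credit and a free buffer are available */
static int try_post_send(struct rdma_cm_id *id, void *buf, size_t len,
                         struct ibv_mr *mr)
{
    if (send_credits == 0 || free_send_bufs == 0)
        return 0;               /* defer; retry after the next completion */
    send_credits--;
    free_send_bufs--;
    return rdma_post_send(id, NULL, buf, len, mr, 0);
}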

Jason