Hi, I applied your changes and they help in some cases, but there are still situations where umad_send returns with that error. I will try to describe the situation:
(Node 1) -> (Node 2) -> (Node 3). Node 1: sends 100 SubnGets to Node 3 (Dr [0][1][1]). Node 2: passes the 100 SubnGets through to Node 3 and also passes the 100 SubnGetResp back to Node 1. Node 3: responds 100 times. That works fine! Please don't be surprised that Node 2 gets the packets; that's because I changed the SMI. But if I now start the sender on Node 1 again, so that it sends another 100 SubnGets, Node 2 produces umad_send errors. The error does not occur every time. The receives are always OK, and so are the packets. Below I attach the main code from the router tool on Node 2. I also tried allocating a packet for every single receive and send, but that did not help either. What about the size of the packet, could there be an error there? Thanks, Michael while(run){ bcopy((char*)&fd_ports,(char*)&fd_ports_tmp,sizeof(fd_ports)); activ = select(highest_fd+1, (fd_set*)&fd_ports_tmp, (fd_set*)0, (fd_set*)0,(struct timeval*)0); if (activ < 0 ){ if (run) printf("Error: select : %i\n",activ); run = 0; } else if (activ == 0) printf("Nothing to do\n"); else { // ++ Alloc MAD ++ //printf("... Alloc UMAD ......................."); if (!(umad = umad_alloc(Port_ID_cnt, umad_size() + IB_MAD_SIZE))){ printf("Error: umad_alloc\n"); goto Exit; } //printf("done\n"); // ++ Alloc SMP Pointer ++ //printf("... Alloc SMP ........................"); smp = (struct drsmp**) malloc(Port_ID_cnt * sizeof(struct drsmp*)); for (i = 0; i < Port_ID_cnt; i++) smp[i] = (struct drsmp*) umad_get_mad(umad + (i * (umad_size() + IB_MAD_SIZE))); //printf("done\n"); // ++ Check All Ports where something is to do ++ for (i = 0; i < Port_ID_cnt; i++) { if ( (Port_ID[i] >= 0) && (Agent_ID[i] >= 0) && (FD_ISSET(umad_get_fd(Port_ID[i]),(fd_set*)&fd_ports_tmp))) { smplength = IB_MAD_SIZE; packet_size = umad_size() + IB_MAD_SIZE; printf("... 
Recv Mad (Port: %i (ID:%i).....",i+1,Port_ID[i]); // ++ Receive ++ if ((ret = umad_recv(Port_ID[i], umad + (i * packet_size), &smplength, timeout_ms_r)) != Agent_ID[i]){ printf("Error: umad_recv: %s ,Nr: %i\n", drmad_status_str(smp[i]),ret); if (optExitRecvFail) run = 0; } else { // ++ Drop Echo ++ if (smp[i]->initial_path[1] != 0) { // ++ Keep TID in Mind with supporting turning algorithm ++ if ( !( (smp[i]->initial_path[smp[i]->hop_ptr] == i+1) && (smp[i]->status & DIRECTION) && (smp[i]->hop_cnt == smp[i]->hop_ptr) && (smp[i]->initial_path[smp[i]->hop_ptr] != smp[i]->initial_path[smp[i]->hop_ptr - 1]) ) && ( (Agent_TIDs[i] == -1) || (Agent_TIDs[i] != (own_ntoh64(smp[i]->tid) >> 32)) ) ) Agent_TIDs[i] = smp[i]->tid; printf("TID: 0x%lx\n",own_ntoh64(Agent_TIDs[i])); // ++ Message Logging ++ if (optMsgLog) { fprintf(MsgLogFile,"...............................................................................................\n"); fprintf(MsgLogFile,"... Recv Mad (Port: %i (ID:%i)...............\n",i+1,Port_ID[i]); fprintf(MsgLogFile,"... Recv TID: 0x%lx \n",own_ntoh64(Agent_TIDs[i])); dump_dr_smp(smp[i], MsgLogFile); } // ++ Looking up the Out-Port ++ Out_Port_index = routing(smp[i],Devices_Info,Devices_cnt); if ((Out_Port_index >= 0) && (Port_ID[Out_Port_index] >=0)){ printf("... Send Mad (Port: %i (ID:%i).....",Out_Port_index+1,Port_ID[Out_Port_index]); // ++ Replace TID if (Agent_TIDs[Out_Port_index] != -1) smp[i]->tid = (uint64_t) Agent_TIDs[Out_Port_index]; // ++ Sending ++ //printf("%i\n",timeout_ms_s); //= (smp[i]->status & DIRECTION)? 0 : 200; if ((ret = umad_send(Port_ID[Out_Port_index], Agent_ID[Out_Port_index], umad + (i * packet_size), smplength, (smp[i]->status & DIRECTION)? 0 : timeout_ms_s, 3)) < 0){ printf("Error: umad_send Nr: %i \n",ret); if (optExitSendFail) run = 0; } else printf("TID: 0x%lx \n",own_ntoh64(Agent_TIDs[Out_Port_index])); if (optMsgLog) { fprintf(MsgLogFile,"... 
Send TID: 0x%lx \n",own_ntoh64(Agent_TIDs[Out_Port_index])); fprintf(MsgLogFile,"... Send Mad (Port: %i (ID:%i)(%s)(%i)...............\n",Out_Port_index+1,Port_ID[Out_Port_index],(ret >= 0)?"OK":"Fail",(smp[i]->status & DIRECTION)? 0 : timeout_ms_s); fprintf(MsgLogFile,"...............................................................................................\n"); fflush(MsgLogFile); } traversed++; } } else { printf("dropped, probably there is missing a response mad\n"); dropped++; } } } } if (umad) umad_free(umad); } printf("... Traversed Packets (%i)(%i) .............................\n",traversed,dropped); } _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general