[openib-general] [PATCH 3 of 3] mad: large RMPP support
patch 3 of 3 --- Large RMPP support, send side: split a multipacket MAD buffer to a list of segments, (multipacket_list) and send these using an gather list of size 2. Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]> Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> Index: last_stable/drivers/infiniband/core/mad_rmpp.c === --- last_stable.orig/drivers/infiniband/core/mad_rmpp.c +++ last_stable/drivers/infiniband/core/mad_rmpp.c @@ -570,16 +532,23 @@ start_rmpp(struct ib_mad_agent_private * return mad_recv_wc; } -static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr) +static inline void *get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr) { - return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset + - (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) * - (mad_send_wr->seg_num - 1); + struct ib_mad_multipacket_seg *seg; + int i = 2; + + list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) { + if (i == mad_send_wr->seg_num) + return seg->data; + i++; + } + return NULL; } -static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) +int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; + void *next_data; int timeout; u32 paylen; @@ -592,14 +561,14 @@ static int send_next_seg(struct ib_mad_s paylen = mad_send_wr->total_seg * IB_MGMT_RMPP_DATA - mad_send_wr->pad; rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); - mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad); } else { - mad_send_wr->send_wr.num_sge = 2; - mad_send_wr->sg_list[0].length = mad_send_wr->data_offset; - mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr); - mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) - -mad_send_wr->data_offset; - mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey; + next_data = get_seg_addr(mad_send_wr); + if (!next_data) { + printk(KERN_ERR PFX "send_next_seg: " + "could not find next segment\n"); + return -EINVAL; + } + 
mad_send_wr->send_buf.mad_payload = next_data; rmpp_mad->rmpp_hdr.paylen_newwin = 0; } @@ -838,7 +807,7 @@ out: int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; - int i, total_len, ret; + int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & @@ -848,20 +817,16 @@ int ib_send_rmpp_mad(struct ib_mad_send_ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) return IB_RMPP_RESULT_INTERNAL; - if (mad_send_wr->send_wr.num_sge > 1) - return -EINVAL; /* TODO: support num_sge > 1 */ + if (mad_send_wr->send_wr.num_sge != 2) + return -EINVAL; mad_send_wr->seg_num = 1; mad_send_wr->newwin = 1; mad_send_wr->data_offset = data_offset(rmpp_mad->mad_hdr.mgmt_class); - total_len = 0; - for (i = 0; i < mad_send_wr->send_wr.num_sge; i++) - total_len += mad_send_wr->send_wr.sg_list[i].length; - -mad_send_wr->total_seg = (total_len - mad_send_wr->data_offset) / + mad_send_wr->total_seg = (mad_send_wr->total_length - mad_send_wr->data_offset) / (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset); - mad_send_wr->pad = total_len - IB_MGMT_RMPP_HDR - + mad_send_wr->pad = mad_send_wr->total_length - IB_MGMT_RMPP_HDR - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); /* We need to wait for the final ACK even if there isn't a response */ Index: last_stable/drivers/infiniband/core/mad.c === --- last_stable.orig/drivers/infiniband/core/mad.c +++ last_stable/drivers/infiniband/core/mad.c @@ -779,6 +779,17 @@ static int get_buf_length(int hdr_len, i return hdr_len + data_len + pad; } +static void free_send_multipacket_list(struct ib_mad_send_wr_private * + mad_send_wr) +{ + struct ib_mad_multipacket_seg *s, *t; + + list_for_each_entry_safe(s, t, &mad_send_wr->multipacket_list, list) { + list_del(&s->list); + kfree(s); + } +} + struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int
[openib-general] [PATCH 2 of 3] mad: large RMPP support
patch 2 of 3 --- Large RMPP support, receive side: copy the arriving MADs to chunks instead of coalescing to one large buffer in kernel space. Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]> Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> Index: last_stable/drivers/infiniband/core/mad_rmpp.c === --- last_stable.orig/drivers/infiniband/core/mad_rmpp.c +++ last_stable/drivers/infiniband/core/mad_rmpp.c @@ -433,44 +433,6 @@ static struct ib_mad_recv_wc * complete_ return rmpp_wc; } -void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf) -{ - struct ib_mad_recv_buf *seg_buf; - struct ib_rmpp_mad *rmpp_mad; - void *data; - int size, len, offset; - u8 flags; - - len = mad_recv_wc->mad_len; - if (len <= sizeof(struct ib_mad)) { - memcpy(buf, mad_recv_wc->recv_buf.mad, len); - return; - } - - offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class); - - list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) { - rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad; - flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr); - - if (flags & IB_MGMT_RMPP_FLAG_FIRST) { - data = rmpp_mad; - size = sizeof(*rmpp_mad); - } else { - data = (void *) rmpp_mad + offset; - if (flags & IB_MGMT_RMPP_FLAG_LAST) - size = len; - else - size = sizeof(*rmpp_mad) - offset; - } - - memcpy(buf, data, size); - len -= size; - buf += size; - } -} -EXPORT_SYMBOL(ib_coalesce_recv_mad); - static struct ib_mad_recv_wc * continue_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) Index: last_stable/drivers/infiniband/core/user_mad.c === --- last_stable.orig/drivers/infiniband/core/user_mad.c +++ last_stable/drivers/infiniband/core/user_mad.c @@ -176,6 +177,88 @@ static int queue_packet(struct ib_umad_f return ret; } +static int data_offset(u8 mgmt_class) +{ + if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) + return IB_MGMT_SA_HDR; + else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && +(mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return 
IB_MGMT_VENDOR_HDR; + else + return IB_MGMT_RMPP_HDR; +} + +static int copy_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, + struct ib_umad_packet *packet) +{ + struct ib_mad_recv_buf *seg_buf; + struct ib_rmpp_mad *rmpp_mad; + void *data; + struct ib_mad_multipacket_seg *seg; + int size, len, offset; + u8 flags; + + len = mad_recv_wc->mad_len; + if (len <= sizeof(struct ib_mad)) { + memcpy(&packet->mad.data, mad_recv_wc->recv_buf.mad, len); + return 0; + } + + /* Multipacket (RMPP) MAD */ + offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + + list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) { + rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad; + flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr); + + if (flags & IB_MGMT_RMPP_FLAG_FIRST) { + size = sizeof(*rmpp_mad); + memcpy(&packet->mad.data, rmpp_mad, size); + } else { + data = (void *) rmpp_mad + offset; + if (flags & IB_MGMT_RMPP_FLAG_LAST) + size = len; + else + size = sizeof(*rmpp_mad) - offset; + seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) + + sizeof(struct ib_rmpp_mad) - offset, + GFP_KERNEL); + if (!seg) + return -ENOMEM; + memcpy(seg->data, data, size); + list_add_tail(&seg->list, &packet->seg_list); + } + len -= size; + } + return 0; +} + +static struct ib_umad_packet *alloc_packet(void) +{ + struct ib_umad_packet *packet; + int length = sizeof *packet + sizeof(struct ib_mad); + + packet = kzalloc(length, GFP_KERNEL); + if (!packet) { + printk(KERN_ERR "alloc_packet: mem alloc failed for length %d\n", + length); + return NULL; + } + INIT_LIST_HEAD(&packet->seg_list); + return packet; +} + +static void free_packet(struct ib_umad_packet *packet) +{ + struct ib_mad_multipacket_seg *seg, *
[openib-general] [PATCH 1 of 3] mad: large RMPP support
patch 1 of 3 --- Large RMPP support: changes/additions to underlying data structures and prototypes. Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]> Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> Index: last_stable/drivers/infiniband/include/rdma/ib_mad.h === --- last_stable.orig/drivers/infiniband/include/rdma/ib_mad.h +++ last_stable/drivers/infiniband/include/rdma/ib_mad.h @@ -141,6 +141,12 @@ struct ib_rmpp_hdr { __be32 paylen_newwin; }; +struct ib_mad_multipacket_seg { + struct list_head list; + u32 size; + u8 data[0]; +}; + typedef u64 __bitwise ib_sa_comp_mask; #define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n)) @@ -220,7 +226,9 @@ struct ib_class_port_info */ struct ib_mad_send_buf { struct ib_mad_send_buf *next; - void*mad; + void*mad; /* RMPP: first segment, +including the MAD header */ + void*mad_payload; /* RMPP: changed per segment */ struct ib_mad_agent *mad_agent; struct ib_ah*ah; void*context[2]; @@ -485,17 +493,6 @@ int ib_unregister_mad_agent(struct ib_ma int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf); -/** - * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer. - * @mad_recv_wc: Work completion information for a received MAD. - * @buf: User-provided data buffer to receive the coalesced buffers. The - * referenced buffer should be at least the size of the mad_len specified - * by @mad_recv_wc. - * - * This call copies a chain of received MAD segments into a single data buffer, - * removing duplicated headers. - */ -void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf); /** * ib_free_recv_mad - Returns data buffers used to receive a MAD. @@ -601,6 +598,16 @@ struct ib_mad_send_buf * ib_create_send_ gfp_t gfp_mask); /** + * ib_get_multipacket_seg - returns a segment of an RMPP multipacket mad send + * @send_buf: Previously allocated send data buffer. + * @seg_num: number of the segment to return. 
+ * + * This routine returns a pointer to a segment of a multipacket RMPP message. + */ +struct ib_mad_multipacket_seg *ib_get_multipacket_seg(struct ib_mad_send_buf * + send_buf, int seg_num); + +/** * ib_free_send_mad - Returns data buffers used to send a MAD. * @send_buf: Previously allocated send data buffer. */ Index: last_stable/drivers/infiniband/core/mad_priv.h === --- last_stable.orig/drivers/infiniband/core/mad_priv.h +++ last_stable/drivers/infiniband/core/mad_priv.h @@ -119,7 +119,8 @@ struct ib_mad_send_wr_private { struct list_head agent_list; struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf send_buf; - DECLARE_PCI_UNMAP_ADDR(mapping) + DECLARE_PCI_UNMAP_ADDR(header_mapping) + DECLARE_PCI_UNMAP_ADDR(payload_mapping) struct ib_send_wr send_wr; struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; @@ -130,9 +131,11 @@ struct ib_mad_send_wr_private { enum ib_wc_status status; /* RMPP control */ + struct list_head multipacket_list; int last_ack; int seg_num; int newwin; + int total_length; int total_seg; int data_offset; int pad; Index: last_stable/drivers/infiniband/core/user_mad.c === --- last_stable.orig/drivers/infiniband/core/user_mad.c +++ last_stable/drivers/infiniband/core/user_mad.c @@ -123,6 +123,7 @@ struct ib_umad_packet { struct ib_mad_send_buf *msg; struct list_head list; intlength; + struct list_head seg_list; struct ib_user_mad mad; }; ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT2.0immediatedataproposal
>The requirement is to provide an API that supports RDMA writes with immediate >data. A send that follows an RDMA write is not immediate data, and the API >should not be constructed around trying to make it so. To be clear, I believe that write with immediate should be part of the normal APIs, rather than an extension, but should be designed around those devices that provide it natively. - Sean ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [git patch review 2/2] IB: Don't doublefree pages from scatterlist
Hugh> It's now looking like this change won't be needed after all: Hugh> Andi has just posted a patch in the "ipr" thread which Hugh> should stop x86_64 from interfering with the scatterlist Hugh> *page,offset,length fields, so what IB and others were doing Hugh> should then work safely (current thinking is that x86_64 is Hugh> the only architecture which coalesced in that way). OK, I'll drop this from my tree. - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0immediatedataproposal
>I am not clear what you are proposing? >A transport specific API? > >The current proposal provides on sending side: >single post, and single completion in the error free case. >This is commonality that simplify ULP. App 1 - transport aware: if (transport == IB) Do something else Do something different App 2 - transport independent: if (immediate data flag set) if (DTO == 1) Do something else do something else else do something different All you've done is add flags in order to call the API "transport neutral". The result to the application is the same, except that the interface is more complex than it needs to be, and causes confusion on the receiving side. And on the sending side, the application still needs to check the flag to see if immediate data is supported. A true transport neutral API wouldn't need flags that specify the actual differences between the transports. The requirement is to provide an API that supports RDMA writes with immediate data. A send that follows an RDMA write is not immediate data, and the API should not be constructed around trying to make it so. If you want to add a new requirement to the API to support posting multiple work requests with a single call, that is a different requirement. - Sean ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] relocation error / link time reference error
-- Original message -- Date: 05 Feb 2006 11:44:52 -0500 From: Hal Rosenstock <[EMAIL PROTECTED]> Reply-To: Hal Rosenstock <[EMAIL PROTECTED]> To: Sean Hubbell <[EMAIL PROTECTED]> Subject: Re: [openib-general] relocation error / link time reference error On Sun, 2006-02-05 at 09:40, Sean Hubbell wrote: > Hal, > > I removed and rebuilt everything. And everything's OK now ? -- Hal Nope, I still have the link time reference problem. I'll download the latest svn tree again in the morning and rebuild. How do you typically download and rebuild? Here are the steps that I follow: 1) Download the openib code. 2) Copy a version of the Kernel Source Tree and copy over the infiniband directory to the drivers dir. 3) Removed the include/rdma directory and all of the .svn directories 4) Get a second version of the Kernel Source Tree and build a patch file for the infiniband changes. 5) I add the patch file to the linux-2.6.15.spec file 6) I rebuild the kernel (rpm based kernel) and then install the rpms (smp, numa, ...). 7) I reboot. 8) I then remove all of the openib modules 9) I then rebuild the openib tools from the commands listed on the wiki FAQ. 10) That's it ... How do you rebuild openib? Do you pull from a particular tag or the trunk? If anyone has a better way to build the kernel, please let me know. I only want to make sure that I can build it as an rpm because I like the ability to figure out what file goes with what package. Any and all suggestions would be appreciated. Thanks again for all of the help Hal. Sean Hubbell ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] questions about gen2 srp driver
chas> it seems to take scsi_host->host_lock with a spin_lock_irq() chas> inside a couple of work queues. i believe work queues run chas> at process context and not interrupt context. therefore, chas> one should probably use spin_lock_irqsave()? Yes, it's exactly because we know that work queues run in process context with interrupts enabled which lets us use spin_lock_irq. chas> if there is only a single set of rdma keys how can the chas> driver support more than one command (particularly on a chas> target with multiple lun's) outstanding command? i didn't chas> think the srp_post_send() was synchronous with respect to the chas> completion of the current rdma request? There's no limitation on number of outstanding RDMAs targeting a single R_Key. - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
I am not clear what you are proposing? A transport specific API? The current proposal provides on sending side: single post, and single completion in the error free case. This is commonality that simplify ULP. Arkady Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Larsen, Roy K [mailto:[EMAIL PROTECTED] > Sent: Monday, February 06, 2006 6:50 PM > To: Kanevsky, Arkady; Caitlin Bestler; > [EMAIL PROTECTED]; Sean Hefty > Cc: openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > > > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >Sent: Monday, February 06, 2006 2:27 PM > > > >Roy, > >comments inline. > > > > Mine too > > >> > >> >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >> >Roy, > >> >Can you explain, please? > >> > > >> >For IB the operation will be layered properly on Transport > primitive. > >> >And on Recv side it will indicate in completion event DTO that it > >> >matches RDMA Write with Immediate and that Immediate Data is > >> in event. > >> > > >> >For iWARP I expect initially, it will be layered on RDMA > >> Write followed > >> >by Send. The Provider can do post more efficiently than > Consumer and > >> >guarantee atomicity. > >> >On Recv side Consumer will get Recv DTO completion in event and > >> >Immediate Data inline as specified by Provider Attribute. > >> > > >> >From the performance point of view Consumers who program > to IB only > >> >will have no performance degradation at all. But this API > >> also allows > >> >Consumers to write ULP to be transport independent with minimal > >> >penalty: one binary comparison and extra 4 bytes in recv buffer. > >> > >> If the application could be written transport > independently, I would > >> have no objection at all. 
Instead, it must be written in a > >> transport-adaptive way and to be able to adapt to all possible > >> implementations, the application could not send arbitrary > >> "immediate"-sized data as messages because there is no way to > >> distinguish between them on the receiving side. That is > HUGE! It is > >> my experience that send/receive is generally used for > small messages > >> and to take away particular message sizes or to depend on > the so the > >> application can "adapt" to whatever the immediate size is for a > >> particular transport, if even needed, is a very weak facility to > >> offer. > > > >But the remote side does posts Recv. Since it anticipate > that this Recv > >will be matched against the RDMA Write with immediate it > posts the recv > >buffer which fits. Yes, there is an issue for > Transport-independent ULP > >that it does needs a buffer. > >For IB it is possible to post 0-size buffer. But if this is the case > >Recv end Consumer DOES know that it will be macthed against > RDMA Write > >so ULP DOES know what it will be matched against. > >So in the worst case Consumer does have to pay the price of creating > >LMR to handle 4 byte buffer to match RDMA Write Immediate data. > > I think you missed my larger point. The point was that the > application must be written in such a way that it could > inferred when immediate data arrived for a variety of > immediate data sizes and that places a constraint on the > application wrt to data it may want to send/receive normally. > Where as, if the application embraced the fact that it was > responsible for sending a message to indicate a write > completion, it is free to send whatever amount of data best > met its needs. > > Transports that support true immediate data do not require > the ULP to perform buffer matching. They can post a series > of receive buffers that may or may not indicate immediate > data. 
The ULP does not have to know ahead of time when > immediate data will arrive **against other data receives**. > The fact that an IB oriented application never needs to back > a receive request with a buffer if they were only used to > indicate immediate data is orthogonal. > > > > >> > >> It also affects interface resource allocation. Send queue > sizes will > >> have to adapt to possibly twice there size. > >> > > > >That is correct. We argued about it at the meeting. > >One alternative is to have EP and EVD attr. But this will not be > >efficient since it will double the queue size where a > smaller increment > >is possible due to the depth of the RDMA Write pipeline outstanding. > > > >> It just dawned on me that the immediate data must be in registered > >> memory to be sent in a message. This means the API must > be amended > >> to pass an LMR or, even worse, the provider would have to register > >> memory in the speed path or create and manipulate its own queue of > >> "immediate" >
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0immediatedataproposal
>I would think an array of pointers and a count to standard work requests >would do it. And of course, each work request can control whether it >solicits a completion so a write/send sequence can generate a single >completion event on both ends. Use the EVD lock to guard against other >threads injecting requests on the queue during a combined request >operation and the ULP has everything it needs. This is what the OpenIB stack does today. The difference is that it uses a linked list, rather than an array. - Sean ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
>From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >Sent: Monday, February 06, 2006 2:27 PM > >Roy, >comments inline. > Mine too >> >> >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >> >Roy, >> >Can you explain, please? >> > >> >For IB the operation will be layered properly on Transport primitive. >> >And on Recv side it will indicate in completion event DTO that it >> >matches RDMA Write with Immediate and that Immediate Data is >> in event. >> > >> >For iWARP I expect initially, it will be layered on RDMA >> Write followed >> >by Send. The Provider can do post more efficiently than Consumer and >> >guarantee atomicity. >> >On Recv side Consumer will get Recv DTO completion in event and >> >Immediate Data inline as specified by Provider Attribute. >> > >> >From the performance point of view Consumers who program to IB only >> >will have no performance degradation at all. But this API >> also allows >> >Consumers to write ULP to be transport independent with minimal >> >penalty: one binary comparison and extra 4 bytes in recv buffer. >> >> If the application could be written transport independently, >> I would have no objection at all. Instead, it must be >> written in a transport-adaptive way and to be able to adapt >> to all possible implementations, the application could not >> send arbitrary "immediate"-sized data as messages because >> there is no way to distinguish between them on the receiving >> side. That is HUGE! It is my experience that send/receive >> is generally used for small messages and to take away >> particular message sizes or to depend on the so the >> application can "adapt" to whatever the immediate size is for >> a particular transport, if even needed, is a very weak >> facility to offer. > >But the remote side does posts Recv. Since it anticipate that >this Recv will be matched against the RDMA Write with immediate >it posts the recv buffer which fits. Yes, there is an issue >for Transport-independent ULP that it does needs a buffer. 
>For IB it is possible to post 0-size buffer. But if this is the case >Recv end Consumer DOES know that it will be macthed against RDMA >Write so ULP DOES know what it will be matched against. >So in the worst case Consumer does have to pay the price of creating >LMR to handle 4 byte buffer to match RDMA Write Immediate data. I think you missed my larger point. The point was that the application must be written in such a way that it could inferred when immediate data arrived for a variety of immediate data sizes and that places a constraint on the application wrt to data it may want to send/receive normally. Where as, if the application embraced the fact that it was responsible for sending a message to indicate a write completion, it is free to send whatever amount of data best met its needs. Transports that support true immediate data do not require the ULP to perform buffer matching. They can post a series of receive buffers that may or may not indicate immediate data. The ULP does not have to know ahead of time when immediate data will arrive **against other data receives**. The fact that an IB oriented application never needs to back a receive request with a buffer if they were only used to indicate immediate data is orthogonal. > >> >> It also affects interface resource allocation. Send queue >> sizes will have to adapt to possibly twice there size. >> > >That is correct. We argued about it at the meeting. >One alternative is to have EP and EVD attr. But this will not >be efficient since it will double the queue size where >a smaller increment is possible due to the depth of the RDMA Write >pipeline outstanding. > >> It just dawned on me that the immediate data must be in >> registered memory to be sent in a message. This means the >> API must be amended to pass an LMR or, even worse, the >> provider would have to register memory in the speed path or >> create and manipulate its own queue of "immediate" >> data buffers/LMRs. 
Of course, LMRs are not needed and an >> overhead for transports that provide true immediate data. > >No registration on the speed path. It is Consumer responsibility >to provide Recv Buffer of the right size. >Yes for IB only ULP this can be avoided. >But ULP can be written to the proposed API to take full >advantage of IB performance but that code will not be transport >independent. I was referring to the sending side. Source data of a message send must be from registered memory. For transports that will emulate this service with a write/send sequence, user specified immediate data will need to be copied to a provider managed pool of "immediate" data buffers/LMRs or the interface changed to specify an LMR. > >But this API allows to write transport independent code >albeit with certain price attached. > >> >> Oh, and another thing. InfiniBand indicates the size of the >> RDMA write in the receive completion. That is something that >> will have to be addressed in a "transport independent" way or >>
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0immediatedataproposal
>But the remote side does posts Recv. Since it anticipate that >this Recv will be matched against the RDMA Write with immediate >it posts the recv buffer which fits. Yes, there is an issue >for Transport-independent ULP that it does needs a buffer. >For IB it is possible to post 0-size buffer. But if this is the case >Recv end Consumer DOES know that it will be macthed against RDMA >Write so ULP DOES know what it will be matched against. >So in the worst case Consumer does have to pay the price of creating >LMR to handle 4 byte buffer to match RDMA Write Immediate data. How does the remote ULP know this? A DAPL implementation has no idea what a receive will match up against. You're pushing a requirement on the ordering of sends/writes to the application that was not there before. >> It just dawned on me that the immediate data must be in >> registered memory to be sent in a message. This means the >> API must be amended to pass an LMR or, even worse, the >> provider would have to register memory in the speed path or >> create and manipulate its own queue of "immediate" >> data buffers/LMRs. Of course, LMRs are not needed and an >> overhead for transports that provide true immediate data. > >No registration on the speed path. It is Consumer responsibility >to provide Recv Buffer of the right size. >Yes for IB only ULP this can be avoided. >But ULP can be written to the proposed API to take full >advantage of IB performance but that code will not be transport >independent. The immediate data needs to be registered before being sent. This will need to be hidden from the user. >But this API allows to write transport independent code >albeit with certain price attached. What good does it do to have "transport independent" code, when the feature being invoked is "transport dependent"? There's no requirement that immediate data be supported. Why define an API so that it can be emulated? 
Define the right API, and let transports that don't support immediate data indicate so. A "transport independent" application can check this bit and take whatever action is necessary. They need to do so anyway, since the bit may or may not be set. - Sean ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [git patch review 2/2] IB: Don't doublefree pages from scatterlist
On Sat, 4 Feb 2006, Roland Dreier wrote: > On some architectures, mapping the scatterlist may coalesce entries: > if that coalesced list is then used for freeing the pages afterwards, > there's a danger that pages may be doubly freed (and others leaked). > > Fix Infiniband's __ib_umem_release by freeing from a separate array > beyond the scatterlist: IB_UMEM_MAX_PAGE_CHUNK lowered to fit one page. It's now looking like this change won't be needed after all: Andi has just posted a patch in the "ipr" thread which should stop x86_64 from interfering with the scatterlist *page,offset,length fields, so what IB and others were doing should then work safely (current thinking is that x86_64 is the only architecture which coalesced in that way). Hugh ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
Roy, comments inline. Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Larsen, Roy K [mailto:[EMAIL PROTECTED] > Sent: Monday, February 06, 2006 4:25 PM > To: Kanevsky, Arkady; Caitlin Bestler; > [EMAIL PROTECTED]; Sean Hefty > Cc: openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > > > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >Roy, > >Can you explain, please? > > > >For IB the operation will be layered properly on Transport primitive. > >And on Recv side it will indicate in completion event DTO that it > >matches RDMA Write with Immediate and that Immediate Data is > in event. > > > >For iWARP I expect initially, it will be layered on RDMA > Write followed > >by Send. The Provider can do post more efficiently than Consumer and > >guarantee atomicity. > >On Recv side Consumer will get Recv DTO completion in event and > >Immediate Data inline as specified by Provider Attribute. > > > >From the performance point of view Consumers who program to IB only > >will have no performance degradation at all. But this API > also allows > >Consumers to write ULP to be transport independent with minimal > >penalty: one binary comparison and extra 4 bytes in recv buffer. > > If the application could be written transport independently, > I would have no objection at all. Instead, it must be > written in a transport-adaptive way and to be able to adapt > to all possible implementations, the application could not > send arbitrary "immediate"-sized data as messages because > there is no way to distinguish between them on the receiving > side. That is HUGE! 
It is my experience that send/receive > is generally used for small messages and to take away > particular message sizes or to depend on the so the > application can "adapt" to whatever the immediate size is for > a particular transport, if even needed, is a very weak > facility to offer. But the remote side does post a Recv. Since it anticipates that this Recv will be matched against the RDMA Write with immediate, it posts a recv buffer which fits. Yes, there is an issue for a Transport-independent ULP in that it does need a buffer. For IB it is possible to post a 0-size buffer. But if this is the case, the Recv end Consumer DOES know that it will be matched against RDMA Write, so the ULP DOES know what it will be matched against. So in the worst case the Consumer does have to pay the price of creating an LMR to handle a 4 byte buffer to match the RDMA Write Immediate data. > > It also affects interface resource allocation. Send queue > sizes will have to adapt to possibly twice there size. > That is correct. We argued about it at the meeting. One alternative is to have EP and EVD attr. But this will not be efficient since it will double the queue size where a smaller increment is possible due to the depth of the RDMA Write pipeline outstanding. > It just dawned on me that the immediate data must be in > registered memory to be sent in a message. This means the > API must be amended to pass an LMR or, even worse, the > provider would have to register memory in the speed path or > create and manipulate its own queue of "immediate" > data buffers/LMRs. Of course, LMRs are not needed and an > overhead for transports that provide true immediate data. No registration on the speed path. It is the Consumer's responsibility to provide a Recv Buffer of the right size. Yes, for an IB-only ULP this can be avoided. But a ULP can be written to the proposed API to take full advantage of IB performance, but that code will not be transport independent. 
But this API allows one to write transport-independent code, albeit with a certain price attached. > > Oh, and another thing. InfiniBand indicates the size of the > RDMA write in the receive completion. That is something that > will have to be addressed in a "transport independent" way or > dropped as part of the service. Good point. I will augment Spec accordingly. > > The bottom line here is that it is NOT transport independent. The implementation is not transport independent. But the API allows one to write a Transport-specific ULP with full performance as well as a Transport-independent ULP with better performance than without the proposed API and with "minimal" performance penalty for Transports that provide it. > > Now, the atomicity argument between write and send has some > credibility. > If an application chooses to "adapt" to an explicit > write/send semantic for write completion notification in > environments that can't provide it natively, this could be > addressed by a generalized combined request API that can > guarantee thread-based atomicity to the send queue. This > seems much more straightforward to me since, in essence, to >
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
good point. I will add this to the requirements and augement the necessary transfered_length text. Arkady Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Davis, Arlin R [mailto:[EMAIL PROTECTED] > Sent: Monday, February 06, 2006 4:17 PM > To: Kanevsky, Arkady; Sean Hefty > Cc: [EMAIL PROTECTED]; openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > I just want to get consensus on the requirements before we > get too far. > One thing I forgot is that with Infiniband, the receive with > immediate provides the size of the rdma write that just > completed. I think we should include this in the requirements > since there is ULP value here. > > -arlin > > >-Original Message- > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >Sent: Monday, February 06, 2006 11:08 AM > >To: Kanevsky, Arkady; Davis, Arlin R; Sean Hefty > >Cc: [EMAIL PROTECTED]; openib-general@openib.org > >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > > >Arlin, > >It is too strong to state that Consumer should never send a message > >equal in size to the size of immediate data. > >Consumer knows from the context which one it is. > >it may be based on dedicated connection, or based on ULP protocol > >ordering. > >Arkady > > > >Arkady Kanevsky email: [EMAIL PROTECTED] > >Network Appliance Inc. phone: 781-768-5395 > >1601 Trapelo Rd. 
- Suite 16.Fax: 781-895-1195 > >Waltham, MA 02451 central phone: 781-768-5300 > > > > > >> -Original Message- > >> From: Kanevsky, Arkady > >> Sent: Monday, February 06, 2006 2:05 PM > >> To: Davis, Arlin R; Sean Hefty > >> Cc: [EMAIL PROTECTED]; openib-general@openib.org > >> Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > >> immediatedataproposal > >> > >> Arlin, > >> On Friday we agreed that receiver can not distinguish between > >> 4 byte of Send or 4 bytes of Immediate data if RDMA Write > with Immed > >> is implemented as 2 operations: > >> RDMA Write followed by Send. > >> > >> ULP Reciever "expects" Immediate data that is why it posts Recv. > >> Depending on Transport capability it MAY complete as Recv or as > >> Recv_RDMA_Write_with_Immed_in_event. > >> > >> Neither Provider not Consumer can distinguish between the cases > >> unless there is additional info. > >> > >> Arkady > >> > >> Arkady Kanevsky email: [EMAIL PROTECTED] > >> Network Appliance Inc. phone: 781-768-5395 > >> 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > >> Waltham, MA 02451 central phone: 781-768-5300 > >> > >> > >> > -Original Message- > >> > From: Davis, Arlin R [mailto:[EMAIL PROTECTED] > >> > Sent: Monday, February 06, 2006 1:25 PM > >> > To: Kanevsky, Arkady; Sean Hefty > >> > Cc: [EMAIL PROTECTED]; openib-general@openib.org > >> > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > >> > immediate dataproposal > >> > > >> > > >> > Arkady, > >> > > >> > Your requirements are slightly different then the > proposed set of > >> > requirements. > >> > > >> > "iii) DAPL Provider does not provide any identification > >> that that the > >> > Receive operation matches remote RDMA Write with Immediate > >> data if it > >> > completes as Receive DTO. > >> > > >> > - It is up to an ULP to separate Receive completion of remote > >> > Send from remote RDMA Write with Immediate Data." > >> > > >> > Tell me how this is possible? 
How can the application > distinguish > >> > between a 4 byte message and a 4 byte immediate data > >> message? We would > >> > have to add a new requirement... "If the provider supports > >> immediate > >> > data in the payload the ULP cannot send a message equal to the > >> > immediate data size". > >> > > >> > -arlin > >> > > >> > >-Original Message- > >> > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >> > >Sent: Monday, February 06, 2006 8:08 AM > >> > >To: Sean Hefty; Davis, Arlin R > >> > >Cc: [EMAIL PROTECTED]; openib-general@openib.org > >> > >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT > >> > 2.0 immediate > >> > dataproposal > >> > > > >> > >Here are the changes to the existing requirements chapters > >> for RDMA > >> > >Write with Immediate Data. > >> > > > >> > >Feedback please. > >> > >Arkady > >> > > > >> > >Arkady Kanevsky email: [EMAIL PROTECTED] > >> > >Network Appliance Inc. phone: 781-768-5395 > >> > >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > >> > >Waltham, MA 02451 central phone: 781-768-5300 > >> > > > >> > > > >> > >> -Original Message- > >> > >> From: Sean Heft
[openib-general] Re: [ANNOUNCE] DAPL BOF
On Mon, 6 Feb 2006, James Lentini wrote: > > There will be a DAPL BOF this evening from 19:30-20:00 in the Palm > Tree Salon. Correct. We will be in the Lavender room. > I plan to setup a conference call for those of you who would like to > participate remotely. Here is the info: > > phone: 888-867-8686 > id: 1068642 ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
>From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >Roy, >Can you explain, please? > >For IB the operation will be layered properly on Transport primitive. >And on Recv side it will indicate in completion event DTO >that it matches RDMA Write with Immediate and that Immediate Data >is in event. > >For iWARP I expect initially, it will be layered on RDMA Write >followed by Send. The Provider can do post more efficiently >than Consumer and guarantee atomicity. >On Recv side Consumer will get Recv DTO completion in event >and Immediate Data inline as specified by Provider Attribute. > >From the performance point of view Consumers who program to IB >only will have no performance degradation at all. But this API also >allows Consumers to write ULP to be transport independent >with minimal penalty: one binary comparison and extra 4 bytes in recv >buffer. If the application could be written transport independently, I would have no objection at all. Instead, it must be written in a transport-adaptive way and to be able to adapt to all possible implementations, the application could not send arbitrary "immediate"-sized data as messages because there is no way to distinguish between them on the receiving side. That is HUGE! It is my experience that send/receive is generally used for small messages and to take away particular message sizes or to depend on the so the application can "adapt" to whatever the immediate size is for a particular transport, if even needed, is a very weak facility to offer. It also affects interface resource allocation. Send queue sizes will have to adapt to possibly twice there size. It just dawned on me that the immediate data must be in registered memory to be sent in a message. This means the API must be amended to pass an LMR or, even worse, the provider would have to register memory in the speed path or create and manipulate its own queue of "immediate" data buffers/LMRs. 
Of course, LMRs are not needed and an overhead for transports that provide true immediate data. Oh, and another thing. InfiniBand indicates the size of the RDMA write in the receive completion. That is something that will have to be addressed in a "transport independent" way or dropped as part of the service. The bottom line here is that it is NOT transport independent. Now, the atomicity argument between write and send has some credibility. If an application chooses to "adapt" to an explicit write/send semantic for write completion notification in environments that can't provide it natively, this could be addressed by a generalized combined request API that can guarantee thread-based atomicity to the send queue. This seems much more straightforward to me since, in essence, to adapt to non-native immediate data services, they would have to allocate resources and behave in virtually the same way as if they did write/send explicitly. It is obvious that the proposed service is not one of immediate data in the sense defined by InfiniBand. Since true immediate data is a transport specific speed path service, it needs to be implemented as a transport specific extension. To allow an application to initiate multiple request sequences that must be queued sequentially to explicitly create a write completion notification or any other order-based sequence, a generalized combined request API should be defined. > >Arkady Kanevsky email: [EMAIL PROTECTED] >Network Appliance Inc. phone: 781-768-5395 >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 >Waltham, MA 02451 central phone: 781-768-5300 > > ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
[EMAIL PROTECTED] wrote: > I just want to get consensus on the requirements before we get too > far. One thing I forgot is that with Infiniband, the receive with > immediate provides the size of the rdma write that just > completed. I think we should include this in the requirements > since there is ULP value here. > > -arlin > That *could* be done; it would be an eight byte message over iWARP: 4 for length and 4 for the message tag. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
I just want to get consensus on the requirements before we get too far. One thing I forgot is that with Infiniband, the receive with immediate provides the size of the rdma write that just completed. I think we should include this in the requirements since there is ULP value here. -arlin >-Original Message- >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >Sent: Monday, February 06, 2006 11:08 AM >To: Kanevsky, Arkady; Davis, Arlin R; Sean Hefty >Cc: [EMAIL PROTECTED]; openib-general@openib.org >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal > >Arlin, >It is too strong to state that Consumer should never send a message >equal in size to the size of immediate data. >Consumer knows from the context which one it is. >it may be based on dedicated connection, or based on ULP protocol >ordering. >Arkady > >Arkady Kanevsky email: [EMAIL PROTECTED] >Network Appliance Inc. phone: 781-768-5395 >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 >Waltham, MA 02451 central phone: 781-768-5300 > > >> -Original Message- >> From: Kanevsky, Arkady >> Sent: Monday, February 06, 2006 2:05 PM >> To: Davis, Arlin R; Sean Hefty >> Cc: [EMAIL PROTECTED]; openib-general@openib.org >> Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 >> immediatedataproposal >> >> Arlin, >> On Friday we agreed that receiver can not distinguish between >> 4 byte of Send or 4 bytes of Immediate data if RDMA Write >> with Immed is implemented as 2 operations: >> RDMA Write followed by Send. >> >> ULP Reciever "expects" Immediate data that is why it posts >> Recv. Depending on Transport capability it MAY complete as >> Recv or as Recv_RDMA_Write_with_Immed_in_event. >> >> Neither Provider not Consumer can distinguish between the >> cases unless there is additional info. >> >> Arkady >> >> Arkady Kanevsky email: [EMAIL PROTECTED] >> Network Appliance Inc. phone: 781-768-5395 >> 1601 Trapelo Rd. 
- Suite 16.Fax: 781-895-1195 >> Waltham, MA 02451 central phone: 781-768-5300 >> >> >> > -Original Message- >> > From: Davis, Arlin R [mailto:[EMAIL PROTECTED] >> > Sent: Monday, February 06, 2006 1:25 PM >> > To: Kanevsky, Arkady; Sean Hefty >> > Cc: [EMAIL PROTECTED]; openib-general@openib.org >> > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 >> > immediate dataproposal >> > >> > >> > Arkady, >> > >> > Your requirements are slightly different then the proposed set of >> > requirements. >> > >> > "iii) DAPL Provider does not provide any identification >> that that the >> > Receive operation matches remote RDMA Write with Immediate >> data if it >> > completes as Receive DTO. >> > >> >- It is up to an ULP to separate Receive completion of remote >> > Send from remote RDMA Write with Immediate Data." >> > >> > Tell me how this is possible? How can the application distinguish >> > between a 4 byte message and a 4 byte immediate data >> message? We would >> > have to add a new requirement... "If the provider supports >> immediate >> > data in the payload the ULP cannot send a message equal to the >> > immediate >> > data size". >> > >> > -arlin >> > >> > >-Original Message- >> > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >> > >Sent: Monday, February 06, 2006 8:08 AM >> > >To: Sean Hefty; Davis, Arlin R >> > >Cc: [EMAIL PROTECTED]; openib-general@openib.org >> > >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT >> > 2.0 immediate >> > dataproposal >> > > >> > >Here are the changes to the existing requirements chapters >> for RDMA >> > >Write with Immediate Data. >> > > >> > >Feedback please. >> > >Arkady >> > > >> > >Arkady Kanevsky email: [EMAIL PROTECTED] >> > >Network Appliance Inc. phone: 781-768-5395 >> > >1601 Trapelo Rd. 
- Suite 16.Fax: 781-895-1195 >> > >Waltham, MA 02451 central phone: 781-768-5300 >> > > >> > > >> > >> -Original Message- >> > >> From: Sean Hefty [mailto:[EMAIL PROTECTED] >> > >> Sent: Friday, February 03, 2006 7:30 PM >> > >> To: Davis, Arlin R >> > >> Cc: [EMAIL PROTECTED]; openib-general@openib.org >> > >> Subject: Re: [dat-discussions] [openib-general] [RFC] DAT 2.0 >> > >> immediate dataproposal >> > >> >> > >> Davis, Arlin R wrote: >> > >> > "Applications need an optimized mechanism to notify the >> > >> receiving end >> > >> > that RDMA write data has completed beyond the two >> > operation method >> > >> > currently used (RDMA write followed by message send). >> > This new RDMA >> > >> > write feature will support 4-bytes of inline data that >> > will be sent >> > >> >> > >> Is there any reason to restrict the size of the immediate data? >> > >> Could you define the API such that the size is variable? >> I.e. the >> > >> provider can simply give the immediate data size, with 0 >> > indicating >> > >> that it is not support
[openib-general] questions about gen2 srp driver
i have been looking at the srp driver in the gen2 trunk (and the version that is in the latest 2.6.15 kernels). i have a couple of questions about its behavior and i am hoping someone can answer them. it seems to take scsi_host->host_lock with a spin_lock_irq() inside a couple of work queues. i believe work queues run at process context and not interrupt context. therefore, one should probably use spin_lock_irqsave()? secondly, there seems to be only one pair of lkeys/rkeys for a given srp "virtual" host. in srp_map_data() i see the rkey is assigned to the buffer: buf->key = cpu_to_be32(target->srp_host->mr->rkey); but the virtual host adapter template says: .can_queue = SRP_SQ_SIZE, .cmd_per_lun= SRP_SQ_SIZE, if there is only a single set of rdma keys how can the driver support more than one outstanding command (particularly on a target with multiple luns)? i didn't think the srp_post_send() was synchronous with respect to the completion of the current rdma request? ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [ANNOUNCE] DAPL BOF
There will be a DAPL BOF this evening from 19:30-20:00 in the Palm Tree Salon. I plan to setup a conference call for those of you who would like to participate remotely. Here is the info: phone: 888-867-8686 id: 1068642 ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
Roy, Can you explain, please? For IB the operation will be layered properly on Transport primitive. And on Recv side it will indicate in completion event DTO that it matches RDMA Write with Immediate and that Immediate Data is in event. For iWARP I expect initially, it will be layered on RDMA Write followed by Send. The Provider can do post more efficiently than Consumer and guarantee atomicity. On Recv side Consumer will get Recv DTO completion in event and Immediate Data inline as specified by Provider Attribute. >From the performance point of view Consumers who program to IB only will have no performance degradation at all. But this API also allows Consumers to write ULP to be transport independent with minimal penalty: one binary comparison and extra 4 bytes in recv buffer. Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Larsen, Roy K [mailto:[EMAIL PROTECTED] > Sent: Monday, February 06, 2006 2:10 PM > To: Caitlin Bestler; [EMAIL PROTECTED]; > Kanevsky, Arkady; Sean Hefty > Cc: openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > If it is up to the ULP to separate out "normal" receive data > from that associated with a write immediate, how is this > different from the ULP doing a write followed by a send? If > there is no difference, then what we're really talking about > is a convenience to the initiating ULP. > > Perhaps what would be best is to construct an API that allows > the ULP to perform standard write/send operations into one > call which the underlying provider could optimize into one > transaction with the associated interconnect interface. > Better yet, a general request combining interface would have > even more value, but calling this write/send "immediate" data > is a stretch, if not downright silly. 
Some transports have > true immediate data that provides unique value. There is > nothing unique in a write/send sequence - ULPs do it all the time... > > Roy > > -Original Message- > From: [EMAIL PROTECTED] > [mailto:[EMAIL PROTECTED] On Behalf Of > Caitlin Bestler > Sent: Monday, February 06, 2006 10:48 AM > To: [EMAIL PROTECTED]; Kanevsky, Arkady; Sean Hefty > Cc: openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > [EMAIL PROTECTED] wrote: > > Arkady, > > > > Your requirements are slightly different then the proposed set of > > requirements. > > > > "iii) DAPL Provider does not provide any identification > that that the > > Receive operation matches remote RDMA Write with Immediate > data if it > > completes as Receive DTO. > > > > - It is up to an ULP to separate Receive completion of remote > > Send from remote RDMA Write with Immediate Data." > > > > Tell me how this is possible? How can the application distinguish > > between a 4 byte message and a 4 byte immediate data > message? We would > > have to add a new requirement... "If the provider supports > immediate > > data in the payload the ULP cannot send a message equal to the > > immediate data size". > > > > The data sink knows whether the 4 bytes was sent as a message > or as an immediate because it is clear in the ULP context. > Possible methods: > The expected completion is an immediate. > All 4 byte messages are immediates. > All 4 byte messages where the ms-byte is X are immediate. > If its Tuesday its an immediate. > If it's a prime number its an immediate > ... > > But there is no clue from the transport layer. 
> > ___ > openib-general mailing list > openib-general@openib.org > http://openib.org/mailman/listinfo/openib-general > > To unsubscribe, please visit > http://openib.org/mailman/listinfo/openib-general > ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
Larsen, Roy K wrote: > If it is up to the ULP to separate out "normal" receive data > from that associated with a write immediate, how is this > different from the ULP doing a write followed by a send? If > there is no difference, then what we're really talking about > is a convenience to the initiating ULP. > > Perhaps what would be best is to construct an API that allows > the ULP to perform standard write/send operations into one > call which the underlying provider could optimize into one > transaction with the associated interconnect interface. > Better yet, a general request combining interface would have > even more value, but calling this write/send "immediate" data > is a stretch, if not downright silly. Some transports have > true immediate data that provides unique value. There is > nothing unique in a write/send sequence - ULPs do it all the time... > The data provided is to identify the completion notification that completes the RDMA Write to the data sink. So, yes, it is not really an "immediate" value. We could consider a better name for it, much as we renamed QP to something better. But the meaning is "the tag value associated with a specific RDMA Message". It is delivered in order, after that RDMA Message has fully completed. What varies by transport is *how* it is delivered. We are considering identifying it as a single work request so that transport-specific contraction to a single wire message is enabled. But we don't want to change any of the semantics vs. the application doing Write then Send. The new call enables an optimization, but should not change the overall semantics. That could extend as far as having the receiver recognize the alternate reception. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediate dataproposal
I should stress that the only "additional" requirement I had added beyond the DAT meeting agreement is Provider attribute for the size of Immediate Data. It will be set to 4 bytes in DAT now . But this may not be cast in stone permanently. Arkady Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Kanevsky, Arkady > Sent: Monday, February 06, 2006 11:08 AM > To: Sean Hefty; Davis, Arlin R > Cc: [EMAIL PROTECTED]; openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediate dataproposal > > Here are the changes to the existing requirements chapters > for RDMA Write with Immediate Data. > > Feedback please. > Arkady > > Arkady Kanevsky email: [EMAIL PROTECTED] > Network Appliance Inc. phone: 781-768-5395 > 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > Waltham, MA 02451 central phone: 781-768-5300 > > > > -Original Message- > > From: Sean Hefty [mailto:[EMAIL PROTECTED] > > Sent: Friday, February 03, 2006 7:30 PM > > To: Davis, Arlin R > > Cc: [EMAIL PROTECTED]; openib-general@openib.org > > Subject: Re: [dat-discussions] [openib-general] [RFC] DAT 2.0 > > immediate dataproposal > > > > Davis, Arlin R wrote: > > > "Applications need an optimized mechanism to notify the > > receiving end > > > that RDMA write data has completed beyond the two > operation method > > > currently used (RDMA write followed by message send). > This new RDMA > > > write feature will support 4-bytes of inline data that > will be sent > > > > Is there any reason to restrict the size of the immediate > data? Could > > you define the API such that the size is variable? I.e. > the provider > > can simply give the immediate data size, with 0 indicating > that it is > > not supported. > > > > > It should avoid > > > any latency penalties normally associated with a two > > operation method. 
> > > > I would state this as a requirement. A write followed by a send > > should be pushed to the application, since they may be able > to provide > > additional optimizations (such as combining > > operations) beyond what a provider could. > > > > > The initiating side must expose a 4-byte immediate data > > parameter for > > > the application to set the inline data. The receiving side must > > > provide a mechanism to accept the 4-byte immediate data. On the > > > receiving side, the write with immediate completion > notification is > > > indicated through a receive completion. It is the > responsibility of > > > the provider to identify to the application 4-byte > > immediate data from > > > a normal 4-byte send message. The inline byte ordering is > > application specific." > > > > Requirements look good to me. > > > > - Sean > > ___ > > openib-general mailing list > > openib-general@openib.org > > http://openib.org/mailman/listinfo/openib-general > > > > To unsubscribe, please visit > > http://openib.org/mailman/listinfo/openib-general > > > > > > Yahoo! Groups Links > > <*> To visit your group on the web, go to: > http://groups.yahoo.com/group/dat-discussions/ > > <*> To unsubscribe from this group, send an email to: > [EMAIL PROTECTED] > > <*> Your use of Yahoo! Groups is subject to: > http://docs.yahoo.com/info/terms/ > > ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
If it is up to the ULP to separate out "normal" receive data from that associated with a write immediate, how is this different from the ULP doing a write followed by a send? If there is no difference, then what we're really talking about is a convenience to the initiating ULP. Perhaps what would be best is to construct an API that allows the ULP to perform standard write/send operations into one call which the underlying provider could optimize into one transaction with the associated interconnect interface. Better yet, a general request combining interface would have even more value, but calling this write/send "immediate" data is a stretch, if not downright silly. Some transports have true immediate data that provides unique value. There is nothing unique in a write/send sequence - ULPs do it all the time... Roy -Original Message- From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED] On Behalf Of Caitlin Bestler Sent: Monday, February 06, 2006 10:48 AM To: [EMAIL PROTECTED]; Kanevsky, Arkady; Sean Hefty Cc: openib-general@openib.org Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal [EMAIL PROTECTED] wrote: > Arkady, > > Your requirements are slightly different then the proposed set of > requirements. > > "iii) DAPL Provider does not provide any identification that > that the Receive operation matches remote RDMA Write with > Immediate data if it completes as Receive DTO. > > - It is up to an ULP to separate Receive completion of remote > Send from remote RDMA Write withImmediate Data." > > Tell me how this is possible? How can the application > distinguish between a 4 byte message and a 4 byte immediate > data message? We would have to add a new requirement... "If > the provider supports immediate data in the payload the ULP > cannot send a message equal to the immediate > data size". > The data sink knows whether the 4 bytes was sent as a message or as an immediate because it is clear in the ULP context. 
Possible methods: The expected completion is an immediate. All 4 byte messages are immediates. All 4 byte messages where the ms-byte is X are immediate. If its Tuesday its an immediate. If it's a prime number its an immediate ... But there is no clue from the transport layer. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediatedataproposal
Arlin, It is too strong to state that Consumer should never send a message equal in size to the size of immediate data. Consumer knows from the context which one it is. it may be based on dedicated connection, or based on ULP protocol ordering. Arkady Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Kanevsky, Arkady > Sent: Monday, February 06, 2006 2:05 PM > To: Davis, Arlin R; Sean Hefty > Cc: [EMAIL PROTECTED]; openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediatedataproposal > > Arlin, > On Friday we agreed that receiver can not distinguish between > 4 byte of Send or 4 bytes of Immediate data if RDMA Write > with Immed is implemented as 2 operations: > RDMA Write followed by Send. > > ULP Reciever "expects" Immediate data that is why it posts > Recv. Depending on Transport capability it MAY complete as > Recv or as Recv_RDMA_Write_with_Immed_in_event. > > Neither Provider not Consumer can distinguish between the > cases unless there is additional info. > > Arkady > > Arkady Kanevsky email: [EMAIL PROTECTED] > Network Appliance Inc. phone: 781-768-5395 > 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > Waltham, MA 02451 central phone: 781-768-5300 > > > > -Original Message- > > From: Davis, Arlin R [mailto:[EMAIL PROTECTED] > > Sent: Monday, February 06, 2006 1:25 PM > > To: Kanevsky, Arkady; Sean Hefty > > Cc: [EMAIL PROTECTED]; openib-general@openib.org > > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > > immediate dataproposal > > > > > > Arkady, > > > > Your requirements are slightly different then the proposed set of > > requirements. > > > > "iii) DAPL Provider does not provide any identification > that that the > > Receive operation matches remote RDMA Write with Immediate > data if it > > completes as Receive DTO. 
> > > > - It is up to an ULP to separate Receive completion of remote > > Send from remote RDMA Write with Immediate Data." > > > > Tell me how this is possible? How can the application distinguish > > between a 4 byte message and a 4 byte immediate data > message? We would > > have to add a new requirement... "If the provider supports > immediate > > data in the payload the ULP cannot send a message equal to the > > immediate > > data size". > > > > -arlin > > > > >-Original Message- > > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > > >Sent: Monday, February 06, 2006 8:08 AM > > >To: Sean Hefty; Davis, Arlin R > > >Cc: [EMAIL PROTECTED]; openib-general@openib.org > > >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT > > 2.0 immediate > > dataproposal > > > > > >Here are the changes to the existing requirements chapters > for RDMA > > >Write with Immediate Data. > > > > > >Feedback please. > > >Arkady > > > > > >Arkady Kanevsky email: [EMAIL PROTECTED] > > >Network Appliance Inc. phone: 781-768-5395 > > >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > > >Waltham, MA 02451 central phone: 781-768-5300 > > > > > > > > >> -Original Message- > > >> From: Sean Hefty [mailto:[EMAIL PROTECTED] > > >> Sent: Friday, February 03, 2006 7:30 PM > > >> To: Davis, Arlin R > > >> Cc: [EMAIL PROTECTED]; openib-general@openib.org > > >> Subject: Re: [dat-discussions] [openib-general] [RFC] DAT 2.0 > > >> immediate dataproposal > > >> > > >> Davis, Arlin R wrote: > > >> > "Applications need an optimized mechanism to notify the > > >> receiving end > > >> > that RDMA write data has completed beyond the two > > operation method > > >> > currently used (RDMA write followed by message send). > > This new RDMA > > >> > write feature will support 4-bytes of inline data that > > will be sent > > >> > > >> Is there any reason to restrict the size of the immediate data? > > >> Could you define the API such that the size is variable? > I.e. 
the > > >> provider can simply give the immediate data size, with 0 > > indicating > > >> that it is not supported. > > >> > > >> > It should avoid > > >> > any latency penalties normally associated with a two > > >> operation method. > > >> > > >> I would state this as a requirement. A write followed by a send > > >> should be pushed to the application, since they may be able to > > >> provide additional optimizations (such as combining > > >> operations) beyond what a provider could. > > >> > > >> > The initiating side must expose a 4-byte immediate data > > >> parameter for > > >> > the application to set the inline data. The receiving > side must > > >> > provide a mechanism to accept the 4-byte immediate > data. On the > > >> > receiving side, the write with im
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediate dataproposal
Arlin, On Friday we agreed that receiver can not distinguish between 4 byte of Send or 4 bytes of Immediate data if RDMA Write with Immed is implemented as 2 operations: RDMA Write followed by Send. ULP Reciever "expects" Immediate data that is why it posts Recv. Depending on Transport capability it MAY complete as Recv or as Recv_RDMA_Write_with_Immed_in_event. Neither Provider not Consumer can distinguish between the cases unless there is additional info. Arkady Arkady Kanevsky email: [EMAIL PROTECTED] Network Appliance Inc. phone: 781-768-5395 1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 Waltham, MA 02451 central phone: 781-768-5300 > -Original Message- > From: Davis, Arlin R [mailto:[EMAIL PROTECTED] > Sent: Monday, February 06, 2006 1:25 PM > To: Kanevsky, Arkady; Sean Hefty > Cc: [EMAIL PROTECTED]; openib-general@openib.org > Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 > immediate dataproposal > > > Arkady, > > Your requirements are slightly different then the proposed > set of requirements. > > "iii) DAPL Provider does not provide any identification that > that the Receive operation matches remote RDMA Write with > Immediate data if it completes as Receive DTO. > > - It is up to an ULP to separate Receive completion of remote > Send from remote RDMA Write withImmediate Data." > > Tell me how this is possible? How can the application > distinguish between a 4 byte message and a 4 byte immediate > data message? We would have to add a new requirement... "If > the provider supports immediate data in the payload the ULP > cannot send a message equal to the immediate > data size". 
> > -arlin > > >-Original Message- > >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] > >Sent: Monday, February 06, 2006 8:08 AM > >To: Sean Hefty; Davis, Arlin R > >Cc: [EMAIL PROTECTED]; openib-general@openib.org > >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT > 2.0 immediate > dataproposal > > > >Here are the changes to the existing requirements chapters for RDMA > >Write with Immediate Data. > > > >Feedback please. > >Arkady > > > >Arkady Kanevsky email: [EMAIL PROTECTED] > >Network Appliance Inc. phone: 781-768-5395 > >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 > >Waltham, MA 02451 central phone: 781-768-5300 > > > > > >> -Original Message- > >> From: Sean Hefty [mailto:[EMAIL PROTECTED] > >> Sent: Friday, February 03, 2006 7:30 PM > >> To: Davis, Arlin R > >> Cc: [EMAIL PROTECTED]; openib-general@openib.org > >> Subject: Re: [dat-discussions] [openib-general] [RFC] DAT 2.0 > >> immediate dataproposal > >> > >> Davis, Arlin R wrote: > >> > "Applications need an optimized mechanism to notify the > >> receiving end > >> > that RDMA write data has completed beyond the two > operation method > >> > currently used (RDMA write followed by message send). > This new RDMA > >> > write feature will support 4-bytes of inline data that > will be sent > >> > >> Is there any reason to restrict the size of the immediate data? > >> Could you define the API such that the size is variable? I.e. the > >> provider can simply give the immediate data size, with 0 > indicating > >> that it is not supported. > >> > >> > It should avoid > >> > any latency penalties normally associated with a two > >> operation method. > >> > >> I would state this as a requirement. A write followed by a send > >> should be pushed to the application, since they may be able to > >> provide additional optimizations (such as combining > >> operations) beyond what a provider could. 
> >> > >> > The initiating side must expose a 4-byte immediate data > >> parameter for > >> > the application to set the inline data. The receiving side must > >> > provide a mechanism to accept the 4-byte immediate data. On the > >> > receiving side, the write with immediate completion > notification is > >> > indicated through a receive completion. It is the > responsibility of > >> > the provider to identify to the application 4-byte > >> immediate data from > >> > a normal 4-byte send message. The inline byte ordering is > >> application specific." > >> > >> Requirements look good to me. > >> > >> - Sean > >> ___ > >> openib-general mailing list > >> openib-general@openib.org > >> http://openib.org/mailman/listinfo/openib-general > >> > >> To unsubscribe, please visit > >> http://openib.org/mailman/listinfo/openib-general > >> > ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [ANNOUNCE] iSER BOF at OpenIB workshop
For those of you at the OpenIB workshop, there will be an iSER BOF this evening from 18:30-19:30 in the Palm Tree Salon. This BOF will cover iSER in general and discuss the development of an open source Linux iSER target on the OpenIB stack in particular. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediate dataproposal
[EMAIL PROTECTED] wrote: > Arkady, > > Your requirements are slightly different than the proposed set of > requirements. > > "iii) DAPL Provider does not provide any identification > that the Receive operation matches remote RDMA Write with > Immediate data if it completes as Receive DTO. > > - It is up to an ULP to separate Receive completion of remote > Send from remote RDMA Write with Immediate Data." > > Tell me how this is possible? How can the application > distinguish between a 4 byte message and a 4 byte immediate > data message? We would have to add a new requirement... "If > the provider supports immediate data in the payload the ULP > cannot send a message equal to the immediate > data size". > The data sink knows whether the 4 bytes were sent as a message or as an immediate because it is clear in the ULP context. Possible methods: The expected completion is an immediate. All 4 byte messages are immediates. All 4 byte messages where the ms-byte is X are immediate. If it's Tuesday it's an immediate. If it's a prime number it's an immediate ... But there is no clue from the transport layer. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: ipoib_mcast_send.patch
Michael> Maybe the description is not clear enough. There are two Michael> issues here with two separate fixes. OK, got it now. Michael> 1. IPOIB_MCAST_STARTED - solves the first issue Michael> 2. Checking priv->broadcast in ipoib_mcast_send here: + Michael> if (!test_bit(IPOIB_MCAST_STARTED, &priv->flags) || Michael> !priv->broadcast) { - solves the second issue Makes sense. Related to this, the way priv->broadcast is initialized in ipoib_mcast_join_task() looks somewhat unsafe, since there's no lock and conceivably a send-only join could complete before priv->broadcast is fully set up. What do you think? Michael> They just got rolled into one patch because they touch Michael> the same code lines. Michael> Do you want me to split them up? No, I can handle it. Thanks... - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: mthca: gid index bug?
> Roland, in mthca_qp.c we have > > path->mgid_index = ah->grh.sgid_index; > > Shouldn't the port number be taken into account, like it > is with mthca_av, where we have > av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len + I really don't know. The PRM just says "index to port GID table". Can you check it out at Mellanox and (even better) generate a patch if it's wrong? Thanks, Roland ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediate dataproposal
Arkady, Your requirements are slightly different then the proposed set of requirements. "iii) DAPL Provider does not provide any identification that that the Receive operation matches remote RDMA Write with Immediate data if it completes as Receive DTO. - It is up to an ULP to separate Receive completion of remote Send from remote RDMA Write with Immediate Data." Tell me how this is possible? How can the application distinguish between a 4 byte message and a 4 byte immediate data message? We would have to add a new requirement... "If the provider supports immediate data in the payload the ULP cannot send a message equal to the immediate data size". -arlin >-Original Message- >From: Kanevsky, Arkady [mailto:[EMAIL PROTECTED] >Sent: Monday, February 06, 2006 8:08 AM >To: Sean Hefty; Davis, Arlin R >Cc: [EMAIL PROTECTED]; openib-general@openib.org >Subject: RE: [dat-discussions] [openib-general] [RFC] DAT 2.0 immediate dataproposal > >Here are the changes to the existing requirements chapters >for RDMA Write with Immediate Data. > >Feedback please. >Arkady > >Arkady Kanevsky email: [EMAIL PROTECTED] >Network Appliance Inc. phone: 781-768-5395 >1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195 >Waltham, MA 02451 central phone: 781-768-5300 > > >> -Original Message- >> From: Sean Hefty [mailto:[EMAIL PROTECTED] >> Sent: Friday, February 03, 2006 7:30 PM >> To: Davis, Arlin R >> Cc: [EMAIL PROTECTED]; openib-general@openib.org >> Subject: Re: [dat-discussions] [openib-general] [RFC] DAT 2.0 >> immediate dataproposal >> >> Davis, Arlin R wrote: >> > "Applications need an optimized mechanism to notify the >> receiving end >> > that RDMA write data has completed beyond the two operation method >> > currently used (RDMA write followed by message send). This new RDMA >> > write feature will support 4-bytes of inline data that will be sent >> >> Is there any reason to restrict the size of the immediate >> data? Could you define the API such that the size is >> variable? I.e. 
the provider can simply give the immediate >> data size, with 0 indicating that it is not supported. >> >> > It should avoid >> > any latency penalties normally associated with a two >> operation method. >> >> I would state this as a requirement. A write followed by a >> send should be pushed to the application, since they may be >> able to provide additional optimizations (such as combining >> operations) beyond what a provider could. >> >> > The initiating side must expose a 4-byte immediate data >> parameter for >> > the application to set the inline data. The receiving side must >> > provide a mechanism to accept the 4-byte immediate data. On the >> > receiving side, the write with immediate completion notification is >> > indicated through a receive completion. It is the responsibility of >> > the provider to identify to the application 4-byte >> immediate data from >> > a normal 4-byte send message. The inline byte ordering is >> application specific." >> >> Requirements look good to me. >> >> - Sean >> ___ >> openib-general mailing list >> openib-general@openib.org >> http://openib.org/mailman/listinfo/openib-general >> >> To unsubscribe, please visit >> http://openib.org/mailman/listinfo/openib-general >> ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] Re: [PATCH] change Mellanox SDP workaround to a moduleparameter
Michael> Do we want this as a compile-time option too, for people Michael> that might compile SDP in kernel? module options can be set on the kernel command line. - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] mthca: gid index bug?
Roland, in mthca_qp.c we have path->mgid_index = ah->grh.sgid_index; Shouldn't the port number be taken into account, like it is with mthca_av, where we have av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len + -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [PATCH] use set_current_state() in SDP
Quoting r. Ralph Campbell <[EMAIL PROTECTED]>: > > On Fri, 2006-02-03 at 17:06 -0800, Roland Dreier wrote: > > I think both of these places can use __set_current_state(). > > > > - R. > > Good point. Here is the updated patch. > > Signed-off-by: Ralph Campbell <[EMAIL PROTECTED]> Hmm. We would be using wait_event_exclusive except there is no such beast. I wonder whether we can switch to at least use prepare_to_wait_exclusive/finish_wait? -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [PATCH] change Mellanox SDP workaround to a moduleparameter
Quoting r. Ralph Campbell <[EMAIL PROTECTED]>: > Subject: [PATCH] change Mellanox SDP workaround to a moduleparameter > > This patch changes the hardwired MTU limit of 1024 in SDP > into a module parameter so it can be disabled for HCAs > without the RC performance problem. > > Signed-off-by: Ralph Campbell <[EMAIL PROTECTED]> Hmm. Do we want this as a compile-time option too, for people that might compile SDP in kernel? +module_param(sdp_path_mtu_max, int, 0); Why 0? Let's make this editable from sysfs? -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] iser: 4 change sets to the code
r5314 | ogerlitz | 2006-02-06 17:47:06 +0200 (Mon, 06 Feb 2006) | 5 lines connection establishment error flow bugfixes: don't call rdma_destroy_id from the cma callback flow and don't call sock_release when the socket might be touched later. Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> r5312 | ogerlitz | 2006-02-06 17:39:26 +0200 (Mon, 06 Feb 2006) | 4 lines moved the code of conn init/connect/release from iser_conn.c to iser_verbs.c, cleanups Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> r5311 | ogerlitz | 2006-02-06 17:34:23 +0200 (Mon, 06 Feb 2006) | 4 lines deallocate adaptor (shared IB resources among iser connections) when there's no demand Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> r5309 | ogerlitz | 2006-02-06 17:26:57 +0200 (Mon, 06 Feb 2006) | 4 lines various cleanups, cosmetic changes for coding conventions Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] Re: [PATCH] Opensm - osm_reg_sig_handler in Windows
On 14:25 Mon 06 Feb , Michael S. Tsirkin wrote: > > As was pointed out several times, we dont really need a signal > handler in linux, The signals are used. For instance SIGHUP will initiate re-sweep (I use it frequently), clean exit is done with SIGINT and SIGTERM. If signals are not supported in windows this can be simply masked in less aggressive way, Something like: #define signal(a,b) , or #define cl_reg_sig_hdl(a,b) (or something better) in windows specific common header file. Sasha. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [openib-general] Get Table Records for SA Attribute ID ?
Hi, There are a couple of issues with the below. 1. SA MAD structure is missing the RMPP header. Once I saw that I didn't check for further issues with the format. 2. I will assume your register call sets RMPP. 3. SA class version is 2. What SM are you using ? If you are using OpenSM, you can turn on verbose and see if the packet is seen by the SM. You could also enable madeye (in utils) to see if the packet is sent (and if anything is received back). -- Hal From: [EMAIL PROTECTED] on behalf of Takshak C. Sent: Mon 2/6/2006 8:00 AM To: openib-general@openib.org Subject: [openib-general] Get Table Records for SA Attribute ID ? Hi, I m trying to get the table records for SA attribute ID in following way. But, I m not getting a single record, could anyone comment on the problem. 1. I have created saMadFormat structure described in the specification as below: struct saMadFormat { uint8_t base_version ; uint8_t mgmt_class ; uint8_t class_version ; uint8_t sa_method ; uint16_tstatus ; uint16_tnot_used ; uint64_ttid ; uint16_tattr_id ; uint16_tresv ; uint32_tattr_mod ; uint64_tsa_key; uint64_tsm_key ; uint32_tseg_num ; uint32_tpayload_len ; uint8_t frag_flag ; uint8_t edit_mod ; uint16_twindow ; uint32_tendRID ; uint64_tcomp_mask ; uint8_t adminData[192] ; }; 2. Then I have done all the basic operations like umad_open, umad_register for the IB_SA_CLASS and umad_open_port etc successfully. 3. struct saMadFormat *saQuery = (struct saMadFormat*)(umad_get_mad(umad)); memset(saQuery, 0, sizeof(*saQuery)); saQuery->base_version = 1; saQuery->mgmt_class = IB_SA_CLASS ; saQuery->class_version = 1 ; saQuery->sa_method = IB_MAD_METHOD_GET_TABLE ; saQuery->attr_id = IB_SA_ATTR_PATHRECORD ; saQuery->attr_mod = 0 ; saQuery->tid = htonll(drmad_tid++); saQuery->endRID = 0 ; umad_set_addr(umad, lid, 1, 0, IB_DEFAULT_QP1_QKEY); umad_set_grh(umad, 0); umad_set_pkey(umad, 0x); 4. 
length = IB_MAD_SIZE; if (umad_send(portid, mad_agent, umad, length, timeout_ms, 0) < 0) IBPANIC("send failed"); if (umad_recv(portid, umad, &length, -1) != mad_agent) IBPANIC("recv error: %s", drmad_status_str(saQuery)); if (!dump_char) { xdump(stdout, 0, saQuery->adminData, 192); return 0; } I m expecting that, I will get the resultant data in saQuery->adminData. Is this correct ? If not then, how should I retrieve the table records ? Any Idea ? Thanks - Takshak ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
RE: [openib-general] Re: does the mthca driver support RTS->SQD event request?
Roland> I'm not sure whether that interpretation is correct or not. In any Roland> case, it seems that Mellanox HCAs only support enabling the event on Roland> the RTS->SQD transition. This is correct Tziporet ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Get Table Records for SA Attribute ID ?
Hi, I m trying to get the table records for SA attribute ID in following way. But, I m not getting a single record, could anyone comment on the problem. 1. I have created saMadFormat structure described in the specification as below: struct saMadFormat { uint8_t base_version ; uint8_t mgmt_class ; uint8_t class_version ; uint8_t sa_method ; uint16_tstatus ; uint16_tnot_used ; uint64_ttid ; uint16_tattr_id ; uint16_tresv ; uint32_tattr_mod ; uint64_tsa_key; uint64_tsm_key ; uint32_tseg_num ; uint32_tpayload_len ; uint8_t frag_flag ; uint8_t edit_mod ; uint16_twindow ; uint32_tendRID ; uint64_tcomp_mask ; uint8_t adminData[192] ; }; 2. Then I have done all the basic operations like umad_open, umad_register for the IB_SA_CLASS and umad_open_port etc successfully. 3. struct saMadFormat *saQuery = (struct saMadFormat*)(umad_get_mad(umad)); memset(saQuery, 0, sizeof(*saQuery)); saQuery->base_version = 1; saQuery->mgmt_class = IB_SA_CLASS ; saQuery->class_version = 1 ; saQuery->sa_method = IB_MAD_METHOD_GET_TABLE ; saQuery->attr_id = IB_SA_ATTR_PATHRECORD ; saQuery->attr_mod = 0 ; saQuery->tid = htonll(drmad_tid++); saQuery->endRID = 0 ; umad_set_addr(umad, lid, 1, 0, IB_DEFAULT_QP1_QKEY); umad_set_grh(umad, 0); umad_set_pkey(umad, 0x); 4. length = IB_MAD_SIZE; if (umad_send(portid, mad_agent, umad, length, timeout_ms, 0) < 0) IBPANIC("send failed"); if (umad_recv(portid, umad, &length, -1) != mad_agent) IBPANIC("recv error: %s", drmad_status_str(saQuery)); if (!dump_char) { xdump(stdout, 0, saQuery->adminData, 192); return 0; } I m expecting that, I will get the resultant data in saQuery->adminData. Is this correct ? If not then, how should I retrieve the table records ? Any Idea ? Thanks - Takshak ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [PATCH] Opensm - osm_reg_sig_handler in Windows
Quoting r. Yael Kalka <[EMAIL PROTECTED]>: > The signal handling for catching ^C (SIGINT) was deleted before. Oops, should have looked at the context. You are right, sorry. -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - add syslog prints in windows
Hi Hal, Currently SYSLOG prints are not executed under Windows. The following patch adds these printings to the Windows stack as well. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: opensm/osm_log.c === --- opensm/osm_log.c(revision 5307) +++ opensm/osm_log.c(working copy) @@ -105,6 +105,8 @@ osm_log( usecs = time_usecs % 100; localtime_r(&tim, &result); +#endif /* WIN32 */ + /* If this is a call to syslog - always print it */ if ( verbosity & OSM_LOG_SYS ) { @@ -122,16 +124,21 @@ osm_log( } /* send it also to the log file */ +#ifdef WIN32 +GetLocalTime(&st); +fprintf( p_log->out_port, "[%02d:%02d:%02d:%03d][%04X] -> %s", + st.wHour, st.wMinute, st.wSecond, st.wMilliseconds, + pid, buffer); +#else fprintf( p_log->out_port, "%s %02d %02d:%02d:%02d %06d [%04X] -> %s\n", (result.tm_mon < 12 ? month_str[result.tm_mon] : "???"), result.tm_mday, result.tm_hour, result.tm_min, result.tm_sec, usecs, pid, buffer); fflush( p_log->out_port ); - +#endif } -#endif /* WIN32 */ /* SYS messages go to the log anyways */ if (p_log->level & verbosity) ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - clean osm_vendor_mlx_sa.c code
Hi Hal, Currently in osm_vendor_mlx_sa.c the sent context is saved arbitrarily as nodeInfo_context. This results in need for strange castings from long to pointer and vice-versa. The following patch adds another possible context - arbitrary context, which will be used in this case. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: libvendor/osm_vendor_mlx_sa.c === --- libvendor/osm_vendor_mlx_sa.c (revision 5307) +++ libvendor/osm_vendor_mlx_sa.c (working copy) @@ -96,9 +96,9 @@ __osmv_sa_mad_rcv_cb( goto Exit; } - /* obtain the sent context since we store it during send in the ni_ctx */ + /* obtain the sent context */ p_query_req_copy = - (osmv_query_req_t *)CAST_P2LONG(p_req_madw->context.ni_context.node_guid); + (osmv_query_req_t *)(p_req_madw->context.arb_context.context1); /* provide the context of the original request in the result */ query_res.query_context = p_query_req_copy->query_context; @@ -207,7 +207,7 @@ __osmv_sa_mad_err_cb( /* Obtain the sent context etc */ p_query_req_copy = - (osmv_query_req_t *)CAST_P2LONG(p_madw->context.ni_context.node_guid); + (osmv_query_req_t *)(p_madw->context.arb_context.context1); /* provide the context of the original request in the result */ query_res.query_context = p_query_req_copy->query_context; @@ -561,10 +561,17 @@ __osmv_send_sa_req( /* Provide the address to send to */ + /* Patch to handle IBAL - host order , where it should take destination lid in network order */ +#ifdef OSM_VENDOR_INTF_AL + p_madw->mad_addr.dest_lid = p_bind->sm_lid; +#else p_madw->mad_addr.dest_lid = cl_hton16(p_bind->sm_lid); +#endif p_madw->mad_addr.addr_type.smi.source_lid = cl_hton16(p_bind->lid); p_madw->mad_addr.addr_type.gsi.remote_qp = CL_HTON32(1); + p_madw->mad_addr.addr_type.gsi.remote_qkey = IB_QP1_WELL_KNOWN_Q_KEY; + p_madw->mad_addr.addr_type.gsi.pkey = IB_DEFAULT_PKEY; p_madw->resp_expected = TRUE; p_madw->fail_msg = CL_DISP_MSGID_NONE; @@ -574,12 +581,11 @@ __osmv_send_sa_req( Since we can not rely on the 
client to keep it arroud until the response - we duplicate it and will later dispose it (in CB). To store on the MADW we cast it into what opensm has: -p_madw->context.ni_context.node_guid +p_madw->context.arb_context.context1 */ p_query_req_copy = cl_malloc(sizeof(*p_query_req_copy)); *p_query_req_copy = *p_query_req; - p_madw->context.ni_context.node_guid = -(ib_net64_t)CAST_P2LONG(p_query_req_copy); + p_madw->context.arb_context.context1 = p_query_req_copy; /* we can support async as well as sync calls */ sync = ((p_query_req->flags & OSM_SA_FLAGS_SYNC) == OSM_SA_FLAGS_SYNC); Index: include/opensm/osm_madw.h === --- include/opensm/osm_madw.h (revision 5307) +++ include/opensm/osm_madw.h (working copy) @@ -315,6 +315,22 @@ typedef struct _osm_vla_context boolean_t set_method; } osm_vla_context_t; /*/ +/s* OpenSM: MAD Wrapper/osm_arbitrary_context_t +* NAME +* osm_sa_context_t +* +* DESCRIPTION +* Context needed by arbitrary recipient. +* +* SYNOPSIS +*/ +typedef struct _osm_arbitrary_context +{ + void* context1; + void* context2; +} osm_arbitrary_context_t; +/*/ + /s* OpenSM: MAD Wrapper/osm_madw_context_t * NAME * osm_madw_context_t @@ -335,6 +351,7 @@ typedef union _osm_madw_context osm_smi_context_t smi_context; osm_slvl_context_t slvl_context; osm_pkey_context_t pkey_context; + osm_arbitrary_context_t arb_context; } osm_madw_context_t; /*/ @@ -880,6 +897,34 @@ osm_madw_get_vla_context_ptr( } /* * PARAMETERS +* p_madw +* [in] Pointer to an osm_madw_t object. +* +* RETURN VALUES +* Pointer to the start of the context structure. +* +* NOTES +* +* SEE ALSO +*/ + +/f* OpenSM: MAD Wrapper/osm_madw_get_arbitrary_context_ptr +* NAME +* osm_madw_get_arbitrary_context_ptr +* +* DESCRIPTION +* Gets a pointer to the arbitrary context in this MAD. 
+* +* SYNOPSIS +*/ +static inline osm_arbitrary_context_t* +osm_madw_get_arbitrary_context_ptr( + IN const osm_madw_t* const p_madw ) +{ + return( (osm_arbitrary_context_t*)&p_madw->context ); +} +/* +* PARAMETERS * p_madw * [in] Pointer to an osm_madw_t object. * ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] RE: [PATCH] Opensm - osm_reg_sig_handler in Windows
Michael, The signal handling for catching ^C (SIGINT) was deleted before. There are other signalling caught by OpenSM, for example SIGHUP, that enables triggering the OpenSM to do another heavy sweep. We do not want to remove this. Yael -Original Message- From: Michael S. Tsirkin Sent: Monday, February 06, 2006 2:25 PM To: Yael Kalka Cc: [EMAIL PROTECTED]; openib-general@openib.org Subject: Re: [PATCH] Opensm - osm_reg_sig_handler in Windows Quoting r. Yael Kalka <[EMAIL PROTECTED]>: > Subject: [PATCH] Opensm - osm_reg_sig_handler in Windows > > > Hi Hal, > > The osm_reg_sig_handler function is not supported in Windows. > The following patch adds the function only if non-Windows stack. > > Thanks, > Yael > > Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> As was pointed out several times, we dont really need a signal handler in linux, either, since driver detects the application exiting automatically. Can we kill it completely please? Work around for broken drivers that cant detect application exiting belongs in the vendor layer, not in opensm proper. -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] Re: [PATCH] Opensm - osm_reg_sig_handler in Windows
Quoting r. Yael Kalka <[EMAIL PROTECTED]>: > Subject: [PATCH] Opensm - osm_reg_sig_handler in Windows > > > Hi Hal, > > The osm_reg_sig_handler function is not supported in Windows. > The following patch adds the function only if non-Windows stack. > > Thanks, > Yael > > Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> As was pointed out several times, we don't really need a signal handler in linux, either, since driver detects the application exiting automatically. Can we kill it completely please? Workarounds for broken drivers that can't detect application exiting belong in the vendor layer, not in opensm proper. -- Michael S. Tsirkin Staff Engineer, Mellanox Technologies ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - osm_reg_sig_handler in Windows
Hi Hal, The osm_reg_sig_handler function is not supported in Windows. The following patch compiles the function only on non-Windows stacks. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: include/opensm/osm_opensm.h === --- include/opensm/osm_opensm.h (revision 5307) +++ include/opensm/osm_opensm.h (working copy) @@ -394,6 +394,7 @@ extern volatile int osm_exit_flag; * Set to one to cause all threads to leave */ +#ifndef __WIN__ /f* OpenSM: OpenSM/osm_reg_sig_handler * NAME * osm_reg_sig_handler @@ -417,6 +418,7 @@ IN osm_opensm_t* const p_osm); * * SEE ALSO */ +#endif /* __WIN__ */ END_C_DECLS Index: opensm/osm_opensm.c === --- opensm/osm_opensm.c (revision 5307) +++ opensm/osm_opensm.c (working copy) @@ -151,6 +151,7 @@ osm_opensm_create_mcgroups( /** * SHUT DOWN IS CONTROLLED BY A GLOBAL EXIT FLAG **/ +#ifndef __WIN__ static osm_opensm_t *__p_osm_to_signal; void @@ -191,6 +192,7 @@ osm_reg_sig_handler( return; } +#endif /* __WIN__ */ /** **/ ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - osm_ucast_mgr.c - use dynamic alloc
Hi Hal, The original static allocation doesn't compile in Windows. The attached patch replaces it with dynamic allocation. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: opensm/osm_ucast_mgr.c === --- opensm/osm_ucast_mgr.c (revision 5307) +++ opensm/osm_ucast_mgr.c (working copy) @@ -633,13 +633,31 @@ __osm_ucast_mgr_process_port( in providing better routing in LMC > 0 situations */ uint16_t lids_per_port = 1 << p_mgr->p_subn->opt.lmc; - uint64_t remote_sys_guids[lids_per_port]; - uint64_t remote_node_guids[lids_per_port]; + uint64_t* remote_sys_guids = NULL; + uint64_t* remote_node_guids = NULL; uint16_t num_used_sys = 0; uint16_t num_used_nodes = 0; OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_port ); + remote_sys_guids = cl_zalloc( sizeof(uint64_t) * lids_per_port ); + if( remote_sys_guids == NULL ) + { +osm_log( p_mgr->p_log, OSM_LOG_ERROR, + "__osm_ucast_mgr_process_port: ERR 3A09: " + "Cannot allocate array. Memory insufficient.\n"); +goto Exit; + } + + remote_node_guids = cl_zalloc( sizeof(uint64_t) * lids_per_port ); + if( remote_node_guids == NULL ) + { +osm_log( p_mgr->p_log, OSM_LOG_ERROR, + "__osm_ucast_mgr_process_port: ERR 3A0A: " + "Cannot allocate array. Memory insufficient.\n"); +goto Exit; + } + osm_port_get_lid_range_ho( p_port, &min_lid_ho, &max_lid_ho ); /* If the lids are zero - then there was some problem with the initialization. @@ -767,6 +785,8 @@ __osm_ucast_mgr_process_port( osm_switch_set_path( p_sw, lid_ho, port, is_ignored_by_port_prof); } Exit: + if (remote_sys_guids) cl_free(remote_sys_guids); + if (remote_node_guids) cl_free(remote_node_guids); OSM_LOG_EXIT( p_mgr->p_log ); } ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - osm_sa_path_record.c - variable declaration
Hi Hal, There was an issue discussed a while ago regarding declaration of several variables inside the function, in the code handling path record for multicast. Declaration in the middle of the function doesn't compile on windows, and in the past you said that your preferred approach is to add braces around the code handling the multicast path records. This patch adds these braces. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: opensm/osm_sa_path_record.c === --- opensm/osm_sa_path_record.c (revision 5307) +++ opensm/osm_sa_path_record.c (working copy) @@ -1753,7 +1753,7 @@ osm_pr_rcv_process( osm_log(p_rcv->p_log, OSM_LOG_DEBUG, "osm_pr_rcv_process: " "Multicast destination requested\n" ); - + { osm_mgrp_t *p_mgrp = NULL; ib_api_status_t status; osm_pr_item_t* p_pr_item; @@ -1815,6 +1815,7 @@ osm_pr_rcv_process( "MC group attributes don't match PathRecord request\n" ); } } + } /* Now, (finally) respond to the PathRecord request */ ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - fix casting for windows
Hi Hal, The following patch adds some missing casts and fixes object types to fix compilation errors in the windows stack, and adds some changes in osm_db_file.c to match the windows stack. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: opensm/osm_db_pack.c === --- opensm/osm_db_pack.c(revision 5307) +++ opensm/osm_db_pack.c(working copy) @@ -80,13 +80,13 @@ __osm_unpack_lids( if (! p_num) return 1; tmp = strtoul(p_num, NULL, 0); CL_ASSERT( tmp < 0x1 ); - *p_min_lid = tmp; + *p_min_lid = (uint16_t)tmp; p_num = strtok_r(NULL, " \t", &p_next); if (! p_num) return 1; tmp = strtoul(p_num, NULL, 0); CL_ASSERT( tmp < 0x1 ); - *p_max_lid = tmp; + *p_max_lid = (uint16_t)tmp; return 0; } Index: opensm/osm_lid_mgr.c === --- opensm/osm_lid_mgr.c(revision 5307) +++ opensm/osm_lid_mgr.c(working copy) @@ -742,7 +742,7 @@ void { cl_ptr_vector_t *p_discovered_vec = &p_mgr->p_subn->port_lid_tbl; uint16_t lid, min_lid, max_lid; - uint16_t max_tbl_lid = cl_ptr_vector_get_size( p_discovered_vec ); + uint16_t max_tbl_lid = (uint16_t)(cl_ptr_vector_get_size( p_discovered_vec )); osm_port_get_lid_range_ho(p_port, &min_lid, &max_lid); for (lid = min_lid; lid <= max_lid; lid++) Index: opensm/osm_pkey.c === --- opensm/osm_pkey.c (revision 5307) +++ opensm/osm_pkey.c (working copy) @@ -76,7 +76,7 @@ void osm_pkey_tbl_destroy( IN osm_pkey_tbl_t *p_pkey_tbl) { uint16_t num_blocks, i; - num_blocks = cl_ptr_vector_get_size( &p_pkey_tbl->blocks ); + num_blocks = (uint16_t)(cl_ptr_vector_get_size( &p_pkey_tbl->blocks )); for (i = 0; i < num_blocks; i++) cl_free(cl_ptr_vector_get( &p_pkey_tbl->blocks, i )); cl_ptr_vector_destroy( &p_pkey_tbl->blocks ); @@ -202,7 +202,8 @@ osm_physp_share_pkey( IN const osm_physp_t* const p_physp_1, IN const osm_physp_t* const p_physp_2 ) { - ib_net16_t *pkey1, *pkey2, pkey1_base, pkey2_base; + ib_net16_t *pkey1, *pkey2; + uint64_t pkey1_base, pkey2_base; const osm_pkey_tbl_t *pkey_tbl1, *pkey_tbl2; cl_map_iterator_t map_iter1, map_iter2; Index: 
opensm/osm_pkey_mgr.c === --- opensm/osm_pkey_mgr.c (revision 5307) +++ opensm/osm_pkey_mgr.c (working copy) @@ -234,7 +234,7 @@ osm_pkey_mgr_process( osm_node_t *p_node; osm_node_t *p_next_node; - uint32_t port_num; + uint8_t port_num; osm_physp_t *p_physp; osm_signal_t result = OSM_SIGNAL_DONE; Index: opensm/osm_trap_rcv.c === --- opensm/osm_trap_rcv.c (revision 5307) +++ opensm/osm_trap_rcv.c (working copy) @@ -135,7 +135,7 @@ osm_trap_rcv_aging_tracker_callback( /* We got an exit flag - do nothing */ return 0; - lid = (uint16_t)cl_ntoh16(( key & 0xULL) >> 32); + lid = cl_ntoh16((uint16_t)(( key & 0xULL) >> 32)); port_num = (uint8_t)(( key & 0x00FFULL) >> 48); p_physp = __get_physp_by_lid_and_num( p_rcv, lid, port_num ); Index: opensm/osm_ucast_updn.c === --- opensm/osm_ucast_updn.c (revision 5307) +++ opensm/osm_ucast_updn.c (working copy) @@ -620,7 +620,8 @@ updn_subn_rank( { /* Init local vars */ osm_port_t *p_root_port=NULL; - uint8_t tbl_size,rank=base_rank; + uint16_t tbl_size; + uint8_t rank=base_rank; osm_physp_t *p_physp, *p_remote_physp,*p_physp_temp; cl_list_t *p_currList,*p_nextList; cl_status_t did_cause_update; @@ -639,7 +640,7 @@ updn_subn_rank( p_currList = p_nextList; /* Check valid subnet & guid */ - tbl_size = cl_qmap_count(&(osm.subn.port_guid_tbl)); + tbl_size = (uint16_t)(cl_qmap_count(&(osm.subn.port_guid_tbl))); if (tbl_size == 0) { osm_log(&(osm.log), OSM_LOG_ERROR, @@ -1078,7 +1079,7 @@ osm_updn_find_root_nodes_by_min_hop( OUT uint8_t hop_val; uint16_t numHopBarsOverThd1 = 0; uint16_t numHopBarsOverThd2 = 0; -float thd1,thd2; +double thd1,thd2; p_sw = p_next_sw; /* Roll to the next switch */ ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - osm_db_file.c - windows fixes
Hi Hal, The following patch adds some changes in osm_db_file.c to match the windows stack. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: opensm/osm_db_files.c === --- opensm/osm_db_files.c (revision 5307) +++ opensm/osm_db_files.c (working copy) @@ -172,6 +172,12 @@ osm_db_init( if ( p_db_imp->db_dir_name == NULL ) p_db_imp->db_dir_name = OSM_DEFAULT_CACHE_DIR; + /* create the directory if it doesn't exist */ + /* There is difference between creating in windows and in linux */ +#ifdef __WIN__ + /* Check if the directory exists. If not - create it. */ + CreateDirectory(p_db_imp->db_dir_name, NULL); +#else /* __WIN__ */ /* make sure the directory exists */ if (lstat(p_db_imp->db_dir_name, &dstat)) { @@ -185,6 +191,7 @@ osm_db_init( return 1; } } +#endif p_db->p_log = p_log; p_db->p_db_imp = (void*)p_db_imp; @@ -466,6 +473,14 @@ osm_db_store( fclose(p_file); /* move the domain file */ + status = remove(p_domain_imp->file_name); + if (status) + { +osm_log( p_log, OSM_LOG_ERROR, + "osm_db_store: ERR 6909: " + " Fail to remove file:%s (err:%u)\n", + p_domain_imp->file_name, status); + } status = rename(p_tmp_file_name, p_domain_imp->file_name); if (status) { ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - cl_event_wheel casting
Hi Hal, The following patch makes the cast explicit, to avoid compilation errors in Windows. It also adds a clear message if the timeout is trimmed (due to the cast). Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: complib/cl_event_wheel.c === --- complib/cl_event_wheel.c(revision 5307) +++ complib/cl_event_wheel.c(working copy) @@ -426,8 +426,18 @@ cl_event_wheel_reg( * cl_timer_stop(&p_event_wheel->timer); */ +/* The timeout for the cl_timer_start should be given as uint32_t. + if there is an overflow - warn about it. */ +if ( timeout > (uint32_t)timeout ) +{ + osm_log (p_event_wheel->p_log, OSM_LOG_INFO, + "cl_event_wheel_reg: " + "timeout requested is too large. Using timeout: %u \n", + (uint32_t)timeout ); +} + /* start the timer to the timeout [msec] */ -cl_status = cl_timer_start(&p_event_wheel->timer, timeout); +cl_status = cl_timer_start(&p_event_wheel->timer, (uint32_t)timeout); if (cl_status != CL_SUCCESS) { ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH] Opensm - asserts before OSM_LOG_ENTER - cont.
Hi Hal, The Patch Michael Tsirkin suggested for fixing the OSM_LOG_ENTER problem works fine both for windows and for linux. Here is the patch for this, instead of the previous one I sent. Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: include/opensm/osm_log.h === --- include/opensm/osm_log.h(revision 5307) +++ include/opensm/osm_log.h(working copy) @@ -71,17 +71,15 @@ BEGIN_C_DECLS #define LOG_ENTRY_SIZE_MAX 4096 #define BUF_SIZE LOG_ENTRY_SIZE_MAX -#define OSM_LOG_DEFINE_FUNC( NAME ) \ - static const char osm_log_func_name[] = #NAME +#define __func__ __FUNCTION__ #define OSM_LOG_ENTER( OSM_LOG_PTR, NAME ) \ - OSM_LOG_DEFINE_FUNC( NAME ); \ osm_log( OSM_LOG_PTR, OSM_LOG_FUNCS, \ -"%s: [\n", osm_log_func_name ); +"%s: [\n", __func__ ); #define OSM_LOG_EXIT( OSM_LOG_PTR ) \ osm_log( OSM_LOG_PTR, OSM_LOG_FUNCS, \ -"%s: ]\n", osm_log_func_name ); +"%s: ]\n", __func__ ); /h* OpenSM/Log * NAME ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general