[PATCH 3/4] cxgb4: Add CLIP support to store compressed IPv6 address
The Compressed LIP region is used to hold a limited number of Local IPv6 addresses. This region is primarily used to reduce the TCAM space consumed for an IPv6 offloaded connection. A 128-bit LIP will be reduced to 13-bit and stored in the TCAM if there is a match between the IPv6 tuple's LIP and the one stored in the CLIP region. Signed-off-by: Vipul Pandya vi...@chelsio.com --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 206 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 23 +++ 2 files changed, 229 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index d1d6ff7..4ae287c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -60,6 +60,7 @@ #include linux/workqueue.h #include net/neighbour.h #include net/netevent.h +#include net/addrconf.h #include asm/uaccess.h #include cxgb4.h @@ -68,6 +69,11 @@ #include t4fw_api.h #include l2t.h +#include ../drivers/net/bonding/bonding.h + +#ifdef DRV_VERSION +#undef DRV_VERSION +#endif #define DRV_VERSION 2.0.0-ko #define DRV_DESC Chelsio T4/T5 Network Driver @@ -3227,6 +3233,38 @@ static int tid_init(struct tid_info *t) return 0; } +static int cxgb4_clip_get(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap; + struct fw_clip_cmd c; + + adap = netdev2adap(dev); + memset(c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE); + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); + *(__be64 *)c.ip_hi = *(__be64 *)(lip-s6_addr); + *(__be64 *)c.ip_lo = *(__be64 *)(lip-s6_addr + 8); + return t4_wr_mbox_meat(adap, adap-mbox, c, sizeof(c), c, false); +} + +static int cxgb4_clip_release(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap; + struct fw_clip_cmd c; + + adap = netdev2adap(dev); + memset(c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) | + 
FW_CMD_REQUEST | FW_CMD_READ); + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); + *(__be64 *)c.ip_hi = *(__be64 *)(lip-s6_addr); + *(__be64 *)c.ip_lo = *(__be64 *)(lip-s6_addr + 8); + return t4_wr_mbox_meat(adap, adap-mbox, c, sizeof(c), c, false); +} + /** * cxgb4_create_server - create an IP server * @dev: the device @@ -3878,6 +3916,169 @@ int cxgb4_unregister_uld(enum cxgb4_uld type) } EXPORT_SYMBOL(cxgb4_unregister_uld); +/* Check if netdev on which event is occured belongs to us or not. Return + * suceess (1) if it belongs otherwise failure (0). + */ +static int cxgb4_netdev(struct net_device *netdev) +{ + struct adapter *adap; + int i; + + mutex_lock(uld_mutex); + list_for_each_entry(adap, adapter_list, list_node) + for (i = 0; i MAX_NPORTS; i++) + if (adap-port[i] == netdev) { + mutex_unlock(uld_mutex); + return 1; + } + mutex_unlock(uld_mutex); + return 0; +} + +static int clip_add(struct net_device *event_dev, struct inet6_ifaddr *ifa, + unsigned long event) +{ + int ret = NOTIFY_DONE; + + rcu_read_lock(); + if (cxgb4_netdev(event_dev)) { + switch (event) { + case NETDEV_UP: + ret = cxgb4_clip_get(event_dev, + (const struct in6_addr *)ifa-addr.s6_addr); + if (ret 0) { + rcu_read_unlock(); + return ret; + } + ret = NOTIFY_OK; + break; + case NETDEV_DOWN: + cxgb4_clip_release(event_dev, + (const struct in6_addr *)ifa-addr.s6_addr); + ret = NOTIFY_OK; + break; + default: + break; + } + } + rcu_read_unlock(); + return ret; +} + +static int cxgb4_inet6addr_handler(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *event_dev; + int ret = NOTIFY_DONE; + int cnt; + struct bonding *bond = netdev_priv(ifa-idev-dev); + struct slave *slave; + struct pci_dev *first_pdev = NULL; + + if (ifa-idev-dev-priv_flags IFF_802_1Q_VLAN) { + event_dev = vlan_dev_real_dev(ifa-idev-dev); + ret = clip_add(event_dev, ifa, event); + } else if (ifa-idev-dev-flags IFF_MASTER) { + /* It is possible that 
two different
[PATCH 1/4] RDMA/cma: Add IPv6 support for iWARP.
From: Steve Wise sw...@opengridcomputing.com This patch modifies the type of local_addr and remote_addr fields in struct iw_cm_id from struct sockaddr_in to struct sockaddr_storage to hold IPv6 and IPv4 addresses uniformly. It changes the references of local_addr and remote_addr in RDMA/cxgb4, RDMA/cxgb3, RDMA/nes and amso drivers such that build failure is avoided. However to be able to actully run the traffic over IPv6 address respective drivers have to add supportive code. Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/core/cma.c | 65 ++--- drivers/infiniband/hw/amso1100/c2_ae.c | 24 +++-- drivers/infiniband/hw/amso1100/c2_cm.c | 17 +++- drivers/infiniband/hw/cxgb3/iwch_cm.c | 46 ++--- drivers/infiniband/hw/cxgb4/cm.c | 62 - drivers/infiniband/hw/nes/nes_cm.c | 165 + include/rdma/iw_cm.h | 8 +- 7 files changed, 245 insertions(+), 142 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 34fbc2f..2ad22d9 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1335,7 +1335,6 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id-context; struct rdma_cm_event event; - struct sockaddr_in *sin; int ret = 0; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) @@ -1347,10 +1346,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: - sin = (struct sockaddr_in *) id_priv-id.route.addr.src_addr; - *sin = iw_event-local_addr; - sin = (struct sockaddr_in *) id_priv-id.route.addr.dst_addr; - *sin = iw_event-remote_addr; + memcpy(id_priv-id.route.addr.src_addr, iw_event-local_addr, + ip_addr_size((struct sockaddr *)iw_event-local_addr)); + memcpy(id_priv-id.route.addr.dst_addr, iw_event-remote_addr, + ip_addr_size((struct sockaddr *)iw_event-remote_addr)); switch (iw_event-status) { case 0: 
event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -1400,7 +1399,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct sockaddr_in *sin; struct net_device *dev = NULL; struct rdma_cm_event event; int ret; @@ -1422,14 +1420,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(conn_id-handler_mutex, SINGLE_DEPTH_NESTING); conn_id-state = RDMA_CM_CONNECT; - dev = ip_dev_find(init_net, iw_event-local_addr.sin_addr.s_addr); - if (!dev) { - ret = -EADDRNOTAVAIL; - mutex_unlock(conn_id-handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - ret = rdma_copy_addr(conn_id-id.route.addr.dev_addr, dev, NULL); + ret = rdma_translate_ip((struct sockaddr *)iw_event-local_addr, + conn_id-id.route.addr.dev_addr); if (ret) { mutex_unlock(conn_id-handler_mutex); rdma_destroy_id(new_cm_id); @@ -1447,10 +1439,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, cm_id-context = conn_id; cm_id-cm_handler = cma_iw_handler; - sin = (struct sockaddr_in *) new_cm_id-route.addr.src_addr; - *sin = iw_event-local_addr; - sin = (struct sockaddr_in *) new_cm_id-route.addr.dst_addr; - *sin = iw_event-remote_addr; + memcpy(new_cm_id-route.addr.src_addr, iw_event-local_addr, + ip_addr_size((struct sockaddr *)iw_event-local_addr)); + memcpy(new_cm_id-route.addr.dst_addr, iw_event-remote_addr, + ip_addr_size((struct sockaddr *)iw_event-remote_addr)); + ret = ib_query_device(conn_id-id.device, attr); if (ret) { @@ -1526,7 +1519,6 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; - struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv-id.device, @@ -1537,8 +1529,9 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) id_priv-cm_id.iw = id; - sin = (struct sockaddr_in *) id_priv-id.route.addr.src_addr; - id_priv-cm_id.iw-local_addr = *sin; + 
memcpy(id_priv-cm_id.iw-local_addr, id_priv-id.route.addr.src_addr, + ip_addr_size((struct sockaddr *) + id_priv-id.route.addr.src_addr)); ret = iw_cm_listen(id_priv-cm_id.iw, backlog); @@ -2128,13 +2121,23 @@ int
[PATCH 2/4] cxgb4: Add routines to create and remove listening IPv6 servers
Add cxgb4_create_server6 and cxgb4_remove_server routines to create and remove listening IPv6 servers. Return success (0) from cxgb4_create_server in case of ctrl queue congestion since in case of congestion, passive open request gets queued and gets processed later. If non zero value is returned it can be treated as an error and ULD can free STID which can result into an error in passive open reply. Add cpl structure for active open request with IPv6 address for T5. Signed-off-by: Vipul Pandya vi...@chelsio.com --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 71 - drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 5 ++ drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 17 +- 3 files changed, 91 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 5a3256b..d1d6ff7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3246,6 +3246,7 @@ int cxgb4_create_server(const struct net_device *dev, unsigned int stid, struct sk_buff *skb; struct adapter *adap; struct cpl_pass_open_req *req; + int ret; skb = alloc_skb(sizeof(*req), GFP_KERNEL); if (!skb) @@ -3263,10 +3264,78 @@ int cxgb4_create_server(const struct net_device *dev, unsigned int stid, req-opt0 = cpu_to_be64(TX_CHAN(chan)); req-opt1 = cpu_to_be64(CONN_POLICY_ASK | SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue)); - return t4_mgmt_tx(adap, skb); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); } EXPORT_SYMBOL(cxgb4_create_server); +/* cxgb4_create_server6 - create an IPv6 server + * @dev: the device + * @stid: the server TID + * @sip: local IPv6 address to bind server to + * @sport: the server's TCP port + * @queue: queue to direct messages from this server to + * + * Create an IPv6 server for the given port and address. + * Returns 0 on error and one of the %NET_XMIT_* values on success. 
+ */ +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, +const struct in6_addr *sip, __be16 sport, +unsigned int queue) +{ + unsigned int chan; + struct sk_buff *skb; + struct adapter *adap; + struct cpl_pass_open_req6 *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + adap = netdev2adap(dev); + req = (struct cpl_pass_open_req6 *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, stid)); + req-local_port = sport; + req-peer_port = htons(0); + req-local_ip_hi = *(__be64 *)(sip-s6_addr); + req-local_ip_lo = *(__be64 *)(sip-s6_addr + 8); + req-peer_ip_hi = cpu_to_be64(0); + req-peer_ip_lo = cpu_to_be64(0); + chan = rxq_to_chan(adap-sge, queue); + req-opt0 = cpu_to_be64(TX_CHAN(chan)); + req-opt1 = cpu_to_be64(CONN_POLICY_ASK | + SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_create_server6); + +int cxgb4_remove_server(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6) +{ + struct sk_buff *skb; + struct adapter *adap; + struct cpl_close_listsvr_req *req; + int ret; + + adap = netdev2adap(dev); + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + req = (struct cpl_close_listsvr_req *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req-reply_ctrl = htons(NO_REPLY(0) | (ipv6 ? 
LISTSVR_IPV6(1) : + LISTSVR_IPV6(0)) | QUEUENO(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_remove_server); + /** * cxgb4_best_mtu - find the entry in the MTU table closest to an MTU * @mtus: the HW MTU table diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 4faf4d0..6f21f24 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -154,6 +154,11 @@ struct in6_addr; int cxgb4_create_server(const struct net_device *dev, unsigned int stid, __be32 sip, __be16 sport, __be16 vlan, unsigned int queue); +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, +const struct in6_addr *sip, __be16 sport, +unsigned int queue); +int cxgb4_remove_server(const struct net_device
[PATCH 0/4] Add IPv6 support for iWARP
Hi All, This patch series adds IPv6 support for iWARP. It enables Chelsio's T4 and T5 adapters to transmitt RDMA traffic over IPv6 address. It adds new apis and cpl messages in cxgb4 to support IPv6 operations and uses them in RDMA/cxgb4. The patch series modifies the type of local_addr and remote_addr fields in struct iw_cm_id from struct sockaddr_in to struct sockaddr_storage to hold IPv6 and IPv4 addresses uniformly. It changes the references of local_addr and remote_addr in RDMA/cxgb4, RDMA/cxgb3, RDMA/nes and amso drivers such that build failure is avoided. We would like to submit this patch series via Roland's infiniband tree for-next branch. However the series requires the latest changes pushed in cxgb4 driver in David Miller's net-next tree. We request Roland to merge cxgb4 and RDMA/cxgb4 drivers from net-next tree before applying this series on for-next branch. We have created this patch series on top of net-next tree. We have included all the maintainers of respective drivers. Kindly review the change and let us know in case of any review comments. Thanks, Vipul Pandya Steve Wise (1): RDMA/cma: Add IPv6 support for iWARP. 
Vipul Pandya (3): cxgb4: Add routines to create and remove listening IPv6 servers cxgb4: Add CLIP support to store compressed IPv6 address RDMA/cxgb4: Add support for active and passive open connection with IPv6 address drivers/infiniband/core/cma.c | 65 +- drivers/infiniband/hw/amso1100/c2_ae.c | 24 +- drivers/infiniband/hw/amso1100/c2_cm.c | 17 +- drivers/infiniband/hw/cxgb3/iwch_cm.c | 46 +- drivers/infiniband/hw/cxgb4/cm.c| 845 drivers/infiniband/hw/cxgb4/device.c| 116 +++- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 +- drivers/infiniband/hw/nes/nes_cm.c | 165 +++-- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 278 +++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 5 + drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 17 +- drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 23 + include/rdma/iw_cm.h| 8 +- 13 files changed, 1175 insertions(+), 438 deletions(-) -- 1.8.0 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] RDMA/cxgb4: Add support for active and passive open connection with IPv6 address
Add new cpl messages, cpl_act_open_req6 and cpl_t5_act_open_req6, for initiating active open connections. Use LLD api cxgb4_create_server and cxgb4_create_server6 for initiating passive open connections. Similarly use cxgb4_remove_server to remove the passive open connections in place of listen_stop. Add support for iWARP over VLAN device and enable IPv6 support on VLAN device. Make use of import_ep in c4iw_reconnect. Signed-off-by: Vipul Pandya vi...@chelsio.com --- drivers/infiniband/hw/cxgb4/cm.c | 835 + drivers/infiniband/hw/cxgb4/device.c | 116 +++-- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 +- 3 files changed, 635 insertions(+), 320 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index cdc443d..35d1cf1 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -44,6 +44,8 @@ #include net/netevent.h #include net/route.h #include net/tcp.h +#include net/ip6_route.h +#include net/addrconf.h #include iw_cxgb4.h @@ -333,19 +335,72 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) return skb; } -static struct rtable *find_route(struct c4iw_dev *dev, __be32 local_ip, +static struct net_device *get_real_dev(struct net_device *egress_dev) +{ + struct net_device *phys_dev = egress_dev; + if (egress_dev-priv_flags IFF_802_1Q_VLAN) + phys_dev = vlan_dev_real_dev(egress_dev); + return phys_dev; +} + +static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) +{ + int i; + + egress_dev = get_real_dev(egress_dev); + for (i = 0; i dev-rdev.lldi.nports; i++) + if (dev-rdev.lldi.ports[i] == egress_dev) + return 1; + return 0; +} + +static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip, +__u8 *peer_ip, __be16 local_port, +__be16 peer_port, u8 tos, +__u32 sin6_scope_id) +{ + struct flowi6 fl6; + struct dst_entry *dst; + + memset(fl6, 0, sizeof(fl6)); + memcpy(fl6.daddr, peer_ip, 16); + memcpy(fl6.saddr, local_ip, 16); + if 
(ipv6_addr_type(fl6.daddr) IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6_scope_id; + dst = ip6_route_output(init_net, NULL, fl6); + if (!dst) + goto out; + if (!our_interface(dev, ip6_dst_idev(dst)-dev) + !(ip6_dst_idev(dst)-dev-flags IFF_LOOPBACK)) { + dst_release(dst); + dst = NULL; + } +out: + return dst; +} + +static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos) { struct rtable *rt; struct flowi4 fl4; + struct neighbour *n; rt = ip_route_output_ports(init_net, fl4, NULL, peer_ip, local_ip, peer_port, local_port, IPPROTO_TCP, tos, 0); if (IS_ERR(rt)) return NULL; - return rt; + n = dst_neigh_lookup(rt-dst, peer_ip); + if (!n) + return NULL; + if (!our_interface(dev, n-dev)) { + dst_release(rt-dst); + return NULL; + } + neigh_release(n); + return rt-dst; } static void arp_failure_discard(void *handle, struct sk_buff *skb) @@ -512,15 +567,28 @@ static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req; struct cpl_t5_act_open_req *t5_req; + struct cpl_act_open_req6 *req6; + struct cpl_t5_act_open_req6 *t5_req6; struct sk_buff *skb; u64 opt0; u32 opt2; unsigned int mtu_idx; int wscale; - int size = is_t4(ep-com.dev-rdev.lldi.adapter_type) ? - sizeof(struct cpl_act_open_req) : - sizeof(struct cpl_t5_act_open_req); - int wrlen = roundup(size, 16); + int wrlen; + int sizev4 = is_t4(ep-com.dev-rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req) : + sizeof(struct cpl_t5_act_open_req); + int sizev6 = is_t4(ep-com.dev-rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req6) : + sizeof(struct cpl_t5_act_open_req6); + struct sockaddr_in *la = (struct sockaddr_in *)ep-com.local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *)ep-com.remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)ep-com.local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)ep-com.remote_addr; + + wrlen = (ep-com.remote_addr.ss_family == AF_INET) ? 
+ roundup(sizev4, 16) : +
Re: [PATCH] osm_sa_mcmember_record.c Reduce number of error messages for the same event
On 6/3/2013 7:39 AM, Line Holen wrote: ERR msg 0x1B22 info is put into the error messages of the tests in the same subfunction. No information is lost, but you get one error message instead of two. Also changed a couple of messages to make them more end user friendly Signed-off-by: Line Holen line.ho...@oracle.com Thanks. Applied. -- Hal -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] osm_sm_state_mgr.c Don't clear IS_SM bit when changing state to NOT_ACTIVE
On 6/5/2013 7:14 AM, Line Holen wrote: The SM is still operational even though it is in this state. Other SMs will not know about our presence when IS_SM is cleared and will therefor not attempt to enable us again. Signed-off-by: Line Holen line.ho...@oracle.com Thanks. Applied. -- Hal -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] mm: Revert pinned_vm braindamage
Patch bc3e53f682 (mm: distinguish between mlocked and pinned pages) broke RLIMIT_MEMLOCK. Before that patch: mm_struct::locked_vm RLIMIT_MEMLOCK; after that patch we have: mm_struct::locked_vm RLIMIT_MEMLOCK mm_struct::pinned_vm RLIMIT_MEMLOCK. The patch doesn't mention RLIMIT_MEMLOCK and thus also doesn't discus this (user visible) change in semantics. And thus we must assume it was unintentional. Since RLIMIT_MEMLOCK is very clearly a limit on the amount of pages the process can 'lock' into memory it should very much include pinned pages as well as mlock()ed pages. Neither can be paged. Since nobody had anything constructive to say about the VM_PINNED approach and the IB code hurts my head too much to make it work I propose we revert said patch. Once again the rationale; MLOCK(2) is part of POSIX Realtime Extentsion (1003.1b-1993/1003.1i-1995). It states that the specified part of the user address space should stay memory resident until either program exit or a matching munlock() call. This definition basically excludes major faults from happening on the pages -- a major fault being one where IO needs to happen to obtain the page content; the direct implication being that page content must remain in memory. Linux has taken this literal and made mlock()ed pages subject to page migration (albeit only for the explicit move_pages() syscall; but it would very much like to make them subject to implicit page migration for the purpose of compaction etc.). This view disregards the intention of the spec; since mlock() is part of the realtime spec the intention is very much that the user address range generate no faults; neither minor nor major -- any delay is unacceptable. This leaves the RT people unhappy -- therefore _if_ we continue with this Linux specific interpretation of mlock() we must introduce new syscalls that implement the intended mlock() semantics. 
It was found that there are useful purposes for this weaker mlock(), a rationale to indeed have two sets of syscalls. The weaker mlock() can be used in the context of security -- where we avoid sensitive data being written to disk, and in the context of userspace deamons that are part of the IO path -- which would otherwise form IO deadlocks. The proposed second set of primitives would be mpin() and munpin() and would implement the intended mlock() semantics. Such pages would not be migratable in any way (a possible implementation would be to 'pin' the pages using an extra refcount on the page frame). From the above we can see that any mpin()ed page is also an mlock()ed page, since mpin() will disallow any fault, and thus will also disallow major faults. While we still lack the formal mpin() and munpin() syscalls there are a number of sites that have similar 'side effects' and result in user controlled 'pinning' of pages. Namely IB and perf. For the purpose of RLIMIT_MEMLOCK we must use intent only as it is not part of the formal spec. The only useful thing is to limit the amount of pages a user can exempt from paging. This would therefore include all pages either mlock()ed or mpin()ed. Back to the patch; a resource limit must have a resource counter to enact the limit upon. Before the patch this was mm_struct::locked_vm. After the patch there is no such thing left. The patch was proposed to 'fix' a double accounting problem where pages are both pinned and mlock()ed. This was particularly visible when using mlockall() on a process that uses either IB or perf. I state that since mlockall() disables/invalidates RLIMIT_MEMLOCK the actual resource counter value is irrelevant, and thus the reported problem is a non-problem. However, it would still be possible to observe weirdness in the very unlikely event that a user would indeed call mlock() upon an address range obtained from IB/perf. 
In this case he would be unduly constrained and find his effective RLIMIT_MEMLOCK limit halved (at worst). After the patch; that same user will find he has an effectively double RLIMIT_MEMLOCK, since the IB/perf pages are not counted towards the same limit as his mlock() pages are. It is far more likely a user will employ mlock() on different rages than those he received from IB/perf since he already knows those aren't going anywhere. Therefore the patch trades an unlikely weirdness for a much more likely weirdness. So barring a proper solution I propose we revert. I've yet to hear a coherent objection to the above. Christoph is always quick to yell: 'but if fixes a double accounting issue' but is completely deaf to the fact that he changed user visible semantics without mention and regard. Signed-off-by: Peter Zijlstra pet...@infradead.org --- drivers/infiniband/core/umem.c | 8 drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- fs/proc/task_mmu.c | 2 -- include/linux/mm_types.h | 1 - kernel/events/core.c
[PATCH][TRIVIAL] Add attribute information to SA request error messages
Signed-off-by: Line Holen line.ho...@oracle.com --- diff --git a/opensm/osm_sa_class_port_info.c b/opensm/osm_sa_class_port_info.c index 2682505..69abad5 100644 --- a/opensm/osm_sa_class_port_info.c +++ b/opensm/osm_sa_class_port_info.c @@ -2,6 +2,7 @@ * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -201,7 +202,7 @@ void osm_cpi_rcv_process(IN void *context, IN void *data) /* we only support GET */ if (p_sa_mad-method != IB_MAD_METHOD_GET) { OSM_LOG(sa-p_log, OSM_LOG_ERROR, ERR 1403: - Unsupported Method (%s)\n, + Unsupported Method (%s) for ClassPortInfo request\n, ib_get_sa_method_str(p_sa_mad-method)); osm_sa_send_error(sa, p_madw, IB_SA_MAD_STATUS_REQ_INVALID); goto Exit; diff --git a/opensm/osm_sa_guidinfo_record.c b/opensm/osm_sa_guidinfo_record.c index a00b257..8323b38 100644 --- a/opensm/osm_sa_guidinfo_record.c +++ b/opensm/osm_sa_guidinfo_record.c @@ -2,6 +2,7 @@ * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2012 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -797,7 +798,7 @@ void osm_gir_rcv_process(IN void *ctx, IN void *data) break; default: OSM_LOG(sa-p_log, OSM_LOG_ERROR, ERR 5105: - Unsupported Method (%s)\n, + Unsupported Method (%s) for GUIDInfoRecord request\n, ib_get_sa_method_str(p_rcvd_mad-method)); osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR); break; diff --git a/opensm/osm_sa_lft_record.c b/opensm/osm_sa_lft_record.c index 3be20f3..cf651ef 100644 --- a/opensm/osm_sa_lft_record.c +++ b/opensm/osm_sa_lft_record.c @@ -2,6 +2,7 @@ * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2005,2008 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -197,7 +198,7 @@ void osm_lftr_rcv_process(IN void *ctx, IN void *data) if (p_rcvd_mad-method != IB_MAD_METHOD_GET p_rcvd_mad-method != IB_MAD_METHOD_GETTABLE) { OSM_LOG(sa-p_log, OSM_LOG_ERROR, ERR 4408: - Unsupported Method (%s)\n, + Unsupported Method (%s) for LFTRecord request\n, ib_get_sa_method_str(p_rcvd_mad-method)); osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR); goto Exit; diff --git a/opensm/osm_sa_link_record.c b/opensm/osm_sa_link_record.c index eec952d..5073f96 100644 --- a/opensm/osm_sa_link_record.c +++ b/opensm/osm_sa_link_record.c @@ -2,6 +2,7 @@ * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -436,7 +437,7 @@ void osm_lr_rcv_process(IN void *context, IN void *data) if (p_sa_mad-method != IB_MAD_METHOD_GET p_sa_mad-method != IB_MAD_METHOD_GETTABLE) { OSM_LOG(sa-p_log, OSM_LOG_ERROR, ERR 1804: - Unsupported Method (%s)\n, + Unsupported Method (%s) for LinkRecord request\n, ib_get_sa_method_str(p_sa_mad-method)); osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR); goto Exit; diff --git a/opensm/osm_sa_mft_record.c b/opensm/osm_sa_mft_record.c index 2eca1f0..6f9763f 100644 --- a/opensm/osm_sa_mft_record.c +++ b/opensm/osm_sa_mft_record.c @@ -2,6 +2,7 @@ * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + *
Re: [PATCH] mm: Revert pinned_vm braindamage
On Thu, 6 Jun 2013, Peter Zijlstra wrote: Since RLIMIT_MEMLOCK is very clearly a limit on the amount of pages the process can 'lock' into memory it should very much include pinned pages as well as mlock()ed pages. Neither can be paged. So we we thought that this is the sum of the pages that a process has mlocked. Initiated by the process and/or environment explicitly. A user space initiated action. Since nobody had anything constructive to say about the VM_PINNED approach and the IB code hurts my head too much to make it work I propose we revert said patch. I said that the use of a PIN page flag would allow correct accounting if one wanted to interpret the limit the way you do. Once again the rationale; MLOCK(2) is part of POSIX Realtime Extentsion (1003.1b-1993/1003.1i-1995). It states that the specified part of the user address space should stay memory resident until either program exit or a matching munlock() call. This definition basically excludes major faults from happening on the pages -- a major fault being one where IO needs to happen to obtain the page content; the direct implication being that page content must remain in memory. Exactly that is the definition. Linux has taken this literal and made mlock()ed pages subject to page migration (albeit only for the explicit move_pages() syscall; but it would very much like to make them subject to implicit page migration for the purpose of compaction etc.). Page migration is not a page fault? The ability to move a process completely (including its mlocked segments) is important for the manual migration of process memory. That is what page migration was made for. If mlocked pages are treated as pinnned pages then the complete process can no longer be moved from node to node. This view disregards the intention of the spec; since mlock() is part of the realtime spec the intention is very much that the user address range generate no faults; neither minor nor major -- any delay is unacceptable. 
Where does it say that no faults are generated? Dont we generate COW on mlocked ranges? This leaves the RT people unhappy -- therefore _if_ we continue with this Linux specific interpretation of mlock() we must introduce new syscalls that implement the intended mlock() semantics. Intended means Peter's semantics? It was found that there are useful purposes for this weaker mlock(), a rationale to indeed have two sets of syscalls. The weaker mlock() can be used in the context of security -- where we avoid sensitive data being written to disk, and in the context of userspace deamons that are part of the IO path -- which would otherwise form IO deadlocks. Migratable mlocked pages enable complete process migration between nodes of a NUMA system for HPC workloads. The proposed second set of primitives would be mpin() and munpin() and would implement the intended mlock() semantics. I agree that we need mpin and munpin. But they should not be called mlock semantics. Such pages would not be migratable in any way (a possible implementation would be to 'pin' the pages using an extra refcount on the page frame). From the above we can see that any mpin()ed page is also an mlock()ed page, since mpin() will disallow any fault, and thus will also disallow major faults. That cannot be so since mlocked pages need to be migratable. While we still lack the formal mpin() and munpin() syscalls there are a number of sites that have similar 'side effects' and result in user controlled 'pinning' of pages. Namely IB and perf. Right thats why we need this. For the purpose of RLIMIT_MEMLOCK we must use intent only as it is not part of the formal spec. The only useful thing is to limit the amount of pages a user can exempt from paging. This would therefore include all pages either mlock()ed or mpin()ed. RLIMIT_MEMLOCK is a limit on the pages that a process has mlocked into memory. Pinning is not initiated by user space but by the kernel. 
Either temporarily (page count increases are used all over the kernel for this) or for a longer time frame (IB and Perf and likely more drivers that we have not found yet). Back to the patch; a resource limit must have a resource counter to enact the limit upon. Before the patch this was mm_struct::locked_vm. After the patch there is no such thing left. The limit was not checked correctly before the patch since pinned pages were accounted as mlocked. I state that since mlockall() disables/invalidates RLIMIT_MEMLOCK the actual resource counter value is irrelevant, and thus the reported problem is a non-problem. Where does it disable RLIMIT_MEMLOCK? However, it would still be possible to observe weirdness in the very unlikely event that a user would indeed call mlock() upon an address range obtained from IB/perf. In this case he would be unduly constrained and find his effective RLIMIT_MEMLOCK limit halved (at worst). This is weird for other reasons as well since we are using two
Re: Status of ummunot branch?
On Jun 5, 2013, at 10:52 PM, Haggai Eran hagg...@mellanox.com wrote: Haggai: A verb to resize a registration would probably be a helpful step. MPI could maintain one registration that covers the sbrk region and one registration that covers the heap, much easier than searching tables and things. That's a nice idea. Even without this verb, I think it is possible to develop a registration cache that covers those regions though. When you find out you have some part of your region not registered, you can register a new, larger region that covers everything you need. For new operations you only use the newer region. Once the previous, smaller region is not used, you de-register it. I'm not sure what you mean. Are you saying I should do something like this: MPI_Init() { // the first MPI function invoked mpi_sbrk_save = sbrk(); ibv_reg_mr(..., 0, mpi_sbrk_save, ...); ... } MPI_Send(buffer, ...) { if (mpi_sbrk_save != sbrk()) mpi_sbrk_save = sbrk(); ibv_rereg_mr(..., 0, mpi_sbrk_save, ...); ... } I don't think this covers other memory regions, like those added via mmap, right? -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ANNOUNCE] dapl-2.0.37
Rupert/Vlad, please pull this package into OFED 3.5.2 Thanks, Arlin -- Latest Packages (see ChangeLog for recent changes): md5sum: 2e185e1aac2c09b3d9e529ee1aa1669e dapl-2.0.37.tar.gz For v2.0 package install RPM packages as follow: dapl-2.0.37-1 dapl-utils-2.0.37-1 dapl-devel-2.0.37-1 dapl-debuginfo-2.0.37-1 Summary of v2.0 changes: Release 2.0.37 fixes (OFED 3.5.2): common: add support for ia name during dat_ia_query common: dapl_os_atomic_inc/dec() not working as expected on ppc64 machines. dapltest: ppc64 endian issue with exchanged mem handle and address -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html