[PATCH 3/4] cxgb4: Add CLIP support to store compressed IPv6 address

2013-06-06 Thread Vipul Pandya
The Compressed LIP region is used to hold a limited number of Local IPv6
addresses. This region is primarily used to reduce the TCAM space consumed by
an IPv6 offloaded connection. A 128-bit LIP is reduced to 13 bits and stored
in the TCAM if there is a match between the IPv6 tuple's LIP and the one
stored in the CLIP region.
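
For reference, the following standalone sketch (illustrative only, not driver
code; the struct and helper names are invented for the example) shows the
address split that the FW_CLIP_CMD in this patch carries: the 128-bit local
address travels to the firmware as two big-endian 64-bit halves, after which
the TCAM entry only needs the small CLIP index instead of the full address.

#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

struct clip_halves {
	uint64_t ip_hi;		/* s6_addr[0..7], network byte order */
	uint64_t ip_lo;		/* s6_addr[8..15], network byte order */
};

static void clip_split(const struct in6_addr *lip, struct clip_halves *c)
{
	/* plain byte copies preserve the wire (big-endian) order */
	memcpy(&c->ip_hi, lip->s6_addr, 8);
	memcpy(&c->ip_lo, lip->s6_addr + 8, 8);
}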

Signed-off-by: Vipul Pandya vi...@chelsio.com
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 206 
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  23 +++
 2 files changed, 229 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index d1d6ff7..4ae287c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -60,6 +60,7 @@
 #include <linux/workqueue.h>
 #include <net/neighbour.h>
 #include <net/netevent.h>
+#include <net/addrconf.h>
 #include <asm/uaccess.h>
 
 #include "cxgb4.h"
@@ -68,6 +69,11 @@
 #include "t4fw_api.h"
 #include "l2t.h"
 
+#include "../drivers/net/bonding/bonding.h"
+
+#ifdef DRV_VERSION
+#undef DRV_VERSION
+#endif
 #define DRV_VERSION "2.0.0-ko"
 #define DRV_DESC "Chelsio T4/T5 Network Driver"
 
@@ -3227,6 +3233,38 @@ static int tid_init(struct tid_info *t)
return 0;
 }
 
+static int cxgb4_clip_get(const struct net_device *dev,
+			  const struct in6_addr *lip)
+{
+	struct adapter *adap;
+	struct fw_clip_cmd c;
+
+	adap = netdev2adap(dev);
+	memset(&c, 0, sizeof(c));
+	c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) |
+			FW_CMD_REQUEST | FW_CMD_WRITE);
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
+	*(__be64 *)&c.ip_hi = *(__be64 *)(lip->s6_addr);
+	*(__be64 *)&c.ip_lo = *(__be64 *)(lip->s6_addr + 8);
+	return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false);
+}
+
+static int cxgb4_clip_release(const struct net_device *dev,
+			      const struct in6_addr *lip)
+{
+	struct adapter *adap;
+	struct fw_clip_cmd c;
+
+	adap = netdev2adap(dev);
+	memset(&c, 0, sizeof(c));
+	c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) |
+			FW_CMD_REQUEST | FW_CMD_READ);
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
+	*(__be64 *)&c.ip_hi = *(__be64 *)(lip->s6_addr);
+	*(__be64 *)&c.ip_lo = *(__be64 *)(lip->s6_addr + 8);
+	return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false);
+}
+
 /**
  * cxgb4_create_server - create an IP server
  * @dev: the device
@@ -3878,6 +3916,169 @@ int cxgb4_unregister_uld(enum cxgb4_uld type)
 }
 EXPORT_SYMBOL(cxgb4_unregister_uld);
 
+/* Check if the netdev on which the event occurred belongs to us or not.
+ * Return success (1) if it belongs, otherwise failure (0).
+ */
+static int cxgb4_netdev(struct net_device *netdev)
+{
+	struct adapter *adap;
+	int i;
+
+	mutex_lock(&uld_mutex);
+	list_for_each_entry(adap, &adapter_list, list_node)
+		for (i = 0; i < MAX_NPORTS; i++)
+			if (adap->port[i] == netdev) {
+				mutex_unlock(&uld_mutex);
+				return 1;
+			}
+	mutex_unlock(&uld_mutex);
+	return 0;
+}
+
+static int clip_add(struct net_device *event_dev, struct inet6_ifaddr *ifa,
+		    unsigned long event)
+{
+	int ret = NOTIFY_DONE;
+
+	rcu_read_lock();
+	if (cxgb4_netdev(event_dev)) {
+		switch (event) {
+		case NETDEV_UP:
+			ret = cxgb4_clip_get(event_dev,
+				(const struct in6_addr *)ifa->addr.s6_addr);
+			if (ret < 0) {
+				rcu_read_unlock();
+				return ret;
+			}
+			ret = NOTIFY_OK;
+			break;
+		case NETDEV_DOWN:
+			cxgb4_clip_release(event_dev,
+				(const struct in6_addr *)ifa->addr.s6_addr);
+			ret = NOTIFY_OK;
+			break;
+		default:
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int cxgb4_inet6addr_handler(struct notifier_block *this,
+		unsigned long event, void *data)
+{
+	struct inet6_ifaddr *ifa = data;
+	struct net_device *event_dev;
+	int ret = NOTIFY_DONE;
+	int cnt;
+	struct bonding *bond = netdev_priv(ifa->idev->dev);
+	struct slave *slave;
+	struct pci_dev *first_pdev = NULL;
+
+	if (ifa->idev->dev->priv_flags & IFF_802_1Q_VLAN) {
+		event_dev = vlan_dev_real_dev(ifa->idev->dev);
+		ret = clip_add(event_dev, ifa, event);
+	} else if (ifa->idev->dev->flags & IFF_MASTER) {
+		/* It is possible that two different 

[PATCH 1/4] RDMA/cma: Add IPv6 support for iWARP.

2013-06-06 Thread Vipul Pandya
From: Steve Wise sw...@opengridcomputing.com

This patch changes the type of the local_addr and remote_addr fields in struct
iw_cm_id from struct sockaddr_in to struct sockaddr_storage so that they can
hold IPv6 and IPv4 addresses uniformly. It updates the references to
local_addr and remote_addr in the RDMA/cxgb4, RDMA/cxgb3, RDMA/nes and amso
drivers so that build failure is avoided. However, to actually run traffic
over an IPv6 address, the respective drivers have to add supporting code.
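
For context, the memcpy() calls in the diff below size the copy with an
ip_addr_size() helper that dispatches on the address family. A sketch of what
such a helper looks like (the real definition lives in the RDMA core; this
version is illustrative only):

static inline int ip_addr_size(struct sockaddr *addr)
{
	return addr->sa_family == AF_INET6 ?
	       sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
}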

Signed-off-by: Steve Wise sw...@opengridcomputing.com
---
 drivers/infiniband/core/cma.c  |  65 ++---
 drivers/infiniband/hw/amso1100/c2_ae.c |  24 +++--
 drivers/infiniband/hw/amso1100/c2_cm.c |  17 +++-
 drivers/infiniband/hw/cxgb3/iwch_cm.c  |  46 ++---
 drivers/infiniband/hw/cxgb4/cm.c   |  62 -
 drivers/infiniband/hw/nes/nes_cm.c | 165 +
 include/rdma/iw_cm.h   |   8 +-
 7 files changed, 245 insertions(+), 142 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 34fbc2f..2ad22d9 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1335,7 +1335,6 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct 
iw_cm_event *iw_event)
 {
 	struct rdma_id_private *id_priv = iw_id->context;
struct rdma_cm_event event;
-   struct sockaddr_in *sin;
int ret = 0;
 
if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
@@ -1347,10 +1346,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, 
struct iw_cm_event *iw_event)
event.event = RDMA_CM_EVENT_DISCONNECTED;
break;
case IW_CM_EVENT_CONNECT_REPLY:
-	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
-	*sin = iw_event->local_addr;
-	sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
-	*sin = iw_event->remote_addr;
+	memcpy(&id_priv->id.route.addr.src_addr, &iw_event->local_addr,
+	       ip_addr_size((struct sockaddr *)&iw_event->local_addr));
+	memcpy(&id_priv->id.route.addr.dst_addr, &iw_event->remote_addr,
+	       ip_addr_size((struct sockaddr *)&iw_event->remote_addr));
switch (iw_event-status) {
case 0:
event.event = RDMA_CM_EVENT_ESTABLISHED;
@@ -1400,7 +1399,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 {
struct rdma_cm_id *new_cm_id;
struct rdma_id_private *listen_id, *conn_id;
-   struct sockaddr_in *sin;
struct net_device *dev = NULL;
struct rdma_cm_event event;
int ret;
@@ -1422,14 +1420,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	conn_id->state = RDMA_CM_CONNECT;
 
-	dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
-	if (!dev) {
-		ret = -EADDRNOTAVAIL;
-		mutex_unlock(&conn_id->handler_mutex);
-		rdma_destroy_id(new_cm_id);
-		goto out;
-	}
-	ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+	ret = rdma_translate_ip((struct sockaddr *)&iw_event->local_addr,
+				&conn_id->id.route.addr.dev_addr);
 	if (ret) {
 		mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
@@ -1447,10 +1439,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	cm_id->context = conn_id;
 	cm_id->cm_handler = cma_iw_handler;
 
-	sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
-	*sin = iw_event->local_addr;
-	sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
-	*sin = iw_event->remote_addr;
+	memcpy(&new_cm_id->route.addr.src_addr, &iw_event->local_addr,
+	       ip_addr_size((struct sockaddr *)&iw_event->local_addr));
+	memcpy(&new_cm_id->route.addr.dst_addr, &iw_event->remote_addr,
+	       ip_addr_size((struct sockaddr *)&iw_event->remote_addr));
+
 
 	ret = ib_query_device(conn_id->id.device, &attr);
if (ret) {
@@ -1526,7 +1519,6 @@ static int cma_ib_listen(struct rdma_id_private *id_priv)
 static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
 {
int ret;
-   struct sockaddr_in *sin;
struct iw_cm_id *id;
 
 	id = iw_create_cm_id(id_priv->id.device,
@@ -1537,8 +1529,9 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, 
int backlog)
 
 	id_priv->cm_id.iw = id;
 
-	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
-	id_priv->cm_id.iw->local_addr = *sin;
+	memcpy(&id_priv->cm_id.iw->local_addr, &id_priv->id.route.addr.src_addr,
+	       ip_addr_size((struct sockaddr *)
+	       &id_priv->id.route.addr.src_addr));
 
 	ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
 
@@ -2128,13 +2121,23 @@ int 

[PATCH 2/4] cxgb4: Add routines to create and remove listening IPv6 servers

2013-06-06 Thread Vipul Pandya
Add cxgb4_create_server6 and cxgb4_remove_server routines to create and remove
listening IPv6 servers.

Return success (0) from cxgb4_create_server in case of ctrl queue congestion,
since a congested passive open request gets queued and is processed later. If
a non-zero value is returned, it can be treated as an error and the ULD can
free the STID, which can result in an error in the passive open reply.

Add cpl structure for active open request with IPv6 address for T5.
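
The congestion behaviour comes down to how the t4_mgmt_tx() return value is
folded by net_xmit_eval(). As defined in include/linux/netdevice.h, the macro
treats NET_XMIT_CN (congestion) as success:

#define net_xmit_eval(e)	((e) == NET_XMIT_CN ? 0 : (e))

So a passive open request that is merely queued behind a congested ctrl queue
is reported to the ULD as 0 rather than as a failure.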

Signed-off-by: Vipul Pandya vi...@chelsio.com
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 71 -
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |  5 ++
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 17 +-
 3 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5a3256b..d1d6ff7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3246,6 +3246,7 @@ int cxgb4_create_server(const struct net_device *dev, 
unsigned int stid,
struct sk_buff *skb;
struct adapter *adap;
struct cpl_pass_open_req *req;
+   int ret;
 
skb = alloc_skb(sizeof(*req), GFP_KERNEL);
if (!skb)
@@ -3263,10 +3264,78 @@ int cxgb4_create_server(const struct net_device *dev, 
unsigned int stid,
 	req->opt0 = cpu_to_be64(TX_CHAN(chan));
 	req->opt1 = cpu_to_be64(CONN_POLICY_ASK |
 				SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue));
-   return t4_mgmt_tx(adap, skb);
+   ret = t4_mgmt_tx(adap, skb);
+   return net_xmit_eval(ret);
 }
 EXPORT_SYMBOL(cxgb4_create_server);
 
+/* cxgb4_create_server6 - create an IPv6 server
+ * @dev: the device
+ * @stid: the server TID
+ * @sip: local IPv6 address to bind server to
+ * @sport: the server's TCP port
+ * @queue: queue to direct messages from this server to
+ *
+ * Create an IPv6 server for the given port and address.
+ * Returns <0 on error and one of the %NET_XMIT_* values on success.
+ */
+int cxgb4_create_server6(const struct net_device *dev, unsigned int stid,
+const struct in6_addr *sip, __be16 sport,
+unsigned int queue)
+{
+   unsigned int chan;
+   struct sk_buff *skb;
+   struct adapter *adap;
+   struct cpl_pass_open_req6 *req;
+   int ret;
+
+   skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+
+   adap = netdev2adap(dev);
+   req = (struct cpl_pass_open_req6 *)__skb_put(skb, sizeof(*req));
+   INIT_TP_WR(req, 0);
+   OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, stid));
+	req->local_port = sport;
+	req->peer_port = htons(0);
+	req->local_ip_hi = *(__be64 *)(sip->s6_addr);
+	req->local_ip_lo = *(__be64 *)(sip->s6_addr + 8);
+	req->peer_ip_hi = cpu_to_be64(0);
+	req->peer_ip_lo = cpu_to_be64(0);
+	chan = rxq_to_chan(&adap->sge, queue);
+	req->opt0 = cpu_to_be64(TX_CHAN(chan));
+	req->opt1 = cpu_to_be64(CONN_POLICY_ASK |
+				SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue));
+   ret = t4_mgmt_tx(adap, skb);
+   return net_xmit_eval(ret);
+}
+EXPORT_SYMBOL(cxgb4_create_server6);
+
+int cxgb4_remove_server(const struct net_device *dev, unsigned int stid,
+   unsigned int queue, bool ipv6)
+{
+   struct sk_buff *skb;
+   struct adapter *adap;
+   struct cpl_close_listsvr_req *req;
+   int ret;
+
+   adap = netdev2adap(dev);
+
+   skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+
+   req = (struct cpl_close_listsvr_req *)__skb_put(skb, sizeof(*req));
+   INIT_TP_WR(req, 0);
+   OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
+	req->reply_ctrl = htons(NO_REPLY(0) | (ipv6 ? LISTSVR_IPV6(1) :
+				LISTSVR_IPV6(0)) | QUEUENO(queue));
+   ret = t4_mgmt_tx(adap, skb);
+   return net_xmit_eval(ret);
+}
+EXPORT_SYMBOL(cxgb4_remove_server);
+
 /**
  * cxgb4_best_mtu - find the entry in the MTU table closest to an MTU
  * @mtus: the HW MTU table
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 4faf4d0..6f21f24 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -154,6 +154,11 @@ struct in6_addr;
 int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
__be32 sip, __be16 sport, __be16 vlan,
unsigned int queue);
+int cxgb4_create_server6(const struct net_device *dev, unsigned int stid,
+const struct in6_addr *sip, __be16 sport,
+unsigned int queue);
+int cxgb4_remove_server(const struct net_device 

[PATCH 0/4] Add IPv6 support for iWARP

2013-06-06 Thread Vipul Pandya
Hi All,

This patch series adds IPv6 support for iWARP. It enables Chelsio's T4 and T5
adapters to transmit RDMA traffic over IPv6 addresses. It adds new APIs and
CPL messages in cxgb4 to support IPv6 operations and uses them in RDMA/cxgb4.

The patch series modifies the type of the local_addr and remote_addr fields in
struct iw_cm_id from struct sockaddr_in to struct sockaddr_storage to hold
IPv6 and IPv4 addresses uniformly. It changes the references to local_addr and
remote_addr in the RDMA/cxgb4, RDMA/cxgb3, RDMA/nes and amso drivers so that
build failure is avoided.

We would like to submit this patch series via Roland's infiniband tree for-next
branch. However the series requires the latest changes pushed in cxgb4 driver in
David Miller's net-next tree. We request Roland to merge cxgb4 and RDMA/cxgb4
drivers from net-next tree before applying this series on for-next branch. We
have created this patch series on top of net-next tree.

We have included all the maintainers of respective drivers. Kindly review the
change and let us know in case of any review comments.

Thanks,
Vipul Pandya

Steve Wise (1):
  RDMA/cma: Add IPv6 support for iWARP.

Vipul Pandya (3):
  cxgb4: Add routines to create and remove listening IPv6 servers
  cxgb4: Add CLIP support to store compressed IPv6 address
  RDMA/cxgb4: Add support for active and passive open connection with
IPv6 address

 drivers/infiniband/core/cma.c   |  65 +-
 drivers/infiniband/hw/amso1100/c2_ae.c  |  24 +-
 drivers/infiniband/hw/amso1100/c2_cm.c  |  17 +-
 drivers/infiniband/hw/cxgb3/iwch_cm.c   |  46 +-
 drivers/infiniband/hw/cxgb4/cm.c| 845 
 drivers/infiniband/hw/cxgb4/device.c| 116 +++-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h  |   4 +-
 drivers/infiniband/hw/nes/nes_cm.c  | 165 +++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 278 +++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |   5 +
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h |  17 +-
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  23 +
 include/rdma/iw_cm.h|   8 +-
 13 files changed, 1175 insertions(+), 438 deletions(-)

-- 
1.8.0



[PATCH 4/4] RDMA/cxgb4: Add support for active and passive open connection with IPv6 address

2013-06-06 Thread Vipul Pandya
Add new CPL messages, cpl_act_open_req6 and cpl_t5_act_open_req6, for
initiating active open connections with IPv6 addresses.

Use the LLD APIs cxgb4_create_server and cxgb4_create_server6 for initiating
passive open connections. Similarly, use cxgb4_remove_server to remove
passive open connections in place of listen_stop.

Add support for iWARP over VLAN devices and enable IPv6 support on VLAN devices.

Make use of import_ep in c4iw_reconnect.
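
As a hedged sketch (illustrative only; "start_listen" and its parameter
plumbing are invented here, while the cxgb4_create_server*/cxgb4_remove_server
signatures come from patch 2/4), a listen could be dispatched through the new
LLD entry points like this:

static int start_listen(struct net_device *netdev, unsigned int stid,
			struct sockaddr_storage *laddr, unsigned int queue)
{
	if (laddr->ss_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)laddr;

		/* IPv6 listener: firmware gets the full 128-bit address */
		return cxgb4_create_server6(netdev, stid, &sin6->sin6_addr,
					    sin6->sin6_port, queue);
	} else {
		struct sockaddr_in *sin = (struct sockaddr_in *)laddr;

		/* IPv4 listener, no VLAN tag */
		return cxgb4_create_server(netdev, stid, sin->sin_addr.s_addr,
					   sin->sin_port, 0, queue);
	}
}

Teardown is symmetric: cxgb4_remove_server(netdev, stid, queue,
laddr->ss_family == AF_INET6), where the bool selects the LISTSVR_IPV6 bit in
CPL_CLOSE_LISTSRV_REQ.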

Signed-off-by: Vipul Pandya vi...@chelsio.com
---
 drivers/infiniband/hw/cxgb4/cm.c   | 835 +
 drivers/infiniband/hw/cxgb4/device.c   | 116 +++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |   4 +-
 3 files changed, 635 insertions(+), 320 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index cdc443d..35d1cf1 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -44,6 +44,8 @@
 #include <net/netevent.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
 
 #include "iw_cxgb4.h"
 
@@ -333,19 +335,72 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int 
len, gfp_t gfp)
return skb;
 }
 
-static struct rtable *find_route(struct c4iw_dev *dev, __be32 local_ip,
+static struct net_device *get_real_dev(struct net_device *egress_dev)
+{
+   struct net_device *phys_dev = egress_dev;
+	if (egress_dev->priv_flags & IFF_802_1Q_VLAN)
+		phys_dev = vlan_dev_real_dev(egress_dev);
+	return phys_dev;
+}
+
+static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev)
+{
+	int i;
+
+	egress_dev = get_real_dev(egress_dev);
+	for (i = 0; i < dev->rdev.lldi.nports; i++)
+		if (dev->rdev.lldi.ports[i] == egress_dev)
+			return 1;
+   return 0;
+}
+
+static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip,
+__u8 *peer_ip, __be16 local_port,
+__be16 peer_port, u8 tos,
+__u32 sin6_scope_id)
+{
+   struct flowi6 fl6;
+   struct dst_entry *dst;
+
+	memset(&fl6, 0, sizeof(fl6));
+	memcpy(&fl6.daddr, peer_ip, 16);
+	memcpy(&fl6.saddr, local_ip, 16);
+	if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
+		fl6.flowi6_oif = sin6_scope_id;
+	dst = ip6_route_output(&init_net, NULL, &fl6);
+	if (!dst)
+		goto out;
+	if (!our_interface(dev, ip6_dst_idev(dst)->dev) &&
+	    !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) {
+		dst_release(dst);
+		dst = NULL;
+	}
+out:
+   return dst;
+}
+
+static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
 __be32 peer_ip, __be16 local_port,
 __be16 peer_port, u8 tos)
 {
struct rtable *rt;
struct flowi4 fl4;
+   struct neighbour *n;
 
 	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
 				   peer_port, local_port, IPPROTO_TCP,
 				   tos, 0);
 	if (IS_ERR(rt))
 		return NULL;
-	return rt;
+	n = dst_neigh_lookup(&rt->dst, &peer_ip);
+	if (!n)
+		return NULL;
+	if (!our_interface(dev, n->dev)) {
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	neigh_release(n);
+	return &rt->dst;
 }
 
 static void arp_failure_discard(void *handle, struct sk_buff *skb)
@@ -512,15 +567,28 @@ static int send_connect(struct c4iw_ep *ep)
 {
struct cpl_act_open_req *req;
struct cpl_t5_act_open_req *t5_req;
+   struct cpl_act_open_req6 *req6;
+   struct cpl_t5_act_open_req6 *t5_req6;
struct sk_buff *skb;
u64 opt0;
u32 opt2;
unsigned int mtu_idx;
int wscale;
-	int size = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
-			sizeof(struct cpl_act_open_req) :
-			sizeof(struct cpl_t5_act_open_req);
-	int wrlen = roundup(size, 16);
+	int wrlen;
+	int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+			sizeof(struct cpl_act_open_req) :
+			sizeof(struct cpl_t5_act_open_req);
+	int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+			sizeof(struct cpl_act_open_req6) :
+			sizeof(struct cpl_t5_act_open_req6);
+	struct sockaddr_in *la = (struct sockaddr_in *)&ep->com.local_addr;
+	struct sockaddr_in *ra = (struct sockaddr_in *)&ep->com.remote_addr;
+	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&ep->com.local_addr;
+	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr;
+
+	wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
+   roundup(sizev4, 16) :
+   

Re: [PATCH] osm_sa_mcmember_record.c Reduce number of error messages for the same event

2013-06-06 Thread Hal Rosenstock
On 6/3/2013 7:39 AM, Line Holen wrote:
 ERR msg 0x1B22 info is put into the error messages of the tests in the
 same subfunction. No information is lost, but you get one error message
 instead of two.
 
 Also changed a couple of messages to make them more end user friendly
 
 Signed-off-by: Line Holen line.ho...@oracle.com

Thanks. Applied.

-- Hal


Re: [PATCH] osm_sm_state_mgr.c Don't clear IS_SM bit when changing state to NOT_ACTIVE

2013-06-06 Thread Hal Rosenstock
On 6/5/2013 7:14 AM, Line Holen wrote:
 The SM is still operational even though it is in this state. Other
 SMs will not know about our presence when IS_SM is cleared and will
 therefore not attempt to enable us again.
 
 Signed-off-by: Line Holen line.ho...@oracle.com

Thanks. Applied.

-- Hal


[PATCH] mm: Revert pinned_vm braindamage

2013-06-06 Thread Peter Zijlstra

Patch bc3e53f682 ("mm: distinguish between mlocked and pinned pages")
broke RLIMIT_MEMLOCK.

Before that patch: mm_struct::locked_vm < RLIMIT_MEMLOCK; after that
patch we have: mm_struct::locked_vm < RLIMIT_MEMLOCK &&
mm_struct::pinned_vm < RLIMIT_MEMLOCK.
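
For concreteness, the accounting that patch introduced looks roughly like the
check in drivers/infiniband/core/umem.c (an abbreviated sketch from memory,
not a verbatim quote):

	locked     = npages + current->mm->pinned_vm;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
		return ERR_PTR(-ENOMEM);

That is, pinned_vm is now checked against RLIMIT_MEMLOCK on its own,
independently of whatever locked_vm already holds.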

The patch doesn't mention RLIMIT_MEMLOCK and thus also doesn't discuss
this (user visible) change in semantics. And thus we must assume it was
unintentional.

Since RLIMIT_MEMLOCK is very clearly a limit on the amount of pages the
process can 'lock' into memory it should very much include pinned pages
as well as mlock()ed pages. Neither can be paged.

Since nobody had anything constructive to say about the VM_PINNED
approach and the IB code hurts my head too much to make it work I
propose we revert said patch.


Once again the rationale; MLOCK(2) is part of POSIX Realtime Extension
(1003.1b-1993/1003.1i-1995). It states that the specified part of the
user address space should stay memory resident until either program exit
or a matching munlock() call.

This definition basically excludes major faults from happening on the
pages -- a major fault being one where IO needs to happen to obtain the
page content; the direct implication being that page content must remain
in memory.

Linux has taken this literal and made mlock()ed pages subject to page
migration (albeit only for the explicit move_pages() syscall; but it
would very much like to make them subject to implicit page migration for
the purpose of compaction etc.).

This view disregards the intention of the spec; since mlock() is part of
the realtime spec the intention is very much that the user address range
generate no faults; neither minor nor major -- any delay is
unacceptable.

This leaves the RT people unhappy -- therefore _if_ we continue with
this Linux specific interpretation of mlock() we must introduce new
syscalls that implement the intended mlock() semantics.

It was found that there are useful purposes for this weaker mlock(), a
rationale to indeed have two sets of syscalls. The weaker mlock() can be
used in the context of security -- where we avoid sensitive data being
written to disk, and in the context of userspace daemons that are part
of the IO path -- which would otherwise form IO deadlocks.

The proposed second set of primitives would be mpin() and munpin() and
would implement the intended mlock() semantics.
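
(For illustration only: such syscalls do not exist today, and the signatures
below are merely sketched by analogy with mlock(2)/munlock(2).

	int mpin(const void *addr, size_t len);
	int munpin(const void *addr, size_t len);

mpin() would additionally forbid minor faults and page migration on the given
range.)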

Such pages would not be migratable in any way (a possible
implementation would be to 'pin' the pages using an extra refcount on
the page frame). From the above we can see that any mpin()ed page is
also an mlock()ed page, since mpin() will disallow any fault, and thus
will also disallow major faults.

While we still lack the formal mpin() and munpin() syscalls there are a
number of sites that have similar 'side effects' and result in user
controlled 'pinning' of pages. Namely IB and perf.

For the purpose of RLIMIT_MEMLOCK we must use intent only as it is not
part of the formal spec. The only useful thing is to limit the amount of
pages a user can exempt from paging. This would therefore include all
pages either mlock()ed or mpin()ed.


Back to the patch; a resource limit must have a resource counter to
enact the limit upon. Before the patch this was mm_struct::locked_vm.
After the patch there is no such thing left.

The patch was proposed to 'fix' a double accounting problem where pages
are both pinned and mlock()ed. This was particularly visible when using
mlockall() on a process that uses either IB or perf.

I state that since mlockall() disables/invalidates RLIMIT_MEMLOCK the
actual resource counter value is irrelevant, and thus the reported
problem is a non-problem.

However, it would still be possible to observe weirdness in the very
unlikely event that a user would indeed call mlock() upon an address
range obtained from IB/perf. In this case he would be unduly constrained
and find his effective RLIMIT_MEMLOCK limit halved (at worst).

After the patch; that same user will find he effectively has double the
RLIMIT_MEMLOCK, since the IB/perf pages are not counted towards the same
limit as his mlock() pages are. It is far more likely a user will employ
mlock() on different ranges than those he received from IB/perf since he
already knows those aren't going anywhere.

Therefore the patch trades an unlikely weirdness for a much more likely
weirdness. So barring a proper solution I propose we revert.

I've yet to hear a coherent objection to the above. Christoph is always
quick to yell: 'but it fixes a double accounting issue' but is
completely deaf to the fact that he changed user visible semantics
without mention and regard.

Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 drivers/infiniband/core/umem.c | 8 
 drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++---
 drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++--
 fs/proc/task_mmu.c | 2 --
 include/linux/mm_types.h   | 1 -
 kernel/events/core.c   

[PATCH][TRIVIAL] Add attribute information to SA request error messages

2013-06-06 Thread Line Holen
Signed-off-by: Line Holen line.ho...@oracle.com

---

diff --git a/opensm/osm_sa_class_port_info.c b/opensm/osm_sa_class_port_info.c
index 2682505..69abad5 100644
--- a/opensm/osm_sa_class_port_info.c
+++ b/opensm/osm_sa_class_port_info.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -201,7 +202,7 @@ void osm_cpi_rcv_process(IN void *context, IN void *data)
 	/* we only support GET */
 	if (p_sa_mad->method != IB_MAD_METHOD_GET) {
 		OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 1403: "
-			"Unsupported Method (%s)\n",
+			"Unsupported Method (%s) for ClassPortInfo request\n",
 			ib_get_sa_method_str(p_sa_mad->method));
 		osm_sa_send_error(sa, p_madw, IB_SA_MAD_STATUS_REQ_INVALID);
 		goto Exit;
diff --git a/opensm/osm_sa_guidinfo_record.c b/opensm/osm_sa_guidinfo_record.c
index a00b257..8323b38 100644
--- a/opensm/osm_sa_guidinfo_record.c
+++ b/opensm/osm_sa_guidinfo_record.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2012 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -797,7 +798,7 @@ void osm_gir_rcv_process(IN void *ctx, IN void *data)
break;
default:
 		OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 5105: "
-			"Unsupported Method (%s)\n",
+			"Unsupported Method (%s) for GUIDInfoRecord request\n",
 			ib_get_sa_method_str(p_rcvd_mad->method));
 		osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR);
break;
diff --git a/opensm/osm_sa_lft_record.c b/opensm/osm_sa_lft_record.c
index 3be20f3..cf651ef 100644
--- a/opensm/osm_sa_lft_record.c
+++ b/opensm/osm_sa_lft_record.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2005,2008 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -197,7 +198,7 @@ void osm_lftr_rcv_process(IN void *ctx, IN void *data)
 	if (p_rcvd_mad->method != IB_MAD_METHOD_GET &&
 	    p_rcvd_mad->method != IB_MAD_METHOD_GETTABLE) {
 		OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 4408: "
-			"Unsupported Method (%s)\n",
+			"Unsupported Method (%s) for LFTRecord request\n",
 			ib_get_sa_method_str(p_rcvd_mad->method));
 		osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR);
 		goto Exit;
diff --git a/opensm/osm_sa_link_record.c b/opensm/osm_sa_link_record.c
index eec952d..5073f96 100644
--- a/opensm/osm_sa_link_record.c
+++ b/opensm/osm_sa_link_record.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -436,7 +437,7 @@ void osm_lr_rcv_process(IN void *context, IN void *data)
 	if (p_sa_mad->method != IB_MAD_METHOD_GET &&
 	    p_sa_mad->method != IB_MAD_METHOD_GETTABLE) {
 		OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 1804: "
-			"Unsupported Method (%s)\n",
+			"Unsupported Method (%s) for LinkRecord request\n",
 			ib_get_sa_method_str(p_sa_mad->method));
 		osm_sa_send_error(sa, p_madw, IB_MAD_STATUS_UNSUP_METHOD_ATTR);
 		goto Exit;
diff --git a/opensm/osm_sa_mft_record.c b/opensm/osm_sa_mft_record.c
index 2eca1f0..6f9763f 100644
--- a/opensm/osm_sa_mft_record.c
+++ b/opensm/osm_sa_mft_record.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * 

Re: [PATCH] mm: Revert pinned_vm braindamage

2013-06-06 Thread Christoph Lameter
On Thu, 6 Jun 2013, Peter Zijlstra wrote:

 Since RLIMIT_MEMLOCK is very clearly a limit on the amount of pages the
 process can 'lock' into memory it should very much include pinned pages
 as well as mlock()ed pages. Neither can be paged.

So we thought that this is the sum of the pages that a process has
mlocked. Initiated by the process and/or environment explicitly. A user
space initiated action.

 Since nobody had anything constructive to say about the VM_PINNED
 approach and the IB code hurts my head too much to make it work I
 propose we revert said patch.

I said that the use of a PIN page flag would allow correct accounting if
one wanted to interpret the limit the way you do.

 Once again the rationale; MLOCK(2) is part of POSIX Realtime Extension
 (1003.1b-1993/1003.1i-1995). It states that the specified part of the
 user address space should stay memory resident until either program exit
 or a matching munlock() call.

 This definition basically excludes major faults from happening on the
 pages -- a major fault being one where IO needs to happen to obtain the
 page content; the direct implication being that page content must remain
 in memory.

Exactly that is the definition.

 Linux has taken this literal and made mlock()ed pages subject to page
 migration (albeit only for the explicit move_pages() syscall; but it
 would very much like to make them subject to implicit page migration for
 the purpose of compaction etc.).

Page migration is not a page fault? The ability to move a process
completely (including its mlocked segments) is important for the manual
migration of process memory. That is what page migration was made for. If
mlocked pages are treated as pinned pages then the complete process can
no longer be moved from node to node.

 This view disregards the intention of the spec; since mlock() is part of
 the realtime spec the intention is very much that the user address range
 generate no faults; neither minor nor major -- any delay is
 unacceptable.

Where does it say that no faults are generated? Don't we generate COW on
mlocked ranges?

 This leaves the RT people unhappy -- therefore _if_ we continue with
 this Linux specific interpretation of mlock() we must introduce new
 syscalls that implement the intended mlock() semantics.

Intended means Peter's semantics?

 It was found that there are useful purposes for this weaker mlock(), a
 rationale to indeed have two sets of syscalls. The weaker mlock() can be
 used in the context of security -- where we avoid sensitive data being
 written to disk, and in the context of userspace deamons that are part
 of the IO path -- which would otherwise form IO deadlocks.

Migratable mlocked pages enable complete process migration between nodes
of a NUMA system for HPC workloads.

 The proposed second set of primitives would be mpin() and munpin() and
 would implement the intended mlock() semantics.

I agree that we need mpin and munpin. But they should not be called mlock
semantics.

 Such pages would not be migratable in any way (a possible
 implementation would be to 'pin' the pages using an extra refcount on
 the page frame). From the above we can see that any mpin()ed page is
 also an mlock()ed page, since mpin() will disallow any fault, and thus
 will also disallow major faults.

That cannot be so since mlocked pages need to be migratable.

 While we still lack the formal mpin() and munpin() syscalls there are a
 number of sites that have similar 'side effects' and result in user
 controlled 'pinning' of pages. Namely IB and perf.

Right thats why we need this.

 For the purpose of RLIMIT_MEMLOCK we must use intent only as it is not
 part of the formal spec. The only useful thing is to limit the amount of
 pages a user can exempt from paging. This would therefore include all
 pages either mlock()ed or mpin()ed.

RLIMIT_MEMLOCK is a limit on the pages that a process has mlocked into
memory. Pinning is not initiated by user space but by the kernel. Either
temporarily (page count increases are used all over the kernel for this)
or for a longer time frame (IB and Perf and likely more drivers that we have
not found yet).


   Back to the patch; a resource limit must have a resource counter to
 enact the limit upon. Before the patch this was mm_struct::locked_vm.
 After the patch there is no such thing left.

The limit was not checked correctly before the patch since pinned pages
were accounted as mlocked.

 I state that since mlockall() disables/invalidates RLIMIT_MEMLOCK the
 actual resource counter value is irrelevant, and thus the reported
 problem is a non-problem.

Where does it disable RLIMIT_MEMLOCK?

 However, it would still be possible to observe weirdness in the very
 unlikely event that a user would indeed call mlock() upon an address
 range obtained from IB/perf. In this case he would be unduly constrained
 and find his effective RLIMIT_MEMLOCK limit halved (at worst).

This is weird for other reasons as well since we are using two 

Re: Status of ummunot branch?

2013-06-06 Thread Jeff Squyres (jsquyres)
On Jun 5, 2013, at 10:52 PM, Haggai Eran hagg...@mellanox.com wrote:

 Haggai: A verb to resize a registration would probably be a helpful
 step. MPI could maintain one registration that covers the sbrk
 region and one registration that covers the heap, much easier than
 searching tables and things.
 
 That's a nice idea. Even without this verb, I think it is possible to
 develop a registration cache that covers those regions though. When you
 find out you have some part of your region not registered, you can
 register a new, larger region that covers everything you need. For new
 operations you only use the newer region. Once the previous, smaller
 region is not used, you de-register it.


I'm not sure what you mean.  Are you saying I should do something like this:

MPI_Init() {
    // the first MPI function invoked
    mpi_sbrk_save = sbrk(0);
    ibv_reg_mr(..., 0, mpi_sbrk_save, ...);
    ...
}

MPI_Send(buffer, ...) {
    if (mpi_sbrk_save != sbrk(0)) {
        mpi_sbrk_save = sbrk(0);
        ibv_rereg_mr(..., 0, mpi_sbrk_save, ...);
    }
    ...
}

I don't think this covers other memory regions, like those added via mmap, 
right?

-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/



[ANNOUNCE] dapl-2.0.37

2013-06-06 Thread Davis, Arlin R
Rupert/Vlad, please pull this package into OFED 3.5.2

Thanks, Arlin
--

Latest Packages (see ChangeLog for recent changes):

md5sum: 2e185e1aac2c09b3d9e529ee1aa1669e dapl-2.0.37.tar.gz 

For v2.0 package install RPM packages as follow: 

dapl-2.0.37-1 
dapl-utils-2.0.37-1 
dapl-devel-2.0.37-1 
dapl-debuginfo-2.0.37-1 

Summary of v2.0 changes: 

Release 2.0.37 fixes (OFED 3.5.2): 

common: add support for ia name during dat_ia_query 
common: dapl_os_atomic_inc/dec() not working as expected on ppc64 machines. 
dapltest: ppc64 endian issue with exchanged mem handle and address 


