date:20160407

Sets the bpf program represented by fd as an early filter in the rx path
of the netdev. The fd must have been created as BPF_PROG_TYPE_PHYS_DEV.
Providing a negative value as fd clears the program. Getting the fd back
via rtnl is not possible, therefore reading of this value merely
provides a bool whether the program is valid on the link or not.

Signed-off-by: Brenden Blanco 
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c | 12 
 2 files changed, 13 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9427f17..ccad234 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -155,6 +155,7 @@ enum {
IFLA_PROTO_DOWN,
IFLA_GSO_MAX_SEGS,
IFLA_GSO_MAX_SIZE,
+   IFLA_BPF_FD,
__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a75f7e9..96579ce 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -910,6 +910,7 @@ static noinline size_t if_nlmsg_size(const struct 
net_device *dev,
   + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
   + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
   + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+  + nla_total_size(4) /* IFLA_BPF_FD */
   + nla_total_size(1); /* IFLA_PROTO_DOWN */
 
 }
@@ -1242,6 +1243,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
 nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_changes)) ||
+   (dev->netdev_ops->ndo_bpf_get &&
+nla_put_s32(skb, IFLA_BPF_FD,
+dev->netdev_ops->ndo_bpf_get(dev))) ||
nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
 
@@ -1375,6 +1379,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PHYS_SWITCH_ID]   = { .type = NLA_BINARY, .len = 
MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN]   = { .type = NLA_U8 },
+   [IFLA_BPF_FD]   = { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2029,6 +2034,13 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
 
+   if (tb[IFLA_BPF_FD]) {
+   err = dev_change_bpf_fd(dev, nla_get_s32(tb[IFLA_BPF_FD]));
+   if (err)
+   goto errout;
+   status |= DO_SETLINK_NOTIFY;
+   }
+
 errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
-- 
2.8.0

[RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program

Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx4 driver. Since
bpf programs require a skb context to navigate the packet, build a
percpu fake skb with the minimal fields. This avoids the costly
allocation for packets that end up being dropped.

Since mlx4 is so far the only user of bpf_phys_dev_md, the build
function is defined locally.

Signed-off-by: Brenden Blanco 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 65 ++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 25 --
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  6 +++
 3 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index b4b258c..b228651 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -31,6 +31,7 @@
  *
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -1966,6 +1967,9 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
}
 
+   if (priv->prog)
+   bpf_prog_put(priv->prog);
+
 }
 
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
@@ -2078,6 +2082,11 @@ static int mlx4_en_change_mtu(struct net_device *dev, 
int new_mtu)
en_err(priv, "Bad MTU size:%d.\n", new_mtu);
return -EPERM;
}
+   if (priv->prog && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
+   en_err(priv, "MTU size:%d requires frags but bpf prog running",
+  new_mtu);
+   return -EOPNOTSUPP;
+   }
dev->mtu = new_mtu;
 
if (netif_running(dev)) {
@@ -2456,6 +2465,58 @@ static int mlx4_en_set_tx_maxrate(struct net_device 
*dev, int queue_index, u32 m
return err;
 }
 
+static DEFINE_PER_CPU(struct sk_buff, percpu_bpf_phys_dev_md);
+
+static void build_bpf_phys_dev_md(struct sk_buff *skb, void *data,
+ unsigned int length)
+{
+   /* data_len is intentionally not set here so that skb_is_nonlinear()
+* returns false
+*/
+
+   skb->len = length;
+   skb->head = data;
+   skb->data = data;
+}
+
+int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
+{
+   struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
+   int ret;
+
+   build_bpf_phys_dev_md(skb, data, length);
+
+   rcu_read_lock();
+   ret = BPF_PROG_RUN(prog, (void *)skb);
+   rcu_read_unlock();
+
+   return ret;
+}
+
+static int mlx4_bpf_set(struct net_device *dev, struct bpf_prog *prog)
+{
+   struct mlx4_en_priv *priv = netdev_priv(dev);
+   struct bpf_prog *old_prog;
+
+   if (priv->num_frags > 1)
+   return -EOPNOTSUPP;
+
+   old_prog = xchg(&priv->prog, prog);
+   if (old_prog) {
+   synchronize_net();
+   bpf_prog_put(old_prog);
+   }
+
+   return 0;
+}
+
+static bool mlx4_bpf_get(struct net_device *dev)
+{
+   struct mlx4_en_priv *priv = netdev_priv(dev);
+
+   return !!priv->prog;
+}
+
 static const struct net_device_ops mlx4_netdev_ops = {
.ndo_open   = mlx4_en_open,
.ndo_stop   = mlx4_en_close,
@@ -2486,6 +2547,8 @@ static const struct net_device_ops mlx4_netdev_ops = {
.ndo_features_check = mlx4_en_features_check,
 #endif
.ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate,
+   .ndo_bpf_set= mlx4_bpf_set,
+   .ndo_bpf_get= mlx4_bpf_get,
 };
 
 static const struct net_device_ops mlx4_netdev_ops_master = {
@@ -2524,6 +2587,8 @@ static const struct net_device_ops mlx4_netdev_ops_master 
= {
.ndo_features_check = mlx4_en_features_check,
 #endif
.ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate,
+   .ndo_bpf_set= mlx4_bpf_set,
+   .ndo_bpf_get= mlx4_bpf_get,
 };
 
 struct mlx4_en_bond {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 86bcfe5..287da02 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -748,6 +748,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
struct mlx4_en_rx_alloc *frags;
struct mlx4_en_rx_desc *rx_desc;
+   struct bpf_prog *prog;
struct sk_buff *skb;
int index;
int nr;
@@ -764,6 +765,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
if (budget <= 0)
return polled;
 
+   prog = READ_ONCE(priv->prog);
+
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
 * descriptor offset can be deduced from the CQE index instead of
 * reading 'cqe->index' */
@@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(

[RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx

Add two new set/get netdev ops for drivers implementing the
BPF_PROG_TYPE_PHYS_DEV filter.

Signed-off-by: Brenden Blanco 
---
 include/linux/netdevice.h | 13 +
 net/core/dev.c| 38 ++
 2 files changed, 51 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb4e508..3acf732 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -61,6 +61,7 @@ struct wireless_dev;
 /* 802.15.4 specific */
 struct wpan_dev;
 struct mpls_dev;
+struct bpf_prog;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
@@ -1102,6 +1103,14 @@ struct tc_to_netdev {
  * appropriate rx headroom value allows avoiding skb head copy on
  * forward. Setting a negative value resets the rx headroom to the
  * default value.
+ * int  (*ndo_bpf_set)(struct net_device *dev, struct bpf_prog *prog);
+ * This function is used to set or clear a bpf program used in the
+ * earliest stages of packet rx. The prog will have been loaded as
+ * BPF_PROG_TYPE_PHYS_DEV. The callee is responsible for calling
+ * bpf_prog_put on any old progs that are stored, but not on the passed
+ * in prog.
+ * bool (*ndo_bpf_get)(struct net_device *dev);
+ * This function is used to check if a bpf program is set on the device.
  *
  */
 struct net_device_ops {
@@ -1292,6 +1301,9 @@ struct net_device_ops {
   struct sk_buff *skb);
void(*ndo_set_rx_headroom)(struct net_device *dev,
   int needed_headroom);
+   int (*ndo_bpf_set)(struct net_device *dev,
+  struct bpf_prog *prog);
+   bool(*ndo_bpf_get)(struct net_device *dev);
 };
 
 /**
@@ -3251,6 +3263,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_bpf_fd(struct net_device *dev, int fd);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device 
*dev,
struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index 273f10d..7cf749c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -94,6 +94,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -6483,6 +6484,43 @@ int dev_change_proto_down(struct net_device *dev, bool 
proto_down)
 EXPORT_SYMBOL(dev_change_proto_down);
 
 /**
+ * dev_change_bpf_fd - set or clear a bpf program for a device
+ * @dev: device
+ * @fd: new program fd or negative value to clear
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_bpf_fd(struct net_device *dev, int fd)
+{
+   const struct net_device_ops *ops = dev->netdev_ops;
+   struct bpf_prog *prog = NULL;
+   int err;
+
+   if (!ops->ndo_bpf_set)
+   return -EOPNOTSUPP;
+   if (!netif_device_present(dev))
+   return -ENODEV;
+
+   if (fd >= 0) {
+   prog = bpf_prog_get(fd);
+   if (IS_ERR(prog))
+   return PTR_ERR(prog);
+
+   if (prog->type != BPF_PROG_TYPE_PHYS_DEV) {
+   bpf_prog_put(prog);
+   return -EINVAL;
+   }
+   }
+
+   err = ops->ndo_bpf_set(dev, prog);
+   if (err < 0 && prog)
+   bpf_prog_put(prog);
+
+   return err;
+}
+EXPORT_SYMBOL(dev_change_bpf_fd);
+
+/**
  * dev_new_index   -   allocate an ifindex
  * @net: the applicable net namespace
  *
-- 
2.8.0

[RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a
new context type, struct bpf_phys_dev_md, is exposed to userspace. So
far only expose the readable packet length, and only in read mode.

The PHYS_DEV name is chosen to represent that the program is meant only
for physical adapters, rather than all netdevs.

While the user visible struct is new, the underlying context must be
implemented as a minimal skb in order for the packet load_* instructions
to work. The skb filled in by the driver must have skb->len, skb->head,
and skb->data set, and skb->data_len == 0.

Signed-off-by: Brenden Blanco 
---
 include/uapi/linux/bpf.h | 14 ++
 kernel/bpf/verifier.c|  1 +
 net/core/filter.c| 68 
 3 files changed, 83 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70eda5a..3018d83 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -93,6 +93,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
+   BPF_PROG_TYPE_PHYS_DEV,
 };
 
 #define BPF_PSEUDO_MAP_FD  1
@@ -368,6 +369,19 @@ struct __sk_buff {
__u32 tc_classid;
 };
 
+/* user return codes for PHYS_DEV prog type */
+enum bpf_phys_dev_action {
+   BPF_PHYS_DEV_DROP,
+   BPF_PHYS_DEV_OK,
+};
+
+/* user accessible metadata for PHYS_DEV packet hook
+ * new fields must be added to the end of this structure
+ */
+struct bpf_phys_dev_md {
+   __u32 len;
+};
+
 struct bpf_tunnel_key {
__u32 tunnel_id;
union {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58792fe..877542d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
case BPF_PROG_TYPE_SOCKET_FILTER:
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+   case BPF_PROG_TYPE_PHYS_DEV:
return true;
default:
return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index e8486ba..4f73fb9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
}
 }
 
+static const struct bpf_func_proto *
+phys_dev_func_proto(enum bpf_func_id func_id)
+{
+   return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
/* check bounds */
@@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_phys_dev_access(int off, int size,
+  enum bpf_access_type type)
+{
+   if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
+   return false;
+
+   if (off % size != 0)
+   return false;
+
+   if (size != 4)
+   return false;
+
+   return true;
+}
+
+static bool phys_dev_is_valid_access(int off, int size,
+enum bpf_access_type type)
+{
+   if (type == BPF_WRITE)
+   return false;
+
+   switch (off) {
+   case offsetof(struct bpf_phys_dev_md, len):
+   break;
+   default:
+   return false;
+   }
+   return __is_valid_phys_dev_access(off, size, type);
+}
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
  int src_reg, int ctx_off,
  struct bpf_insn *insn_buf,
@@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum 
bpf_access_type type, int dst_reg,
return insn - insn_buf;
 }
 
+static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
+  int dst_reg, int src_reg,
+  int ctx_off,
+  struct bpf_insn *insn_buf,
+  struct bpf_prog *prog)
+{
+   struct bpf_insn *insn = insn_buf;
+
+   switch (ctx_off) {
+   case offsetof(struct bpf_phys_dev_md, len):
+   BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+   *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, len));
+   break;
+   }
+
+   return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops phys_dev_ops = {
+   .get_func_proto = phys_dev_func_proto,
+

[RFC PATCH v2 5/5] Add sample for adding simple drop program to link

Add a sample program that only drops packets at the
BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
observed single core rate is ~19.5Mpps.

Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.

$ perf record -a samples/bpf/netdrvx1 $(
---
 samples/bpf/Makefile|   4 ++
 samples/bpf/bpf_load.c  |   8 +++
 samples/bpf/netdrvx1_kern.c |  26 
 samples/bpf/netdrvx1_user.c | 155 
 4 files changed, 193 insertions(+)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 9959771..19bb926 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -20,6 +20,7 @@ hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
+hostprogs-y += netdrvx1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -40,6 +41,7 @@ offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
+netdrvx1-objs := bpf_load.o libbpf.o netdrvx1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -60,6 +62,7 @@ always += spintest_kern.o
 always += map_perf_test_kern.o
 always += test_overhead_tp_kern.o
 always += test_overhead_kprobe_kern.o
+always += netdrvx1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -80,6 +83,7 @@ HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
+HOSTLOADLIBES_netdrvx1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 022af71..c7b2245 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -50,6 +50,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+   bool is_phys_dev = strncmp(event, "phys_dev", 8) == 0;
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
@@ -66,6 +67,8 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
prog_type = BPF_PROG_TYPE_KPROBE;
} else if (is_tracepoint) {
prog_type = BPF_PROG_TYPE_TRACEPOINT;
+   } else if (is_phys_dev) {
+   prog_type = BPF_PROG_TYPE_PHYS_DEV;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -79,6 +82,9 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
+   if (is_phys_dev)
+   return 0;
+
if (is_socket) {
event += 6;
if (*event != '/')
@@ -319,6 +325,7 @@ int load_bpf_file(char *path)
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
+   memcmp(shname_prog, "phys_dev", 8) == 0 ||
memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, 
data_prog->d_size);
}
@@ -336,6 +343,7 @@ int load_bpf_file(char *path)
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
+   memcmp(shname, "phys_dev", 8) == 0 ||
memcmp(shname, "socket", 6) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
diff --git a/samples/bpf/netdrvx1_kern.c b/samples/bpf/netdrvx1_kern.c
new file mode 100644
index 000..849802d
--- /dev/null
+++ b/samples/bpf/netdrvx1_kern.c
@@ -0,0 +1,26 @@
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+   .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(long),
+   .max_entries = 256,
+};
+
+SEC("phys_dev1")
+int bpf_prog1(struct bpf_phys_dev_md *ctx)
+{
+   int index = load_byte(ctx, ETH_HLEN + offsetof(struct iphdr, protocol));
+   long *value;
+
+   value = bpf_map_lookup_elem(&dropcnt, &index);
+   if (value)
+   *value += 1;
+
+   return BPF_PHYS_DEV_DROP;
+}
+char _licens

[RFC PATCH v2 0/5] Add driver bpf hook for early packet drop

This patch set introduces new infrastructure for programmatically
processing packets in the earliest stages of rx, as part of an effort
others are calling Express Data Path (XDP) [1]. Start this effort by
introducing a new bpf program type for early packet filtering, before even
an skb has been allocated.

With this, hope to enable line rate filtering, with this initial
implementation providing drop/allow action only.

Patch 1 introduces the new prog type and helpers for validating the bpf
program. A new userspace struct is defined containing only len as a field,
with others to follow in the future.
In patch 2, create a new ndo to pass the fd to support drivers. 
In patch 3, expose a new rtnl option to userspace.
In patch 4, enable support in mlx4 driver. No skb allocation is required,
instead a static percpu skb is kept in the driver and minimally initialized
for each driver frag.
In patch 5, create a sample drop and count program. With single core,
achieved ~20 Mpps drop rate on a 40G mlx4. This includes packet data
access, bpf array lookup, and increment.

Interestingly, accessing packet data from the program did not have a
noticeable impact on performance. Even so, future enhancements to
prefetching / batching / page-allocs should hopefully improve the
performance in this path.

[1] https://github.com/iovisor/bpf-docs/blob/master/Express_Data_Path.pdf

v2:
  1/5: Drop xdp from types, instead consistently use bpf_phys_dev_.
Introduce enum for return values from phys_dev hook.
  2/5: Move prog->type check to just before invoking ndo.
Change ndo to take a bpf_prog * instead of fd.
Add ndo_bpf_get rather than keeping a bool in the netdev struct.
  3/5: Use ndo_bpf_get to fetch bool.
  4/5: Enforce that only 1 frag is ever given to bpf prog by disallowing
mtu to increase beyond FRAG_SZ0 when bpf prog is running, or conversely
to set a bpf prog when priv->num_frags > 1.
Rename pseudo_skb to bpf_phys_dev_md.
Implement ndo_bpf_get.
Add dma sync just before invoking prog.
Check for explicit bpf return code rather than nonzero.
Remove increment of rx_dropped.
  5/5: Use explicit bpf return code in example.
Update commit log with higher pps numbers.

Brenden Blanco (5):
  bpf: add PHYS_DEV prog type for early driver filter
  net: add ndo to set bpf prog in adapter rx
  rtnl: add option for setting link bpf prog
  mlx4: add support for fast rx drop bpf program
  Add sample for adding simple drop program to link

 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  65 +++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |  25 +++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   6 +
 include/linux/netdevice.h  |  13 +++
 include/uapi/linux/bpf.h   |  14 +++
 include/uapi/linux/if_link.h   |   1 +
 kernel/bpf/verifier.c  |   1 +
 net/core/dev.c |  38 ++
 net/core/filter.c  |  68 +++
 net/core/rtnetlink.c   |  12 ++
 samples/bpf/Makefile   |   4 +
 samples/bpf/bpf_load.c |   8 ++
 samples/bpf/netdrvx1_kern.c|  26 +
 samples/bpf/netdrvx1_user.c| 155 +
 14 files changed, 432 insertions(+), 4 deletions(-)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

-- 
2.8.0

[PATCH net] mpls: find_outdev: check for err ptr in addition to NULL check

2016-04-07 Thread Roopa Prabhu

From: Roopa Prabhu 

find_outdev calls inet{,6}_fib_lookup_dev() or dev_get_by_index() to
find the output device. In case of an error, inet{,6}_fib_lookup_dev()
returns error pointer and dev_get_by_index() returns NULL. But the function
only checks for NULL and thus can end up calling dev_put on an ERR_PTR.
This patch adds an additional check for err ptr after the NULL check.

Before: Trying to add an mpls route with no oif from user, no available
path to 10.1.1.8 and no default route:
$ip -f mpls route add 100 as 200 via inet 10.1.1.8
[  822.337195] BUG: unable to handle kernel NULL pointer dereference at
03a3
[  822.340033] IP: [] mpls_nh_assign_dev+0x10b/0x182
[  822.340033] PGD 1db38067 PUD 1de9e067 PMD 0
[  822.340033] Oops:  [#1] SMP
[  822.340033] Modules linked in:
[  822.340033] CPU: 0 PID: 11148 Comm: ip Not tainted 4.5.0-rc7+ #54
[  822.340033] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
04/01/2014
[  822.340033] task: 88001db82580 ti: 88001dad4000 task.ti:
88001dad4000
[  822.340033] RIP: 0010:[]  []
mpls_nh_assign_dev+0x10b/0x182
[  822.340033] RSP: 0018:88001dad7a88  EFLAGS: 00010282
[  822.340033] RAX: ff9b RBX: ff9b RCX:
0002
[  822.340033] RDX: ff9b RSI: 0008 RDI:

[  822.340033] RBP: 88001ddc9ea0 R08: 88001e9f1768 R09:

[  822.340033] R10: 88001d9c1100 R11: 88001e3c89f0 R12:
8187e0c0
[  822.340033] R13: 8187e0c0 R14: 88001ddc9e80 R15:
0004
[  822.340033] FS:  7ff9ed798700() GS:88001fc0()
knlGS:
[  822.340033] CS:  0010 DS:  ES:  CR0: 80050033
[  822.340033] CR2: 03a3 CR3: 1de89000 CR4:
06f0
[  822.340033] Stack:
[  822.340033]   0001 

[  822.340033]   0801010a 

[  822.340033]  0004 8148749b 8187e0c0
001c
[  822.340033] Call Trace:
[  822.340033]  [] ? mpls_rt_alloc+0x2b/0x3e
[  822.340033]  [] ? mpls_rtm_newroute+0x358/0x3e2
[  822.340033]  [] ? get_page+0x5/0xa
[  822.340033]  [] ? rtnetlink_rcv_msg+0x17e/0x191
[  822.340033]  [] ? __kmalloc_track_caller+0x8c/0x9e
[  822.340033]  [] ?
rht_key_hashfn.isra.20.constprop.57+0x14/0x1f
[  822.340033]  [] ? __rtnl_unlock+0xc/0xc
[  822.340033]  [] ? netlink_rcv_skb+0x36/0x82
[  822.340033]  [] ? rtnetlink_rcv+0x1f/0x28
[  822.340033]  [] ? netlink_unicast+0x106/0x189
[  822.340033]  [] ? netlink_sendmsg+0x27f/0x2c8
[  822.340033]  [] ? sock_sendmsg_nosec+0x10/0x1b
[  822.340033]  [] ? ___sys_sendmsg+0x182/0x1e3
[  822.340033]  [] ?
__alloc_pages_nodemask+0x11c/0x1e4
[  822.340033]  [] ? PageAnon+0x5/0xd
[  822.340033]  [] ? __page_set_anon_rmap+0x45/0x52
[  822.340033]  [] ? get_page+0x5/0xa
[  822.340033]  [] ? __lru_cache_add+0x1a/0x3a
[  822.340033]  [] ? current_kernel_time64+0x9/0x30
[  822.340033]  [] ? __sys_sendmsg+0x3c/0x5a
[  822.340033]  [] ?
entry_SYSCALL_64_fastpath+0x12/0x6a
[  822.340033] Code: 83 08 04 00 00 65 ff 00 48 8b 3c 24 e8 40 7c f2 ff
eb 13 48 c7 c3 9f ff ff ff eb 0f 89 ce e8 f1 ae f1 ff 48 89 c3 48 85 db
74 15 <48> 8b 83 08 04 00 00 65 ff 08 48 81 fb 00 f0 ff ff 76 0d eb 07
[  822.340033] RIP  [] mpls_nh_assign_dev+0x10b/0x182
[  822.340033]  RSP 
[  822.340033] CR2: 03a3
[  822.435363] ---[ end trace 98cc65e6f6b8bf11 ]---

After patch:
$ip -f mpls route add 100 as 200 via inet 10.1.1.8
RTNETLINK answers: Network is unreachable

Signed-off-by: Roopa Prabhu 
Reported-by: David Miller 
---
I completely missed this during negative testing of my original
submission because of a default route always present. It was always
picking the default device in the failure case and no crash. This time
tested with no default route.

Thanks for catching it.

 net/mpls/af_mpls.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index b18c5ed..0b80a71 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -543,6 +543,9 @@ static struct net_device *find_outdev(struct net *net,
if (!dev)
return ERR_PTR(-ENODEV);
 
+   if (IS_ERR(dev))
+   return dev;
+
/* The caller is holding rtnl anyways, so release the dev reference */
dev_put(dev);
 
-- 
1.9.1

[net-next 07/18] ixgbe: Add support for single-port X550 device

From: Mark Rustad 

Add support for a single-port X550 device.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index ee43a38..8c560da 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -97,6 +97,7 @@ bool ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw)
case IXGBE_DEV_ID_X540T:
case IXGBE_DEV_ID_X540T1:
case IXGBE_DEV_ID_X550T:
+   case IXGBE_DEV_ID_X550T1:
case IXGBE_DEV_ID_X550EM_X_10G_T:
supported = true;
break;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d5509cc..9594438 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -125,6 +125,7 @@ static const struct pci_device_id ixgbe_pci_tbl[] = {
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_SF_QP), board_82599 },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540T1), board_X540 },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550T), board_X550},
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550T1), board_X550},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_KX4), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_KR), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_10G_T), board_X550EM_x},
@@ -8983,6 +8984,7 @@ int ixgbe_wol_supported(struct ixgbe_adapter *adapter, 
u16 device_id,
case IXGBE_DEV_ID_X540T:
case IXGBE_DEV_ID_X540T1:
case IXGBE_DEV_ID_X550T:
+   case IXGBE_DEV_ID_X550T1:
case IXGBE_DEV_ID_X550EM_X_KX4:
case IXGBE_DEV_ID_X550EM_X_KR:
case IXGBE_DEV_ID_X550EM_X_10G_T:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 7ae4bbd..bd95be2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -75,6 +75,7 @@
 #define IXGBE_DEV_ID_X540T1  0x1560
 
 #define IXGBE_DEV_ID_X550T 0x1563
+#define IXGBE_DEV_ID_X550T10x15D1
 #define IXGBE_DEV_ID_X550EM_X_KX4  0x15AA
 #define IXGBE_DEV_ID_X550EM_X_KR   0x15AB
 #define IXGBE_DEV_ID_X550EM_X_SFP  0x15AC
-- 
2.5.5

[net-next 16/18] ixgbe: Add support for SGMII backplane interface

From: Mark Rustad 

Add support for an SGMII backplane interface.

Signed-off-by: Mark Rustad 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |  9 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 58 +++
 3 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 93db4bf..c96af3f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -132,6 +132,8 @@ static const struct pci_device_id ixgbe_pci_tbl[] = {
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_10G_T), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_SFP), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP_N), board_x550em_a },
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SGMII), board_x550em_a },
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SGMII_L), board_x550em_a },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP), board_x550em_a },
/* required last entry */
{0, }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index fbbc132..50e8bc0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -82,6 +82,8 @@
 #define IXGBE_DEV_ID_X550EM_X_10G_T0x15AD
 #define IXGBE_DEV_ID_X550EM_X_1G_T 0x15AE
 #define IXGBE_DEV_ID_X550EM_A_SFP_N0x15C4
+#define IXGBE_DEV_ID_X550EM_A_SGMII0x15C6
+#define IXGBE_DEV_ID_X550EM_A_SGMII_L  0x15C7
 #define IXGBE_DEV_ID_X550EM_A_SFP  0x15CE
 
 /* VF Device IDs */
@@ -3065,6 +3067,7 @@ enum ixgbe_phy_type {
ixgbe_phy_qsfp_intel,
ixgbe_phy_qsfp_unknown,
ixgbe_phy_sfp_unsupported,
+   ixgbe_phy_sgmii,
ixgbe_phy_generic
 };
 
@@ -3582,6 +3585,7 @@ struct ixgbe_info {
 #define IXGBE_KRM_LINK_CTRL_1(P)   ((P) ? 0x820C : 0x420C)
 #define IXGBE_KRM_AN_CNTL_1(P) ((P) ? 0x822C : 0x422C)
 #define IXGBE_KRM_AN_CNTL_8(P) ((P) ? 0x8248 : 0x4248)
+#define IXGBE_KRM_SGMII_CTRL(P)((P) ? 0x82A0 : 0x42A0)
 #define IXGBE_KRM_DSP_TXFFE_STATE_4(P) ((P) ? 0x8634 : 0x4634)
 #define IXGBE_KRM_DSP_TXFFE_STATE_5(P) ((P) ? 0x8638 : 0x4638)
 #define IXGBE_KRM_RX_TRN_LINKUP_CTRL(P)((P) ? 0x8B00 : 0x4B00)
@@ -3595,6 +3599,8 @@ struct ixgbe_info {
 #define IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_MASK(0x7 << 8)
 #define IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_1G  (2 << 8)
 #define IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_10G (4 << 8)
+#define IXGBE_KRM_LINK_CTRL_1_TETH_AN_SGMII_EN BIT(12)
+#define IXGBE_KRM_LINK_CTRL_1_TETH_AN_CLAUSE_37_EN BIT(13)
 #define IXGBE_KRM_LINK_CTRL_1_TETH_AN_FEC_REQ  (1 << 14)
 #define IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_FEC  (1 << 15)
 #define IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_KX   (1 << 16)
@@ -3610,6 +3616,9 @@ struct ixgbe_info {
 #define IXGBE_KRM_AN_CNTL_8_LINEAR BIT(0)
 #define IXGBE_KRM_AN_CNTL_8_LIMITING   BIT(1)
 
+#define IXGBE_KRM_SGMII_CTRL_MAC_TAR_FORCE_100_D   BIT(12)
+#define IXGBE_KRM_SGMII_CTRL_MAC_TAR_FORCE_10_DBIT(19)
+
 #define IXGBE_KRM_DSP_TXFFE_STATE_C0_EN(1 << 6)
 #define IXGBE_KRM_DSP_TXFFE_STATE_CP1_CN1_EN   (1 << 15)
 #define IXGBE_KRM_DSP_TXFFE_STATE_CO_ADAPT_EN  (1 << 16)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index a9d86b3..81e5d54 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -1558,6 +1558,57 @@ static s32 ixgbe_check_link_t_X550em(struct ixgbe_hw *hw,
return 0;
 }
 
+/**
+ * ixgbe_setup_sgmii - Set up link for sgmii
+ * @hw: pointer to hardware structure
+ */
+static s32
+ixgbe_setup_sgmii(struct ixgbe_hw *hw, __always_unused ixgbe_link_speed speed,
+ __always_unused bool autoneg_wait_to_complete)
+{
+   struct ixgbe_mac_info *mac = &hw->mac;
+   u32 lval, sval;
+   s32 rc;
+
+   rc = mac->ops.read_iosf_sb_reg(hw,
+  IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id),
+  IXGBE_SB_IOSF_TARGET_KR_PHY, &lval);
+   if (rc)
+   return rc;
+
+   lval &= ~IXGBE_KRM_LINK_CTRL_1_TETH_AN_ENABLE;
+   lval &= ~IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_MASK;
+   lval |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_SGMII_EN;
+   lval |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_CLAUSE_37_EN;
+   lval |= IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_1G;
+   rc = mac->ops.write_iosf_sb_reg(hw,
+   IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id),
+   IXGBE_SB_IOSF_TARGET_KR_PHY, lval);
+   if (rc)
+   return rc;
+
+   rc =

[net-next 08/18] ixgbe: Add definitions for x550em_a 10G MAC

From: Mark Rustad 

Add definitions for a x550em_a 10G MAC device with a native SFP
interface.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index bd95be2..b505da3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -81,16 +81,18 @@
 #define IXGBE_DEV_ID_X550EM_X_SFP  0x15AC
 #define IXGBE_DEV_ID_X550EM_X_10G_T0x15AD
 #define IXGBE_DEV_ID_X550EM_X_1G_T 0x15AE
+#define IXGBE_DEV_ID_X550EM_A_SFP_N0x15C4
+
+/* VF Device IDs */
 #define IXGBE_DEV_ID_X550_VF_HV0x1564
 #define IXGBE_DEV_ID_X550_VF   0x1565
 #define IXGBE_DEV_ID_X550EM_X_VF   0x15A8
 #define IXGBE_DEV_ID_X550EM_X_VF_HV0x15A9
-
-/* VF Device IDs */
 #define IXGBE_DEV_ID_82599_VF   0x10ED
 #define IXGBE_DEV_ID_X540_VF0x1515
 #define IXGBE_DEV_ID_X550_VF   0x1565
 #define IXGBE_DEV_ID_X550EM_X_VF   0x15A8
+#define IXGBE_DEV_ID_X550EM_A_VF   0x15C5
 
 #define IXGBE_CAT(r, m)IXGBE_##r##_##m
 
@@ -129,7 +131,7 @@
 #define IXGBE_FLA_X540 IXGBE_FLA_8259X
 #define IXGBE_FLA_X550 IXGBE_FLA_8259X
 #define IXGBE_FLA_X550EM_x IXGBE_FLA_8259X
-#define IXGBE_FLA_X550EM_a 0x15F6C
+#define IXGBE_FLA_X550EM_a 0x15F68
 #define IXGBE_FLA(_hw) IXGBE_BY_MAC((_hw), FLA)
 #define IXGBE_EEMNGCTL  0x10110
 #define IXGBE_EEMNGDATA 0x10114
@@ -369,6 +371,8 @@ struct ixgbe_thermal_sensor_data {
 #define IXGBE_MRCTL(_i)  (0x0F600 + ((_i) * 4))
 #define IXGBE_VMRVLAN(_i)(0x0F610 + ((_i) * 4))
 #define IXGBE_VMRVM(_i)  (0x0F630 + ((_i) * 4))
+#define IXGBE_WQBR_RX(_i)(0x2FB0 + ((_i) * 4)) /* 4 total */
+#define IXGBE_WQBR_TX(_i)(0x8130 + ((_i) * 4)) /* 4 total */
 #define IXGBE_L34T_IMIR(_i)  (0x0E800 + ((_i) * 4)) /*128 of these (0-127)*/
 #define IXGBE_RXFECCERR0 0x051B8
 #define IXGBE_LLITHRESH 0x0EC90
@@ -440,6 +444,8 @@ struct ixgbe_thermal_sensor_data {
 #define IXGBE_DMATXCTL_TE   0x1 /* Transmit Enable */
 #define IXGBE_DMATXCTL_NS   0x2 /* No Snoop LSO hdr buffer */
 #define IXGBE_DMATXCTL_GDV  0x8 /* Global Double VLAN */
+#define IXGBE_DMATXCTL_MDP_EN   0x20 /* Bit 5 */
+#define IXGBE_DMATXCTL_MBINTEN  0x40 /* Bit 6 */
 #define IXGBE_DMATXCTL_VT_SHIFT 16  /* VLAN EtherType */
 
 #define IXGBE_PFDTXGSWC_VT_LBEN 0x1 /* Local L2 VT switch enable */
@@ -548,7 +554,6 @@ struct ixgbe_thermal_sensor_data {
 #define IXGBE_TDPT2TCCR(_i) (0x0CD20 + ((_i) * 4)) /* 8 of these (0-7) */
 #define IXGBE_TDPT2TCSR(_i) (0x0CD40 + ((_i) * 4)) /* 8 of these (0-7) */
 
-
 /* Security Control Registers */
 #define IXGBE_SECTXCTRL 0x08800
 #define IXGBE_SECTXSTAT 0x08804
@@ -1197,6 +1202,8 @@ struct ixgbe_thermal_sensor_data {
 #define IXGBE_RDRXCTL_RSCLLIDIS 0x0080 /* Disable RSC compl on LLI */
 #define IXGBE_RDRXCTL_RSCACKC   0x0200 /* must set 1 when RSC enabled 
*/
 #define IXGBE_RDRXCTL_FCOE_WRFIX0x0400 /* must set 1 when RSC enabled 
*/
+#define IXGBE_RDRXCTL_MBINTEN   0x1000
+#define IXGBE_RDRXCTL_MDP_EN0x2000
 
 /* RQTC Bit Masks and Shifts */
 #define IXGBE_RQTC_SHIFT_TC(_i) ((_i) * 4)
@@ -1951,7 +1958,9 @@ enum {
 #define IXGBE_GSSR_PHY1_SM 0x0004
 #define IXGBE_GSSR_MAC_CSR_SM  0x0008
 #define IXGBE_GSSR_FLASH_SM0x0010
+#define IXGBE_GSSR_NVM_UPDATE_SM   0x0200
 #define IXGBE_GSSR_SW_MNG_SM   0x0400
+#define IXGBE_GSSR_TOKEN_SM0x4000 /* SW bit for shared access */
 #define IXGBE_GSSR_SHARED_I2C_SM   0x1806 /* Wait for both phys & I2Cs */
 #define IXGBE_GSSR_I2C_MASK0x1800
 #define IXGBE_GSSR_NVM_PHY_MASK0xF
@@ -2524,6 +2533,10 @@ enum ixgbe_fdir_pballoc_type {
 #define IXGBE_FDIRCTRL_REPORT_STATUS_ALWAYS 0x0080
 #define IXGBE_FDIRCTRL_DROP_Q_SHIFT 8
 #define IXGBE_FDIRCTRL_FLEX_SHIFT   16
+#define IXGBE_FDIRCTRL_DROP_NO_MATCH   0x8000
+#define IXGBE_FDIRCTRL_FILTERMODE_SHIFT21
+#define IXGBE_FDIRCTRL_FILTERMODE_MACVLAN  0x0001 /* bit 23:21, 001b */
+#define IXGBE_FDIRCTRL_FILTERMODE_CLOUD0x0002 /* bit 23:21, 
010b */
 #define IXGBE_FDIRCTRL_SEARCHLIM0x0080
 #define IXGBE_FDIRCTRL_MAX_LENGTH_SHIFT 24
 #define IXGBE_FDIRCTRL_FULL_THRESH_MASK 0xF000
@@ -2982,6 +2995,7 @@ enum ixgbe_mac_type {
ixgbe_mac_X540,
ixgbe_mac_X550,
ixgbe_mac_X550EM_x,
+   ixgbe_mac_x550em_a,
ixgbe_num_macs
 };
 
-- 
2.5.5

[net-next 17/18] ixgbe: Add KR backplane support for x550em_a

From: Mark Rustad 

Add support for x550em_a-based KR backplane devices.

Signed-off-by: Mark Rustad 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |  2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 18 ++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c96af3f..1a7bfcf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -131,6 +131,8 @@ static const struct pci_device_id ixgbe_pci_tbl[] = {
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_KR), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_10G_T), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_SFP), board_X550EM_x},
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_KR), board_x550em_a },
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_KR_L), board_x550em_a },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP_N), board_x550em_a },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SGMII), board_x550em_a },
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SGMII_L), board_x550em_a },
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 50e8bc0..ba3b837 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -81,6 +81,8 @@
 #define IXGBE_DEV_ID_X550EM_X_SFP  0x15AC
 #define IXGBE_DEV_ID_X550EM_X_10G_T0x15AD
 #define IXGBE_DEV_ID_X550EM_X_1G_T 0x15AE
+#define IXGBE_DEV_ID_X550EM_A_KR   0x15C2
+#define IXGBE_DEV_ID_X550EM_A_KR_L 0x15C3
 #define IXGBE_DEV_ID_X550EM_A_SFP_N0x15C4
 #define IXGBE_DEV_ID_X550EM_A_SGMII0x15C6
 #define IXGBE_DEV_ID_X550EM_A_SGMII_L  0x15C7
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 81e5d54..c71e93e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -291,6 +291,8 @@ static s32 ixgbe_identify_phy_x550em(struct ixgbe_hw *hw)
hw->phy.type = ixgbe_phy_x550em_kx4;
break;
case IXGBE_DEV_ID_X550EM_X_KR:
+   case IXGBE_DEV_ID_X550EM_A_KR:
+   case IXGBE_DEV_ID_X550EM_A_KR_L:
hw->phy.type = ixgbe_phy_x550em_kr;
break;
case IXGBE_DEV_ID_X550EM_X_1G_T:
@@ -1984,13 +1986,17 @@ static s32 ixgbe_setup_kx4_x550em(struct ixgbe_hw *hw)
return status;
 }
 
-/**  ixgbe_setup_kr_x550em - Configure the KR PHY.
- *   @hw: pointer to hardware structure
+/**
+ * ixgbe_setup_kr_x550em - Configure the KR PHY
+ * @hw: pointer to hardware structure
  *
- *   Configures the integrated KR PHY.
+ * Configures the integrated KR PHY for X550EM_x.
  **/
 static s32 ixgbe_setup_kr_x550em(struct ixgbe_hw *hw)
 {
+   if (hw->mac.type != ixgbe_mac_X550EM_x)
+   return 0;
+
return ixgbe_setup_kr_speed_x550em(hw, hw->phy.autoneg_advertised);
 }
 
@@ -2196,7 +2202,9 @@ static s32 ixgbe_setup_fc_x550em(struct ixgbe_hw *hw)
return IXGBE_ERR_CONFIG;
}
 
-   if (hw->device_id != IXGBE_DEV_ID_X550EM_X_KR)
+   if (hw->device_id != IXGBE_DEV_ID_X550EM_X_KR &&
+   hw->device_id != IXGBE_DEV_ID_X550EM_A_KR &&
+   hw->device_id != IXGBE_DEV_ID_X550EM_A_KR_L)
return 0;
 
rc = hw->mac.ops.read_iosf_sb_reg(hw,
@@ -2437,6 +2445,8 @@ static enum ixgbe_media_type 
ixgbe_get_media_type_X550em(struct ixgbe_hw *hw)
/* Fallthrough */
case IXGBE_DEV_ID_X550EM_X_KR:
case IXGBE_DEV_ID_X550EM_X_KX4:
+   case IXGBE_DEV_ID_X550EM_A_KR:
+   case IXGBE_DEV_ID_X550EM_A_KR_L:
media_type = ixgbe_media_type_backplane;
break;
case IXGBE_DEV_ID_X550EM_X_SFP:
-- 
2.5.5

[net-next 12/18] ixgbe: Read and set instance id

From: Mark Rustad 

Read the instance number from EEPROM and save it for later use.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 8 
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h   | 5 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 11450bd..737443a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -682,6 +682,7 @@ s32 ixgbe_get_bus_info_generic(struct ixgbe_hw *hw)
 void ixgbe_set_lan_id_multi_port_pcie(struct ixgbe_hw *hw)
 {
struct ixgbe_bus_info *bus = &hw->bus;
+   u16 ee_ctrl_4;
u32 reg;
 
reg = IXGBE_READ_REG(hw, IXGBE_STATUS);
@@ -692,6 +693,13 @@ void ixgbe_set_lan_id_multi_port_pcie(struct ixgbe_hw *hw)
reg = IXGBE_READ_REG(hw, IXGBE_FACTPS(hw));
if (reg & IXGBE_FACTPS_LFS)
bus->func ^= 0x1;
+
+   /* Get MAC instance from EEPROM for configuring CS4227 */
+   if (hw->device_id == IXGBE_DEV_ID_X550EM_A_SFP) {
+   hw->eeprom.ops.read(hw, IXGBE_EEPROM_CTRL_4, &ee_ctrl_4);
+   bus->instance_id = (ee_ctrl_4 & IXGBE_EE_CTRL_4_INST_ID) >>
+  IXGBE_EE_CTRL_4_INST_ID_SHIFT;
+   }
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index ced38c1..a5c789e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -82,6 +82,7 @@
 #define IXGBE_DEV_ID_X550EM_X_10G_T0x15AD
 #define IXGBE_DEV_ID_X550EM_X_1G_T 0x15AE
 #define IXGBE_DEV_ID_X550EM_A_SFP_N0x15C4
+#define IXGBE_DEV_ID_X550EM_A_SFP  0x15CE
 
 /* VF Device IDs */
 #define IXGBE_DEV_ID_X550_VF_HV0x1564
@@ -2000,6 +2001,9 @@ enum {
 #define IXGBE_PBANUM_PTR_GUARD 0xFAFA
 #define IXGBE_EEPROM_CHECKSUM  0x3F
 #define IXGBE_EEPROM_SUM   0xBABA
+#define IXGBE_EEPROM_CTRL_40x45
+#define IXGBE_EE_CTRL_4_INST_ID0x10
+#define IXGBE_EE_CTRL_4_INST_ID_SHIFT  4
 #define IXGBE_PCIE_ANALOG_PTR  0x03
 #define IXGBE_ATLAS0_CONFIG_PTR0x04
 #define IXGBE_PHY_PTR  0x04
@@ -3175,6 +3179,7 @@ struct ixgbe_bus_info {
 
u8 func;
u8 lan_id;
+   u8 instance_id;
 };
 
 /* Flow control parameters */
-- 
2.5.5

[net-next 10/18] ixgbe: Add support for x550em_a 10G MAC type

From: Mark Rustad 

Add support for x550em_a 10G MAC type to the ixgbe driver. The new
MAC includes new firmware commands that need to be used to control
PHY and IOSF access, so that support is also added. The interface
supported is a native SFP+ interface.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h |   3 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c   |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c  |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   9 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c |   3 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c|  43 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c |   2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h|  38 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c| 208 ++-
 11 files changed, 311 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 4590fab..d10ed62 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -817,6 +817,7 @@ static inline u8 ixgbe_max_rss_indices(struct ixgbe_adapter 
*adapter)
return IXGBE_MAX_RSS_INDICES;
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
return IXGBE_MAX_RSS_INDICES_X550;
default:
return 0;
@@ -860,6 +861,7 @@ enum ixgbe_boards {
board_X540,
board_X550,
board_X550EM_x,
+   board_x550em_a,
 };
 
 extern const struct ixgbe_info ixgbe_82598_info;
@@ -867,6 +869,7 @@ extern const struct ixgbe_info ixgbe_82599_info;
 extern const struct ixgbe_info ixgbe_X540_info;
 extern const struct ixgbe_info ixgbe_X550_info;
 extern const struct ixgbe_info ixgbe_X550EM_x_info;
+extern const struct ixgbe_info ixgbe_x550em_a_info;
 #ifdef CONFIG_IXGBE_DCB
 extern const struct dcbnl_rtnl_ops dcbnl_ops;
 #endif
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
index 4bb6b68..0151978 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
@@ -1633,6 +1633,7 @@ s32 ixgbe_fdir_set_input_mask_82599(struct ixgbe_hw *hw,
switch (hw->mac.type) {
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
IXGBE_WRITE_REG(hw, IXGBE_FDIRSCTPM, ~fdirtcpm);
break;
default:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 8c560da..11450bd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -2855,6 +2855,7 @@ u16 ixgbe_get_pcie_msix_count_generic(struct ixgbe_hw *hw)
case ixgbe_mac_X540:
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
pcie_offset = IXGBE_PCIE_MSIX_82599_CAPS;
max_msix_count = IXGBE_MAX_MSIX_VECTORS_82599;
break;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
index 02c7333..f8fb2ac 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
@@ -1,7 +1,7 @@
 
/***
 
   Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2014 Intel Corporation.
+  Copyright(c) 1999 - 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify it
   under the terms and conditions of the GNU General Public License,
@@ -293,6 +293,7 @@ s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw,
case ixgbe_mac_X540:
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
return ixgbe_dcb_hw_config_82599(hw, pfc_en, refill, max,
 bwgid, ptype, prio_tc);
default:
@@ -311,6 +312,7 @@ s32 ixgbe_dcb_hw_pfc_config(struct ixgbe_hw *hw, u8 pfc_en, 
u8 *prio_tc)
case ixgbe_mac_X540:
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
return ixgbe_dcb_config_pfc_82599(hw, pfc_en, prio_tc);
default:
break;
@@ -368,6 +370,7 @@ s32 ixgbe_dcb_hw_ets_config(struct ixgbe_hw *hw,
case ixgbe_mac_X540:
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
+   case ixgbe_mac_x550em_a:
ixgbe_dcb_config_rx_arbiter_82599(hw, refill, max,
  bwg_id, prio_type, prio_tc);
ixgbe_dcb_config_tx_desc

[net-next 13/18] ixgbe: Read and parse NW_MNG_IF_SEL register

From: Mark Rustad 

Read the IXGBE_NW_MNG_IF_SEL register and use it to set interface
attributes.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |  5 
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 37 +++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index a5c789e..6b68e8b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -3649,5 +3649,10 @@ struct ixgbe_info {
 #define IXGBE_SB_IOSF_TARGET_KX4_PCS1  3
 
 #define IXGBE_NW_MNG_IF_SEL0x00011178
+#define IXGBE_NW_MNG_IF_SEL_MDIO_ACT   BIT(1)
+#define IXGBE_NW_MNG_IF_SEL_ENABLE_10_100M BIT(23)
 #define IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE   BIT(24)
+#define IXGBE_NW_MNG_IF_SEL_MDIO_PHY_ADD_SHIFT 3
+#define IXGBE_NW_MNG_IF_SEL_MDIO_PHY_ADD   \
+   (0x1F << IXGBE_NW_MNG_IF_SEL_MDIO_PHY_ADD_SHIFT)
 #endif /* _IXGBE_TYPE_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index ef1dc3b..3563b86 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -2137,6 +2137,36 @@ static s32 ixgbe_enter_lplu_t_x550em(struct ixgbe_hw *hw)
return status;
 }
 
+/**
+ * ixgbe_read_mng_if_sel_x550em - Read NW_MNG_IF_SEL register
+ * @hw: pointer to hardware structure
+ *
+ * Read NW_MNG_IF_SEL register and save field values.
+ */
+static void ixgbe_read_mng_if_sel_x550em(struct ixgbe_hw *hw)
+{
+   /* Save NW management interface connected on board. This is used
+* to determine internal PHY mode.
+*/
+   hw->phy.nw_mng_if_sel = IXGBE_READ_REG(hw, IXGBE_NW_MNG_IF_SEL);
+
+   /* If X552 (X550EM_a) and MDIO is connected to external PHY, then set
+* PHY address. This register field was has only been used for X552.
+*/
+   if (!hw->phy.nw_mng_if_sel) {
+   if (hw->mac.type == ixgbe_mac_x550em_a) {
+   struct ixgbe_adapter *adapter = hw->back;
+
+   e_warn(drv, "nw_mng_if_sel not set\n");
+   }
+   return;
+   }
+
+   hw->phy.mdio.prtad = (hw->phy.nw_mng_if_sel &
+ IXGBE_NW_MNG_IF_SEL_MDIO_PHY_ADD) >>
+IXGBE_NW_MNG_IF_SEL_MDIO_PHY_ADD_SHIFT;
+}
+
 /** ixgbe_init_phy_ops_X550em - PHY/SFP specific init
  *  @hw: pointer to hardware structure
  *
@@ -2151,14 +2181,11 @@ static s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw 
*hw)
 
hw->mac.ops.set_lan_id(hw);
 
+   ixgbe_read_mng_if_sel_x550em(hw);
+
if (hw->mac.ops.get_media_type(hw) == ixgbe_media_type_fiber) {
phy->phy_semaphore_mask = IXGBE_GSSR_SHARED_I2C_SM;
ixgbe_setup_mux_ctl(hw);
-
-   /* Save NW management interface connected on board. This is used
-* to determine internal PHY mode.
-*/
-   phy->nw_mng_if_sel = IXGBE_READ_REG(hw, IXGBE_NW_MNG_IF_SEL);
}
 
/* Identify the PHY or SFP module */
-- 
2.5.5

[net-next 03/18] ixgbe: Correct length check for round up

From: Mark Rustad 

The function ixgbe_host_interface_command actually uses a multiple
of word sized buffer to do its business, but only checks against
the actual length passed in. This means that on read operations it
could be possible to modify locations beyond the length passed in.
Change the check to round up in the same way, just to avoid any
possible hazard.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index dfdb114..a2ca9ef 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -3557,7 +3557,7 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, u32 
*buffer,
if (buf_len == 0)
return 0;
 
-   if (length < (buf_len + hdr_size)) {
+   if (length < round_up(buf_len, 4) + hdr_size) {
hw_dbg(hw, "Buffer not large enough for reply message.\n");
return IXGBE_ERR_HOST_INTERFACE_COMMAND;
}
-- 
2.5.5

[net-next 01/18] ixgbe: Delete some unused register definitions

From: Mark Rustad 

I noticed the SRAMREL registers are not referenced for any device,
so delete the definitions.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index bc012ab..d02a0a3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -143,13 +143,6 @@
 #define IXGBE_GRC_X550EM_a 0x15F64
 #define IXGBE_GRC(_hw) IXGBE_BY_MAC((_hw), GRC)
 
-#define IXGBE_SRAMREL_8259X0x10210
-#define IXGBE_SRAMREL_X540 IXGBE_SRAMREL_8259X
-#define IXGBE_SRAMREL_X550 IXGBE_SRAMREL_8259X
-#define IXGBE_SRAMREL_X550EM_x IXGBE_SRAMREL_8259X
-#define IXGBE_SRAMREL_X550EM_a 0x15F6C
-#define IXGBE_SRAMREL(_hw) IXGBE_BY_MAC((_hw), SRAMREL)
-
 /* General Receive Control */
 #define IXGBE_GRC_MNG  0x0001 /* Manageability Enable */
 #define IXGBE_GRC_APME 0x0002 /* APM enabled in EEPROM */
@@ -2948,7 +2941,6 @@ union ixgbe_atr_hash_dword {
IXGBE_CAT(EEC, m),  \
IXGBE_CAT(FLA, m),  \
IXGBE_CAT(GRC, m),  \
-   IXGBE_CAT(SRAMREL, m),  \
IXGBE_CAT(FACTPS, m),   \
IXGBE_CAT(SWSM, m), \
IXGBE_CAT(SWFW_SYNC, m),\
-- 
2.5.5

[net-next 04/18] ixgbe: Clean up interface for firmware commands

From: Mark Rustad 

Clean up the interface for issuing firmware commands to use a
void * instead of a u32 *. This eliminates a number of casts.
Also clean up ixgbe_host_interface_command in a few other ways,
eliminating comparisons with 0, redundant parens and minor
formatting issues.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 39 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.h |  4 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c   | 13 -
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index a2ca9ef..b8cdff7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -3483,15 +3483,19 @@ static u8 ixgbe_calculate_checksum(u8 *buffer, u32 
length)
  *  Communicates with the manageability block.  On success return 0
  *  else return IXGBE_ERR_HOST_INTERFACE_COMMAND.
  **/
-s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, u32 *buffer,
+s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, void *buffer,
 u32 length, u32 timeout,
 bool return_data)
 {
-   u32 hicr, i, bi, fwsts;
u32 hdr_size = sizeof(struct ixgbe_hic_hdr);
+   u32 hicr, i, bi, fwsts;
u16 buf_len, dword_len;
+   union {
+   struct ixgbe_hic_hdr hdr;
+   u32 u32arr[1];
+   } *bp = buffer;
 
-   if (length == 0 || length > IXGBE_HI_MAX_BLOCK_BYTE_LENGTH) {
+   if (!length || length > IXGBE_HI_MAX_BLOCK_BYTE_LENGTH) {
hw_dbg(hw, "Buffer length failure buffersize-%d.\n", length);
return IXGBE_ERR_HOST_INTERFACE_COMMAND;
}
@@ -3502,26 +3506,25 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
u32 *buffer,
 
/* Check that the host interface is enabled. */
hicr = IXGBE_READ_REG(hw, IXGBE_HICR);
-   if ((hicr & IXGBE_HICR_EN) == 0) {
+   if (!(hicr & IXGBE_HICR_EN)) {
hw_dbg(hw, "IXGBE_HOST_EN bit disabled.\n");
return IXGBE_ERR_HOST_INTERFACE_COMMAND;
}
 
/* Calculate length in DWORDs. We must be DWORD aligned */
-   if ((length % (sizeof(u32))) != 0) {
+   if (length % sizeof(u32)) {
hw_dbg(hw, "Buffer length failure, not aligned to dword");
return IXGBE_ERR_INVALID_ARGUMENT;
}
 
dword_len = length >> 2;
 
-   /*
-* The device driver writes the relevant command block
+   /* The device driver writes the relevant command block
 * into the ram area.
 */
for (i = 0; i < dword_len; i++)
IXGBE_WRITE_REG_ARRAY(hw, IXGBE_FLEX_MNG,
- i, cpu_to_le32(buffer[i]));
+ i, cpu_to_le32(bp->u32arr[i]));
 
/* Setting this bit tells the ARC that a new command is pending. */
IXGBE_WRITE_REG(hw, IXGBE_HICR, hicr | IXGBE_HICR_C);
@@ -3534,8 +3537,8 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, u32 
*buffer,
}
 
/* Check command successful completion. */
-   if ((timeout != 0 && i == timeout) ||
-   (!(IXGBE_READ_REG(hw, IXGBE_HICR) & IXGBE_HICR_SV))) {
+   if ((timeout && i == timeout) ||
+   !(IXGBE_READ_REG(hw, IXGBE_HICR) & IXGBE_HICR_SV)) {
hw_dbg(hw, "Command has failed with no status valid.\n");
return IXGBE_ERR_HOST_INTERFACE_COMMAND;
}
@@ -3548,13 +3551,13 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
u32 *buffer,
 
/* first pull in the header so we know the buffer length */
for (bi = 0; bi < dword_len; bi++) {
-   buffer[bi] = IXGBE_READ_REG_ARRAY(hw, IXGBE_FLEX_MNG, bi);
-   le32_to_cpus(&buffer[bi]);
+   bp->u32arr[bi] = IXGBE_READ_REG_ARRAY(hw, IXGBE_FLEX_MNG, bi);
+   le32_to_cpus(&bp->u32arr[bi]);
}
 
/* If there is any thing in data position pull it in */
-   buf_len = ((struct ixgbe_hic_hdr *)buffer)->buf_len;
-   if (buf_len == 0)
+   buf_len = bp->hdr.buf_len;
+   if (!buf_len)
return 0;
 
if (length < round_up(buf_len, 4) + hdr_size) {
@@ -3565,10 +3568,10 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
u32 *buffer,
/* Calculate length in DWORDs, add 3 for odd lengths */
dword_len = (buf_len + 3) >> 2;
 
-   /* Pull in the rest of the buffer (bi is where we left off)*/
+   /* Pull in the rest of the buffer (bi is where we left off) */
for (; bi <= dword_len; bi++) {
-   buffer[bi] = IXGBE_READ_REG_ARRAY(hw, IXGBE_FLEX_MNG, bi);
-   le32_to_cpus(&buffer[bi]);
+   bp->u32arr[bi] = IXGBE_READ_REG_ARRAY(hw, IXGBE_FLEX

[net-next 14/18] ixgbe: Introduce function to control MDIO speed

From: Mark Rustad 

Move code that controls MDIO speed into a new function because
there will be more MACs that need the control.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 27 +--
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 3563b86..0d6cbb0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -2306,6 +2306,26 @@ static s32 ixgbe_init_ext_t_x550em(struct ixgbe_hw *hw)
return status;
 }
 
+/**
+ * ixgbe_set_mdio_speed - Set MDIO clock speed
+ * @hw: pointer to hardware structure
+ */
+static void ixgbe_set_mdio_speed(struct ixgbe_hw *hw)
+{
+   u32 hlreg0;
+
+   switch (hw->device_id) {
+   case IXGBE_DEV_ID_X550EM_X_10G_T:
+   /* Config MDIO clock speed before the first MDIO PHY access */
+   hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
+   hlreg0 &= ~IXGBE_HLREG0_MDCSPD;
+   IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
+   break;
+   default:
+   break;
+   }
+}
+
 /**  ixgbe_reset_hw_X550em - Perform hardware reset
  **  @hw: pointer to hardware structure
  **
@@ -2319,7 +2339,6 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw)
s32 status;
u32 ctrl = 0;
u32 i;
-   u32 hlreg0;
bool link_up = false;
 
/* Call adapter stop to disable Tx/Rx and clear interrupts */
@@ -2405,11 +2424,7 @@ mac_reset_top:
hw->mac.num_rar_entries = 128;
hw->mac.ops.init_rx_addrs(hw);
 
-   if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) {
-   hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
-   hlreg0 &= ~IXGBE_HLREG0_MDCSPD;
-   IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
-   }
+   ixgbe_set_mdio_speed(hw);
 
if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP)
ixgbe_setup_mux_ctl(hw);
-- 
2.5.5

[net-next 15/18] ixgbe: Add support for SFPs with retimer

From: Mark Rustad 

Add support for SFPs with an external retimer.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h  |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |   5 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 133 +-
 4 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index eb93319..93db4bf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -132,6 +132,7 @@ static const struct pci_device_id ixgbe_pci_tbl[] = {
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_10G_T), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_SFP), board_X550EM_x},
{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP_N), board_x550em_a },
+   {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP), board_x550em_a },
/* required last entry */
{0, }
 };
@@ -2681,6 +2682,7 @@ static inline void ixgbe_irq_enable(struct ixgbe_adapter 
*adapter, bool queues,
case ixgbe_mac_X550EM_x:
case ixgbe_mac_x550em_a:
if (adapter->hw.device_id == IXGBE_DEV_ID_X550EM_X_SFP ||
+   adapter->hw.device_id == IXGBE_DEV_ID_X550EM_A_SFP ||
adapter->hw.device_id == IXGBE_DEV_ID_X550EM_A_SFP_N)
mask |= IXGBE_EIMS_GPI_SDP0(&adapter->hw);
if (adapter->hw.phy.type == ixgbe_phy_x550em_ext_t)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
index 5abd66c..cdf4c38 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
@@ -1,7 +1,7 @@
 
/***
 
   Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2014 Intel Corporation.
+  Copyright(c) 1999 - 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify it
   under the terms and conditions of the GNU General Public License,
@@ -81,7 +81,11 @@
 #define IXGBE_I2C_EEPROM_STATUS_FAIL   0x2
 #define IXGBE_I2C_EEPROM_STATUS_IN_PROGRESS0x3
 #define IXGBE_CS4227   0xBE/* CS4227 address */
+#define IXGBE_CS4227_GLOBAL_ID_LSB 0
+#define IXGBE_CS4227_GLOBAL_ID_MSB 1
 #define IXGBE_CS4227_SCRATCH   2
+#define IXGBE_CS4223_PHY_ID0x7003  /* Quad port */
+#define IXGBE_CS4227_PHY_ID0x3003  /* Dual port */
 #define IXGBE_CS4227_RESET_PENDING 0x1357
 #define IXGBE_CS4227_RESET_COMPLETE0x5AA5
 #define IXGBE_CS4227_RETRIES   15
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 6b68e8b..fbbc132 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -1311,6 +1311,7 @@ struct ixgbe_thermal_sensor_data {
 
 /* MDIO definitions */
 
+#define IXGBE_MDIO_ZERO_DEV_TYPE   0x0
 #define IXGBE_MDIO_PMA_PMD_DEV_TYPE0x1
 #define IXGBE_MDIO_PCS_DEV_TYPE0x3
 #define IXGBE_MDIO_PHY_XS_DEV_TYPE 0x4
@@ -3580,6 +3581,7 @@ struct ixgbe_info {
 #define IXGBE_KRM_PORT_CAR_GEN_CTRL(P) ((P) ? 0x8010 : 0x4010)
 #define IXGBE_KRM_LINK_CTRL_1(P)   ((P) ? 0x820C : 0x420C)
 #define IXGBE_KRM_AN_CNTL_1(P) ((P) ? 0x822C : 0x422C)
+#define IXGBE_KRM_AN_CNTL_8(P) ((P) ? 0x8248 : 0x4248)
 #define IXGBE_KRM_DSP_TXFFE_STATE_4(P) ((P) ? 0x8634 : 0x4634)
 #define IXGBE_KRM_DSP_TXFFE_STATE_5(P) ((P) ? 0x8638 : 0x4638)
 #define IXGBE_KRM_RX_TRN_LINKUP_CTRL(P)((P) ? 0x8B00 : 0x4B00)
@@ -3605,6 +3607,9 @@ struct ixgbe_info {
 #define IXGBE_KRM_AN_CNTL_1_SYM_PAUSE  (1 << 28)
 #define IXGBE_KRM_AN_CNTL_1_ASM_PAUSE  (1 << 29)
 
+#define IXGBE_KRM_AN_CNTL_8_LINEAR BIT(0)
+#define IXGBE_KRM_AN_CNTL_8_LIMITING   BIT(1)
+
 #define IXGBE_KRM_DSP_TXFFE_STATE_C0_EN(1 << 6)
 #define IXGBE_KRM_DSP_TXFFE_STATE_CP1_CN1_EN   (1 << 15)
 #define IXGBE_KRM_DSP_TXFFE_STATE_CO_ADAPT_EN  (1 << 16)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 0d6cbb0..a9d86b3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -273,6 +273,12 @@ out:
 static s32 ixgbe_identify_phy_x550em(struct ixgbe_hw *hw)
 {
switch (hw->device_id) {
+   case IXGBE_DEV_ID_X550EM_A_SFP:
+   if (hw->bus.lan_id)
+   hw->phy.phy_semaphore_mask = IXGBE_GSSR_PHY1_SM;
+   else

[net-next 18/18] ixgbe: Bump version number

From: Mark Rustad 

Update ixgbe version number.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1a7bfcf..2976df7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -70,7 +70,7 @@ char ixgbe_default_device_descr[] =
 static char ixgbe_default_device_descr[] =
  "Intel(R) 10 Gigabit Network Connection";
 #endif
-#define DRV_VERSION "4.2.1-k"
+#define DRV_VERSION "4.4.0-k"
 const char ixgbe_driver_version[] = DRV_VERSION;
 static const char ixgbe_copyright[] =
"Copyright (c) 1999-2016 Intel Corporation.";
-- 
2.5.5

[net-next 05/18] ixgbe: Take manageability semaphore for firmware commands

From: Mark Rustad 

We need to take the manageability semaphore when issuing firmware
commands to avoid problems. With this in place, the semaphore is
no longer taken in the ixgbe_set_fw_drv_ver_generic function, since
it will now always be taken by the ixgbe_host_interface_command
function.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 30 -
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index b8cdff7..ee43a38 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -3494,11 +3494,16 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
void *buffer,
struct ixgbe_hic_hdr hdr;
u32 u32arr[1];
} *bp = buffer;
+   s32 status;
 
if (!length || length > IXGBE_HI_MAX_BLOCK_BYTE_LENGTH) {
hw_dbg(hw, "Buffer length failure buffersize-%d.\n", length);
return IXGBE_ERR_HOST_INTERFACE_COMMAND;
}
+   /* Take management host interface semaphore */
+   status = hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_SW_MNG_SM);
+   if (status)
+   return status;
 
/* Set bit 9 of FWSTS clearing FW reset indication */
fwsts = IXGBE_READ_REG(hw, IXGBE_FWSTS);
@@ -3508,13 +3513,15 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
void *buffer,
hicr = IXGBE_READ_REG(hw, IXGBE_HICR);
if (!(hicr & IXGBE_HICR_EN)) {
hw_dbg(hw, "IXGBE_HOST_EN bit disabled.\n");
-   return IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   status = IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   goto rel_out;
}
 
/* Calculate length in DWORDs. We must be DWORD aligned */
if (length % sizeof(u32)) {
hw_dbg(hw, "Buffer length failure, not aligned to dword");
-   return IXGBE_ERR_INVALID_ARGUMENT;
+   status = IXGBE_ERR_INVALID_ARGUMENT;
+   goto rel_out;
}
 
dword_len = length >> 2;
@@ -3540,11 +3547,12 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
void *buffer,
if ((timeout && i == timeout) ||
!(IXGBE_READ_REG(hw, IXGBE_HICR) & IXGBE_HICR_SV)) {
hw_dbg(hw, "Command has failed with no status valid.\n");
-   return IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   status = IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   goto rel_out;
}
 
if (!return_data)
-   return 0;
+   goto rel_out;
 
/* Calculate length in DWORDs */
dword_len = hdr_size >> 2;
@@ -3558,11 +3566,12 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
void *buffer,
/* If there is any thing in data position pull it in */
buf_len = bp->hdr.buf_len;
if (!buf_len)
-   return 0;
+   goto rel_out;
 
if (length < round_up(buf_len, 4) + hdr_size) {
hw_dbg(hw, "Buffer not large enough for reply message.\n");
-   return IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   status = IXGBE_ERR_HOST_INTERFACE_COMMAND;
+   goto rel_out;
}
 
/* Calculate length in DWORDs, add 3 for odd lengths */
@@ -3574,7 +3583,10 @@ s32 ixgbe_host_interface_command(struct ixgbe_hw *hw, 
void *buffer,
le32_to_cpus(&bp->u32arr[bi]);
}
 
-   return 0;
+rel_out:
+   hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_SW_MNG_SM);
+
+   return status;
 }
 
 /**
@@ -3597,9 +3609,6 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 
maj, u8 min,
int i;
s32 ret_val;
 
-   if (hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_SW_MNG_SM))
-   return IXGBE_ERR_SWFW_SYNC;
-
fw_cmd.hdr.cmd = FW_CEM_CMD_DRIVER_INFO;
fw_cmd.hdr.buf_len = FW_CEM_CMD_DRIVER_INFO_LEN;
fw_cmd.hdr.cmd_or_resp.cmd_resv = FW_CEM_CMD_RESERVED;
@@ -3631,7 +3640,6 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 
maj, u8 min,
break;
}
 
-   hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_SW_MNG_SM);
return ret_val;
 }
 
-- 
2.5.5

[net-next 06/18] ixgbe/ixgbevf: Add support for bulk free in Tx cleanup & cleanup boolean logic

From: Alexander Duyck 

This patch enables bulk free in Tx cleanup for ixgbevf and cleans up the
boolean logic in the polling routines for ixgbe and ixgbevf in the hopes of
avoiding any mix-ups similar to what occurred with i40e and i40evf.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 +++---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 14 +-
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 19bf386..d5509cc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -,6 +,7 @@ static int ixgbe_tx_maxrate(struct net_device *netdev,
  * ixgbe_clean_tx_irq - Reclaim resources after transmit completes
  * @q_vector: structure containing interrupt and ring information
  * @tx_ring: tx ring to clean
+ * @napi_budget: Used to determine if we are in netpoll
  **/
 static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
   struct ixgbe_ring *tx_ring, int napi_budget)
@@ -2807,8 +2808,10 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
ixgbe_update_dca(q_vector);
 #endif
 
-   ixgbe_for_each_ring(ring, q_vector->tx)
-   clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring, budget);
+   ixgbe_for_each_ring(ring, q_vector->tx) {
+   if (!ixgbe_clean_tx_irq(q_vector, ring, budget))
+   clean_complete = false;
+   }
 
/* Exit if we are called by netpoll or busy polling is active */
if ((budget <= 0) || !ixgbe_qv_lock_napi(q_vector))
@@ -2826,7 +2829,8 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
 per_ring_budget);
 
work_done += cleaned;
-   clean_complete &= (cleaned < per_ring_budget);
+   if (cleaned >= per_ring_budget)
+   clean_complete = false;
}
 
ixgbe_qv_unlock_napi(q_vector);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 50b6bff..007cbe0 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -288,9 +288,10 @@ static void ixgbevf_tx_timeout(struct net_device *netdev)
  * ixgbevf_clean_tx_irq - Reclaim resources after transmit completes
  * @q_vector: board private structure
  * @tx_ring: tx ring to clean
+ * @napi_budget: Used to determine if we are in netpoll
  **/
 static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
-struct ixgbevf_ring *tx_ring)
+struct ixgbevf_ring *tx_ring, int napi_budget)
 {
struct ixgbevf_adapter *adapter = q_vector->adapter;
struct ixgbevf_tx_buffer *tx_buffer;
@@ -328,7 +329,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
total_packets += tx_buffer->gso_segs;
 
/* free the skb */
-   dev_kfree_skb_any(tx_buffer->skb);
+   napi_consume_skb(tx_buffer->skb, napi_budget);
 
/* unmap skb header data */
dma_unmap_single(tx_ring->dev,
@@ -1013,8 +1014,10 @@ static int ixgbevf_poll(struct napi_struct *napi, int 
budget)
int per_ring_budget, work_done = 0;
bool clean_complete = true;
 
-   ixgbevf_for_each_ring(ring, q_vector->tx)
-   clean_complete &= ixgbevf_clean_tx_irq(q_vector, ring);
+   ixgbevf_for_each_ring(ring, q_vector->tx) {
+   if (!ixgbevf_clean_tx_irq(q_vector, ring, budget))
+   clean_complete = false;
+   }
 
if (budget <= 0)
return budget;
@@ -1035,7 +1038,8 @@ static int ixgbevf_poll(struct napi_struct *napi, int 
budget)
int cleaned = ixgbevf_clean_rx_irq(q_vector, ring,
   per_ring_budget);
work_done += cleaned;
-   clean_complete &= (cleaned < per_ring_budget);
+   if (cleaned >= per_ring_budget)
+   clean_complete = false;
}
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
-- 
2.5.5

[net-next 00/18][pull request] 10GbE Intel Wired LAN Driver Updates 2016-04-07

This series contains updates to ixgbe and ixgbevf.

This entire series (except for one patch from Alex) comes from Mark and
is mainly to add support for our new MAC (x550em_a).

So let's get Alex's patch out of the way first before we cover Mark's
many changes.  Alex does his enable bulk free in transmit cleanup for
ixgbe and ixgbevf, like his has done for all of our other drivers.

First Mark cleans up registers that were not being used, so do some
house cleaning.  Then to avoid casting lan_id and func fields, just
make them u8 since they only hold small values anyways.  Found and
fixed an issue where on read operations it could be possible to
modify locations beyond the length passed in, so change the check
to round up in the same way.  Cleaned up the interface for issuing
firmware commands to use a void * instead of a u32 * which eliminates
a number of casts.  Added support for the new MAC and provided method
pointers and use them to access IOSF-attached devices, since the
new MAC will also need a new access method.  Added support for SFPs
with an external retimer and for an SGMII backplane interface.

The following are changes since commit 889750bd2e08a94d52a116056d462b3a8e5616a7:
  Merge branch 'tipc-next'
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE

Alexander Duyck (1):
  ixgbe/ixgbevf: Add support for bulk free in Tx cleanup & cleanup
boolean logic

Mark Rustad (17):
  ixgbe: Delete some unused register definitions
  ixgbe: Change the lan_id and func fields to a u8 to avoid casts
  ixgbe: Correct length check for round up
  ixgbe: Clean up interface for firmware commands
  ixgbe: Take manageability semaphore for firmware commands
  ixgbe: Add support for single-port X550 device
  ixgbe: Add definitions for x550em_a 10G MAC
  ixgbe: Use method pointer to access IOSF devices
  ixgbe: Add support for x550em_a 10G MAC type
  ixgbe: Use new methods for PHY access
  ixgbe: Read and set instance id
  ixgbe: Read and parse NW_MNG_IF_SEL register
  ixgbe: Introduce function to control MDIO speed
  ixgbe: Add support for SFPs with retimer
  ixgbe: Add support for SGMII backplane interface
  ixgbe: Add KR backplane support for x550em_a
  ixgbe: Bump version number

 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |   3 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c|   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c   |  83 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.h   |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c  |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |   9 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c  |   3 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  63 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c  |   2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h  |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  |   6 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 101 +++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 591 --
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  14 +-
 14 files changed, 786 insertions(+), 106 deletions(-)

-- 
2.5.5

[net-next 11/18] ixgbe: Use new methods for PHY access

From: Mark Rustad 

Now x550em_a devices will use a new method for PHY access that will
get the firmware token for each access.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 67 +--
 1 file changed, 64 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index ba161b5..ef1dc3b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -2548,6 +2548,57 @@ static void ixgbe_release_swfw_sync_x550em_a(struct 
ixgbe_hw *hw, u32 mask)
ixgbe_release_swfw_sync_X540(hw, hmask);
 }
 
+/**
+ * ixgbe_read_phy_reg_x550a - Reads specified PHY register
+ * @hw: pointer to hardware structure
+ * @reg_addr: 32 bit address of PHY register to read
+ * @phy_data: Pointer to read data from PHY register
+ *
+ * Reads a value from a specified PHY register using the SWFW lock and PHY
+ * Token. The PHY Token is needed since the MDIO is shared between to MAC
+ * instances.
+ */
+static s32 ixgbe_read_phy_reg_x550a(struct ixgbe_hw *hw, u32 reg_addr,
+   u32 device_type, u16 *phy_data)
+{
+   u32 mask = hw->phy.phy_semaphore_mask | IXGBE_GSSR_TOKEN_SM;
+   s32 status;
+
+   if (hw->mac.ops.acquire_swfw_sync(hw, mask))
+   return IXGBE_ERR_SWFW_SYNC;
+
+   status = hw->phy.ops.read_reg_mdi(hw, reg_addr, device_type, phy_data);
+
+   hw->mac.ops.release_swfw_sync(hw, mask);
+
+   return status;
+}
+
+/**
+ * ixgbe_write_phy_reg_x550a - Writes specified PHY register
+ * @hw: pointer to hardware structure
+ * @reg_addr: 32 bit PHY register to write
+ * @device_type: 5 bit device type
+ * @phy_data: Data to write to the PHY register
+ *
+ * Writes a value to specified PHY register using the SWFW lock and PHY Token.
+ * The PHY Token is needed since the MDIO is shared between to MAC instances.
+ */
+static s32 ixgbe_write_phy_reg_x550a(struct ixgbe_hw *hw, u32 reg_addr,
+u32 device_type, u16 phy_data)
+{
+   u32 mask = hw->phy.phy_semaphore_mask | IXGBE_GSSR_TOKEN_SM;
+   s32 status;
+
+   if (hw->mac.ops.acquire_swfw_sync(hw, mask))
+   return IXGBE_ERR_SWFW_SYNC;
+
+   status = ixgbe_write_phy_reg_mdi(hw, reg_addr, device_type, phy_data);
+   hw->mac.ops.release_swfw_sync(hw, mask);
+
+   return status;
+}
+
 #define X550_COMMON_MAC \
.init_hw= &ixgbe_init_hw_generic, \
.start_hw   = &ixgbe_start_hw_X540, \
@@ -2673,8 +2724,6 @@ static const struct ixgbe_eeprom_operations 
eeprom_ops_X550EM_x = {
.read_i2c_sff8472   = &ixgbe_read_i2c_sff8472_generic, \
.read_i2c_eeprom= &ixgbe_read_i2c_eeprom_generic, \
.write_i2c_eeprom   = &ixgbe_write_i2c_eeprom_generic, \
-   .read_reg   = &ixgbe_read_phy_reg_generic, \
-   .write_reg  = &ixgbe_write_phy_reg_generic, \
.setup_link = &ixgbe_setup_phy_link_generic, \
.set_phy_power  = NULL, \
.check_overtemp = &ixgbe_tn_check_overtemp, \
@@ -2684,12 +2733,16 @@ static const struct ixgbe_phy_operations phy_ops_X550 = 
{
X550_COMMON_PHY
.init   = NULL,
.identify   = &ixgbe_identify_phy_generic,
+   .read_reg   = &ixgbe_read_phy_reg_generic,
+   .write_reg  = &ixgbe_write_phy_reg_generic,
 };
 
 static const struct ixgbe_phy_operations phy_ops_X550EM_x = {
X550_COMMON_PHY
.init   = &ixgbe_init_phy_ops_X550em,
.identify   = &ixgbe_identify_phy_x550em,
+   .read_reg   = &ixgbe_read_phy_reg_generic,
+   .write_reg  = &ixgbe_write_phy_reg_generic,
.read_i2c_combined  = &ixgbe_read_i2c_combined_generic,
.write_i2c_combined = &ixgbe_write_i2c_combined_generic,
.read_i2c_combined_unlocked = &ixgbe_read_i2c_combined_generic_unlocked,
@@ -2697,6 +2750,14 @@ static const struct ixgbe_phy_operations 
phy_ops_X550EM_x = {
 &ixgbe_write_i2c_combined_generic_unlocked,
 };
 
+static const struct ixgbe_phy_operations phy_ops_x550em_a = {
+   X550_COMMON_PHY
+   .init   = &ixgbe_init_phy_ops_X550em,
+   .identify   = &ixgbe_identify_phy_x550em,
+   .read_reg   = &ixgbe_read_phy_reg_x550a,
+   .write_reg  = &ixgbe_write_phy_reg_x550a,
+};
+
 static const u32 ixgbe_mvals_X550[IXGBE_MVALS_IDX_LIMIT] = {
IXGBE_MVALS_INIT(X550)
 };
@@ -2734,7 +2795,7 @@ const struct ixgbe_info ixgbe_x550em_a_info = {
.get_invariants = &ixgbe_get_invariants_X550_x,
.mac_ops= &mac_ops_x550em_a,
.eeprom_

[net-next 02/18] ixgbe: Change the lan_id and func fields to a u8 to avoid casts

From: Mark Rustad 

Since the lan_id and func fields only ever hold small values, make
them u8 to avoid casts used to silence warnings.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h   | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 8c7e78b..dfdb114 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -3600,7 +3600,7 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 
maj, u8 min,
fw_cmd.hdr.cmd = FW_CEM_CMD_DRIVER_INFO;
fw_cmd.hdr.buf_len = FW_CEM_CMD_DRIVER_INFO_LEN;
fw_cmd.hdr.cmd_or_resp.cmd_resv = FW_CEM_CMD_RESERVED;
-   fw_cmd.port_num = (u8)hw->bus.func;
+   fw_cmd.port_num = hw->bus.func;
fw_cmd.ver_maj = maj;
fw_cmd.ver_min = min;
fw_cmd.ver_build = build;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index d02a0a3..7ae4bbd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -3122,8 +3122,8 @@ struct ixgbe_bus_info {
enum ixgbe_bus_width width;
enum ixgbe_bus_type type;
 
-   u16 func;
-   u16 lan_id;
+   u8 func;
+   u8 lan_id;
 };
 
 /* Flow control parameters */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 9d3f765..5affac1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -862,7 +862,7 @@ static void ixgbe_disable_rx_x550(struct ixgbe_hw *hw)
fw_cmd.hdr.cmd = FW_DISABLE_RXEN_CMD;
fw_cmd.hdr.buf_len = FW_DISABLE_RXEN_LEN;
fw_cmd.hdr.checksum = FW_DEFAULT_CHECKSUM;
-   fw_cmd.port_number = (u8)hw->bus.lan_id;
+   fw_cmd.port_number = hw->bus.lan_id;
 
status = ixgbe_host_interface_command(hw, (u32 *)&fw_cmd,
sizeof(struct ixgbe_hic_disable_rxen),
-- 
2.5.5

[net-next 09/18] ixgbe: Use method pointer to access IOSF devices

From: Mark Rustad 

Provide method pointers and use them to access IOSF-attached
devices. A new MAC will introduce a new access method.

Signed-off-by: Mark Rustad 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |  2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 32 +++
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index b505da3..fef2264 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -3332,6 +3332,8 @@ struct ixgbe_mac_operations {
s32 (*dmac_config)(struct ixgbe_hw *hw);
s32 (*dmac_update_tcs)(struct ixgbe_hw *hw);
s32 (*dmac_config_tcs)(struct ixgbe_hw *hw);
+   s32 (*read_iosf_sb_reg)(struct ixgbe_hw *, u32, u32, u32 *);
+   s32 (*write_iosf_sb_reg)(struct ixgbe_hw *, u32, u32, u32);
 };
 
 struct ixgbe_phy_operations {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 65832fa..878ea1e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -1615,7 +1615,7 @@ static s32 ixgbe_setup_kr_speed_x550em(struct ixgbe_hw 
*hw,
s32 status;
u32 reg_val;
 
-   status = ixgbe_read_iosf_sb_reg_x550(hw,
+   status = hw->mac.ops.read_iosf_sb_reg(hw,
IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id),
IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val);
if (status)
@@ -1637,7 +1637,7 @@ static s32 ixgbe_setup_kr_speed_x550em(struct ixgbe_hw 
*hw,
 
/* Restart auto-negotiation. */
reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_RESTART;
-   status = ixgbe_write_iosf_sb_reg_x550(hw,
+   status = hw->mac.ops.write_iosf_sb_reg(hw,
IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id),
IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val);
 
@@ -1654,9 +1654,9 @@ static s32 ixgbe_setup_kx4_x550em(struct ixgbe_hw *hw)
s32 status;
u32 reg_val;
 
-   status = ixgbe_read_iosf_sb_reg_x550(hw, IXGBE_KX4_LINK_CNTL_1,
-IXGBE_SB_IOSF_TARGET_KX4_PCS0 +
-hw->bus.lan_id, ®_val);
+   status = hw->mac.ops.read_iosf_sb_reg(hw, IXGBE_KX4_LINK_CNTL_1,
+ IXGBE_SB_IOSF_TARGET_KX4_PCS0 +
+ hw->bus.lan_id, ®_val);
if (status)
return status;
 
@@ -1675,9 +1675,9 @@ static s32 ixgbe_setup_kx4_x550em(struct ixgbe_hw *hw)
 
/* Restart auto-negotiation. */
reg_val |= IXGBE_KX4_LINK_CNTL_1_TETH_AN_RESTART;
-   status = ixgbe_write_iosf_sb_reg_x550(hw, IXGBE_KX4_LINK_CNTL_1,
- IXGBE_SB_IOSF_TARGET_KX4_PCS0 +
- hw->bus.lan_id, reg_val);
+   status = hw->mac.ops.write_iosf_sb_reg(hw, IXGBE_KX4_LINK_CNTL_1,
+  IXGBE_SB_IOSF_TARGET_KX4_PCS0 +
+  hw->bus.lan_id, reg_val);
 
return status;
 }
@@ -1897,9 +1897,10 @@ static s32 ixgbe_setup_fc_x550em(struct ixgbe_hw *hw)
if (hw->device_id != IXGBE_DEV_ID_X550EM_X_KR)
return 0;
 
-   rc = ixgbe_read_iosf_sb_reg_x550(hw,
-IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id),
-IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val);
+   rc = hw->mac.ops.read_iosf_sb_reg(hw,
+ IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id),
+ IXGBE_SB_IOSF_TARGET_KR_PHY,
+ ®_val);
if (rc)
return rc;
 
@@ -1909,9 +1910,10 @@ static s32 ixgbe_setup_fc_x550em(struct ixgbe_hw *hw)
reg_val |= IXGBE_KRM_AN_CNTL_1_SYM_PAUSE;
if (asm_dir)
reg_val |= IXGBE_KRM_AN_CNTL_1_ASM_PAUSE;
-   rc = ixgbe_write_iosf_sb_reg_x550(hw,
- IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id),
- IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val);
+   rc = hw->mac.ops.write_iosf_sb_reg(hw,
+  IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id),
+  IXGBE_SB_IOSF_TARGET_KR_PHY,
+  reg_val);
 
/* This device does not fully support AN. */
hw->fc.disable_fc_autoneg = true;
@@ -2449,6 +2451,8 @@ static const struct ixgbe_mac_operations mac_ops_X550EM_x 
= {
.release_swfw_sync  = &ixgbe_release_swfw_sync_X550em,
.init_swfw_sync = &ixg

Re: [PATCH 0/3] crypto: af_alg - add TLS type encryption

2016-04-07 Thread Tom Herbert

On Thu, Apr 7, 2016 at 11:52 PM, Herbert Xu  wrote:
> On Wed, Apr 06, 2016 at 10:56:12AM -0700, Tadeusz Struk wrote:
>>
>> The intend is to enable HW acceleration of the TLS protocol.
>> The way it will work is that the user space will send a packet of data
>> via AF_ALG and HW will authenticate and encrypt it in one go.
>
> There have been suggestions to implement TLS data-path within
> the kernel.  So we should decide whether we pursue that or go
> with your approach before we start adding algorithms.
>
Yes, please see Dave Watson's patches on this.

Tom

> Cheers,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH 0/3] crypto: af_alg - add TLS type encryption

2016-04-07 Thread Herbert Xu

On Wed, Apr 06, 2016 at 10:56:12AM -0700, Tadeusz Struk wrote:
> 
> The intend is to enable HW acceleration of the TLS protocol.
> The way it will work is that the user space will send a packet of data
> via AF_ALG and HW will authenticate and encrypt it in one go.

There have been suggestions to implement TLS data-path within
the kernel.  So we should decide whether we pursue that or go
with your approach before we start adding algorithms.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: mpls's find_outdev()

2016-04-07 Thread roopa

On 4/7/16, 6:41 PM, David Miller wrote:
> While auditing something unrelated, I noticed that this function seems
> to potentially dev_put() on an error pointer.
>
> I guess the problem comes from the fact that there are two methods
> by which the device pointer is obtained.
>
> First, inet{,6}_fib_lookup_dev() which uses error pointers.
>
> Second, dev_get_by_index() which returns a valid device or NULL,
> and therefore does not use error pointers.
>
> If inet{,6}_fib_lookup_dev() returns an error pointer, the !dev check
> will not pass and dev_put() will operate on an error pointer and
> crash.
>
> If you agree with my analysis, could you please cook up and test a
> fix?
>
> Thank you!
certainly!. I see it. Thanks for catching it.
I will test and post a fix in a few hours.

Re: [PATCH net] ipv6: Count in extension headers in skb->network_header

From: Jakub Sitnicki 
Date: Tue,  5 Apr 2016 18:41:08 +0200

> When sending a UDPv6 message longer than MTU, account for the length
> of fragmentable IPv6 extension headers in skb->network_header offset.
> Same as we do in alloc_new_skb path in __ip6_append_data().
> 
> This ensures that later on __ip6_make_skb() will make space in
> headroom for fragmentable extension headers:
> 
>   /* move skb->data to ip header from ext header */
>   if (skb->data < skb_network_header(skb))
>   __skb_pull(skb, skb_network_offset(skb));
> 
> Prevents a splat due to skb_under_panic:
 ...
> Reported-by: Ji Jianwen 
> Signed-off-by: Jakub Sitnicki 
> Acked-by: Hannes Frederic Sowa 

Applied and queued up for -stable, thanks.

RE: [PATCH v8 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-04-07 Thread Dexuan Cui

> From: Joe Perches [mailto:j...@perches.com]
> Sent: Friday, April 8, 2016 9:15
> On Thu, 2016-04-07 at 18:36 -0700, Dexuan Cui wrote:
> > diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h
> []
> > +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE)
> > +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE)
> > +
> > +#define HVSOCK_RCV_BUF_SZ
>   VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV
> > +#define HVSOCK_SND_BUF_SZ  PAGE_SIZE
> []
> > +struct hvsock_sock {
> []
> > +   struct {
> > +   struct vmpipe_proto_header hdr;
> > +   char buf[HVSOCK_SND_BUF_SZ];
> > +   } __packed send;
> > +
> > +   struct {
> > +   struct vmpipe_proto_header hdr;
> > +   char buf[HVSOCK_RCV_BUF_SZ];
> > +   unsigned int data_len;
> > +   unsigned int data_offset;
> > +   } __packed recv;
> > +};
> 
> These bufs are not page aligned and so can span pages.
> 
> Is there any value in allocating these bufs separately
> as pages instead of as a kmalloc?

The bufs are not required to be page aligned.
Here the 'hdr' and the 'buf' must be consecutive, i.e., the 'buf' must be
an array rather than a pointer: please see hvsock_send_data().

It looks to me there is no big value to make sure the 'buf' is page
aligned: on x86_64, at least it should already be 8-byte aligned due to the
adjacent channel pointer, so memcpy_from_msg() should work
enough good and in hvsock_send_data() -> vmbus_sendpacket(),
we don't copy the 'buf'.

Thanks,
-- Dexuan

Re: [PATCH V2 1/2] net: socket: return EADDRNOTAVAIL when source address becomes nonlocal

From: Liping Zhang 
Date: Tue,  5 Apr 2016 22:19:45 +0800

> From: Liping Zhang 
> 
> A socket can use bind(directly) or connect(indirectly) to bind to a local
> ip address, and later if the network becomes down, that cause the source
> address becomes nonlocal, then send() call will fail and return EINVAL.
> But this error code is confusing, acctually we did not pass any invalid
> arguments. Furthermore, send() maybe return ok at first, it now returns
> fail just because of a temporary network problem, i.e. when the network
> recovery, send() call will become ok. Return EADDRNOTAVAIL instead of
> EINVAL in such situation is better.
> 
> Take the ping utility for example, we can use -I option to specify the
> source address (e.g ping -I 10.19.145.26 117.135.169.41), when network
> becomes down, error message will be printed out as follows:
> 64 bytes from  (117.135.169.41): icmp_seq=9 ttl=54 time=46.7 ms
> ping: sendmsg: Invalid argument
> ping: sendmsg: Invalid argument
> 
> Apply this patch, error message will become:
> 64 bytes from  (117.135.169.41): icmp_seq=5 ttl=54 time=47.5 ms
> ping: sendmsg: Cannot assign requested address
> ping: sendmsg: Cannot assign requested address
> 
> Signed-off-by: Liping Zhang 

Changing the behavior of such a core, and fundamental function, with
hundreds of direct and indirect call sites is extremely dangerous.

I was worried that someone might depend upon the -EINVAL return value,
and indeed I quickly found that the code in net/netfilter/ipvs/ip_vs_xmit.c
does want to see EINVAL in this case.

I refuse to audit the entire call chain of all users of this helper
function, as I said there are hundreds.  But there are potentially
many other call sites, either direct or indirect, that have this
problem as well.

And furthermore, USERSPACE itself might depend upon the error code
being -EINVAL since we've been returning it for several decades.

To be quite honest I think the value of this change is very low and
with the risk factors involved here I really would rather not make
this change at all.

Sorry.

mpls's find_outdev()


While auditing something unrelated, I noticed that this function seems
to potentially dev_put() on an error pointer.

I guess the problem comes from the fact that there are two methods
by which the device pointer is obtained.

First, inet{,6}_fib_lookup_dev() which uses error pointers.

Second, dev_get_by_index() which returns a valid device or NULL,
and therefore does not use error pointers.

If inet{,6}_fib_lookup_dev() returns an error pointer, the !dev check
will not pass and dev_put() will operate on an error pointer and
crash.

If you agree with my analysis, could you please cook up and test a
fix?

Thank you!

Re: [PATCH 2/2] net-ath9k_htc: Replace a variable initialisation by an assignment in ath9k_htc_set_channel()

2016-04-07 Thread Julian Calaby

Hi Kalle,

On Sat, Jan 2, 2016 at 5:25 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Fri, 1 Jan 2016 19:09:32 +0100
>
> Replace an explicit initialisation for one local variable at the beginning
> by a conditional assignment.
>
> Signed-off-by: Markus Elfring 

This looks sane to me.

Reviewed-by: Julian Calaby 

Thanks,

Julian Calaby

> ---
>  drivers/net/wireless/ath/ath9k/htc_drv_main.c | 7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c 
> b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
> index a680a97..30bd59e 100644
> --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
> +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
> @@ -246,7 +246,7 @@ static int ath9k_htc_set_channel(struct ath9k_htc_priv 
> *priv,
> struct ieee80211_conf *conf = &common->hw->conf;
> bool fastcc;
> struct ieee80211_channel *channel = hw->conf.chandef.chan;
> -   struct ath9k_hw_cal_data *caldata = NULL;
> +   struct ath9k_hw_cal_data *caldata;
> enum htc_phymode mode;
> __be16 htc_mode;
> u8 cmd_rsp;
> @@ -274,10 +274,7 @@ static int ath9k_htc_set_channel(struct ath9k_htc_priv 
> *priv,
> priv->ah->curchan->channel,
> channel->center_freq, conf_is_ht(conf), conf_is_ht40(conf),
> fastcc);
> -
> -   if (!fastcc)
> -   caldata = &priv->caldata;
> -
> +   caldata = fastcc ? NULL : &priv->caldata;
> ret = ath9k_hw_reset(ah, hchan, caldata, fastcc);
> if (ret) {
> ath_err(common,
> --
> 2.6.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Julian Calaby

Email: julian.cal...@gmail.com
Profile: http://www.google.com/profiles/julian.calaby/

Re: [PATCH v8 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-04-07 Thread Joe Perches

On Thu, 2016-04-07 at 18:36 -0700, Dexuan Cui wrote:
> Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
> mechanism between the host and the guest. It's somewhat like TCP over
> VMBus, but the transportation layer (VMBus) is much simpler than IP.
[]
> diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h
[]
> +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE)
> +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE)
> +
> +#define HVSOCK_RCV_BUF_SZVMBUS_RINGBUFFER_SIZE_HVSOCK_RECV
> +#define HVSOCK_SND_BUF_SZPAGE_SIZE
[]
> +struct hvsock_sock {
[]
> + struct {
> + struct vmpipe_proto_header hdr;
> + char buf[HVSOCK_SND_BUF_SZ];
> + } __packed send;
> +
> + struct {
> + struct vmpipe_proto_header hdr;
> + char buf[HVSOCK_RCV_BUF_SZ];
> + unsigned int data_len;
> + unsigned int data_offset;
> + } __packed recv;
> +};

These bufs are not page aligned and so can span pages.

Is there any value in allocating these bufs separately
as pages instead of as a kmalloc?

[PATCH net-next] macvlan: Set nocarrier when lowerdev admin down

2016-04-07 Thread Debabrata Banerjee

When the lowerdev is set administratively down disable carrier on the
macvlan interface. This means operstate gets set properly instead of
still being "up".

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/macvlan.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..16d0e56 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1525,10 +1525,14 @@ static int macvlan_device_event(struct notifier_block 
*unused,
 
switch (event) {
case NETDEV_UP:
+   case NETDEV_DOWN:
case NETDEV_CHANGE:
-   list_for_each_entry(vlan, &port->vlans, list)
+   list_for_each_entry(vlan, &port->vlans, list) {
netif_stacked_transfer_operstate(vlan->lowerdev,
 vlan->dev);
+   if (!(vlan->lowerdev->flags & IFF_UP))
+   netif_carrier_off(vlan->dev);
+   }
break;
case NETDEV_FEAT_CHANGE:
list_for_each_entry(vlan, &port->vlans, list) {
-- 
2.8.0

Re: [PATCH v2 net-next 00/10] allow bpf attach to tracepoints


Series applied, thanks Alexei.

Re: [PATCH net-next v2] macvlan: Support interface operstate properly

2016-04-07 Thread Banerjee, Debabrata

On 4/7/16, 7:05 AM, "Nikolay Aleksandrov"  wrote:



>On 04/07/2016 12:36 AM, Debabrata Banerjee wrote:
>> Set appropriate macvlan interface status based on lower device and our
>> status. Can be up, down, or lowerlayerdown.
>What about dormant ?
>
>That being said I understand the need to switch to lowerlayerdown when the 
>lower
>device is in "down", which is basically the most important change of this 
>patch.
>The rest is already handled by link watch based on carrier state. By now people
>are used to having lowerlayerdown when there's no carrier, now it can also mean
>that the lower device has been brought admin down.
>
>Here's another interesting state:
>6: mac1@eth2:  mtu 
>1500 qdisc noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
>Prior to this patch the macvlan would stay in dormant state and it will also 
>propagate
>to devices stacked on top of it.

The no carrier issue gave me an idea. What if we just set no carrier on the 
macvlan device?
This seems appropriate, but it doesn't directly address the dormant issue, 
although that could
be a separate patch. Will this cause any new issues? It's very simple, new 
patch incoming.

Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-07 Thread Andrew Lunn

On Thu, Apr 07, 2016 at 04:43:47PM -0500, Timur Tabi wrote:
> On my platform, firmware (UEFI) configures all of the GPIOs.  I need
> to get confirmation, but it appears that we don't actually make any
> GPIO calls at all.  I see code that looks like this:
> 
>   for (i = 0; (!adpt->no_mdio_gpio) && i < EMAC_NUM_GPIO; i++) {
>   gpio_info = &adpt->gpio_info[i];
>   retval = of_get_named_gpio(node, gpio_info->name, 0);
>   if (retval < 0)
>   return retval;
> 
> And on our ACPI system, adpt->no_mdio_gpio is always true:
> 
>   /* Assume GPIOs required for MDC/MDIO are enabled in firmware */
>   adpt->no_mdio_gpio = true;

There are two different things here. One is configuring the pin to be
a GPIO. The second is using the GPIO as a GPIO. In this case,
bit-banging the MDIO bus.

The firmware could be doing the configuration, setting the pin as a
GPIO. However, the firmware cannot be doing the MDIO bit-banging to
make an MDIO bus available. Linux has to do that.

Or it could be we have all completely misunderstood the hardware, and
we are not doing bit-banging GPIO MDIO. There is a real MDIO
controller there, we don't use these pins as GPIOs, etc

   Andrew

Re: [PATCH net-next v2] sock: make lockdep_sock_is_held inline and conditional on LOCKDEP

From: Hannes Frederic Sowa 
Date: Fri,  8 Apr 2016 01:35:18 +0200

> lockdep_is_held is only specified if CONFIG_LOCKDEP is defined, so make
> it depending on it.
> 
> Also add the missing inline keyword, so no warnings about unused functions
> show up during complilation.
> 
> Cc: Eric Dumazet 
> Cc: David Miller 
> Signed-off-by: Hannes Frederic Sowa 

I already applied your inline patch, I applied the following:


[PATCH] net: Fix build failure due to lockdep_sock_is_held().

Needs to be protected with CONFIG_LOCKDEP.

Based upon a patch by Hannes Frederic Sowa.

Signed-off-by: David S. Miller 
---
 include/net/sock.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 46b2937..81d6fec 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1360,6 +1360,7 @@ do {  
\
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
 } while (0)
 
+#ifdef CONFIG_LOCKDEP
 static inline bool lockdep_sock_is_held(const struct sock *csk)
 {
struct sock *sk = (struct sock *)csk;
@@ -1367,6 +1368,7 @@ static inline bool lockdep_sock_is_held(const struct sock 
*csk)
return lockdep_is_held(&sk->sk_lock) ||
   lockdep_is_held(&sk->sk_lock.slock);
 }
+#endif
 
 void lock_sock_nested(struct sock *sk, int subclass);
 
-- 
2.1.0

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline

From: Hannes Frederic Sowa 
Date: Thu,  7 Apr 2016 23:53:35 +0200

> I forgot to add inline to lockdep_sock_is_held, so it generated all
> kinds of build warnings if not build with lockdep support.
> 
> Reported-by: kbuild test robot 
> Signed-off-by: Hannes Frederic Sowa 

Applied, thanks.

[PATCH v8 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-04-07 Thread Dexuan Cui

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by introducing
a new socket address family AF_HYPERV.

Signed-off-by: Dexuan Cui 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Vitaly Kuznetsov 
---
 MAINTAINERS |2 +
 include/linux/hyperv.h  |   16 +
 include/linux/socket.h  |5 +-
 include/net/af_hvsock.h |   51 ++
 include/uapi/linux/hyperv.h |   25 +
 net/Kconfig |1 +
 net/Makefile|1 +
 net/hv_sock/Kconfig |   10 +
 net/hv_sock/Makefile|3 +
 net/hv_sock/af_hvsock.c | 1483 +++
 10 files changed, 1595 insertions(+), 2 deletions(-)
 create mode 100644 include/net/af_hvsock.h
 create mode 100644 net/hv_sock/Kconfig
 create mode 100644 net/hv_sock/Makefile
 create mode 100644 net/hv_sock/af_hvsock.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 67d99dd..7b6f203 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5267,7 +5267,9 @@ F:drivers/pci/host/pci-hyperv.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/video/fbdev/hyperv_fb.c
+F: net/hv_sock/
 F: include/linux/hyperv.h
+F: include/net/af_hvsock.h
 F: tools/hv/
 F: Documentation/ABI/stable/sysfs-bus-vmbus
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index aa0fadc..b92439d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1338,4 +1338,20 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id,
  const uuid_le *shv_host_servie_id);
+struct vmpipe_proto_header {
+   u32 pkt_type;
+   u32 data_size;
+} __packed;
+
+#define HVSOCK_HEADER_LEN  (sizeof(struct vmpacket_descriptor) + \
+sizeof(struct vmpipe_proto_header))
+
+/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */
+#define PREV_INDICES_LEN   (sizeof(u64))
+
+#define HVSOCK_PKT_LEN(payload_len)(HVSOCK_HEADER_LEN + \
+   ALIGN((payload_len), 8) + \
+   PREV_INDICES_LEN)
+#define HVSOCK_MIN_PKT_LEN HVSOCK_PKT_LEN(1)
+
 #endif /* _HYPERV_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 73bf6c6..88b1ccd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -201,8 +201,8 @@ struct ucred {
 #define AF_NFC 39  /* NFC sockets  */
 #define AF_VSOCK   40  /* vSockets */
 #define AF_KCM 41  /* Kernel Connection Multiplexor*/
-
-#define AF_MAX 42  /* For now.. */
+#define AF_HYPERV  42  /* Hyper-V Sockets  */
+#define AF_MAX 43  /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC  AF_UNSPEC
@@ -249,6 +249,7 @@ struct ucred {
 #define PF_NFC AF_NFC
 #define PF_VSOCK   AF_VSOCK
 #define PF_KCM AF_KCM
+#define PF_HYPERV  AF_HYPERV
 #define PF_MAX AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h
new file mode 100644
index 000..a5aa28d
--- /dev/null
+++ b/include/net/af_hvsock.h
@@ -0,0 +1,51 @@
+#ifndef __AF_HVSOCK_H__
+#define __AF_HVSOCK_H__
+
+#include 
+#include 
+#include 
+
+#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE)
+#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE)
+
+#define HVSOCK_RCV_BUF_SZ  VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV
+#define HVSOCK_SND_BUF_SZ  PAGE_SIZE
+
+#define sk_to_hvsock(__sk)((struct hvsock_sock *)(__sk))
+#define hvsock_to_sk(__hvsk)   ((struct sock *)(__hvsk))
+
+struct hvsock_sock {
+   /* sk must be the first member. */
+   struct sock sk;
+
+   struct sockaddr_hv local_addr;
+   struct sockaddr_hv remote_addr;
+
+   /* protected by the global hvsock_mutex */
+   struct list_head bound_list;
+   struct list_head connected_list;
+
+   struct list_head accept_queue;
+   /* used by enqueue and dequeue */
+   struct mutex accept_queue_mutex;
+
+   struct delayed_work dwork;
+
+   u32 peer_shutdown;
+
+   struct vmbus_channel *channel;
+
+   struct {
+   struct vmpipe_proto_header hdr;
+   char buf[HVSOCK_SND_BUF_SZ];
+   } __packed

[PATCH v8 net-next 0/1] introduce Hyper-V VM Sockets(hv_sock)

2016-04-07 Thread Dexuan Cui

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by
introducing a new socket address family AF_HYPERV.

Note: the VMBus driver side's supporting patches have been in the mainline
tree.

I know the kernel has already had a VM Sockets driver (AF_VSOCK) based
on VMware VMCI (net/vmw_vsock/, drivers/misc/vmw_vmci), and KVM is
proposing AF_VSOCK of virtio version:
http://marc.info/?l=linux-netdev&m=145952064004765&w=2

However, though Hyper-V Sockets may seem conceptually similar to
AF_VOSCK, there are differences in the transportation layer, and IMO these
make the direct code reusing impractical:

1. In AF_VSOCK, the endpoint type is: , but in
AF_HYPERV, the endpoint type is: . Here GUID
is 128-bit.

2. AF_VSOCK supports SOCK_DGRAM, while AF_HYPERV doesn't.

3. AF_VSOCK supports some special sock opts, like SO_VM_SOCKETS_BUFFER_SIZE,
SO_VM_SOCKETS_BUFFER_MIN/MAX_SIZE and SO_VM_SOCKETS_CONNECT_TIMEOUT.
These are meaningless to AF_HYPERV.

4. Some AF_VSOCK's VMCI transportation ops are meanless to AF_HYPERV/VMBus,
like .notify_recv_init
.notify_recv_pre_block
.notify_recv_pre_dequeue
.notify_recv_post_dequeue
.notify_send_init
.notify_send_pre_block
.notify_send_pre_enqueue
.notify_send_post_enqueue
etc.

So I think we'd better introduce a new address family: AF_HYPERV.

Please review the patch.

Looking forward to your comments!

Changes since v1:
- updated "[PATCH 6/7] hvsock: introduce Hyper-V VM Sockets feature"
- added __init and __exit for the module init/exit functions
- net/hv_sock/Kconfig: "default m" -> "default m if HYPERV"
- MODULE_LICENSE: "Dual MIT/GPL" -> "Dual BSD/GPL"

Changes since v2:
- fixed various coding issue pointed out by David Miller
- fixed indentation issues
- removed pr_debug in net/hv_sock/af_hvsock.c
- used reverse-Chrismas-tree style for local variables.
- EXPORT_SYMBOL -> EXPORT_SYMBOL_GPL

Changes since v3:
- fixed a few coding issue pointed by Vitaly Kuznetsov and Dan Carpenter
- fixed the ret value in vmbus_recvpacket_hvsock on error
- fixed the style of multi-line comment: vmbus_get_hvsock_rw_status()

Changes since v4 (https://lkml.org/lkml/2015/7/28/404):
- addressed all the comments about V4.
- treat the hvsock offers/channels as special VMBus devices
- add a mechanism to pass hvsock events to the hvsock driver
- fixed some corner cases with proper locking when a connection is closed
- rebased to the latest Greg's tree

Changes since v5 (https://lkml.org/lkml/2015/12/24/103):
- addressed the coding style issues (Vitaly Kuznetsov & David Miller, thanks!)
- used a better coding for the per-channel rescind callback (Thank Vitaly!)
- avoided the introduction of new VMBUS driver APIs vmbus_sendpacket_hvsock()
and vmbus_recvpacket_hvsock() and used vmbus_sendpacket()/vmbus_recvpacket()
in the higher level (i.e., the vmsock driver). Thank Vitaly!

Changes since v6 (http://lkml.iu.edu/hypermail/linux/kernel/1601.3/01813.html)
- only a few minor changes of coding style and comments

Changes since v7
- a few minor changes of coding style: thanks, Joe Perches!
- added some lines of comments about GUID/UUID before the struct sockaddr_hv.

Dexuan Cui (1):
hv_sock: introduce Hyper-V Sockets

MAINTAINERS |2 +
include/linux/hyperv.h | 16 +
include/linux/socket.h |5 +-
include/net/af_hvsock.h | 51 ++
include/uapi/linux/hyperv.h | 25 +
net/Kconfig |1 +
net/Makefile|1 +
net/hv_sock/Kconfig | 10 +
net/hv_sock/Makefile|3 +
net/hv_sock/af_hvsock.c | 1483 +++
10 files changed, 1595 insertions(+), 2 deletions(-)
create mode 100644 include/net/af_hvsock.h
create mode 100644 net/hv_sock/Kconfig
create mode 100644 net/hv_sock/Makefile
create mode 100644 net/hv_sock/af_hvsock.c

--
2.1.0

Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO

On Thu, Apr 7, 2016 at 4:22 PM, Jesse Gross  wrote:
> On Thu, Apr 7, 2016 at 7:32 PM, Alexander Duyck  wrote:
>> This patch adds support for a feature I am calling IP ID mangling.  It is
>> basically just another way of saying the IP IDs that are transmitted by the
>> tunnel may not match up with what would normally be expected.  Specifically
>> what will happen is in the case of TSO the IP IDs on the headers will be a
>> fixed value so a given TSO will repeat the same inner IP ID value gso_segs
>> number of times.
>>
>> Signed-off-by: Alexander Duyck 
>
> If I'm understanding this correctly, enabling IP ID mangling will help
> performance on ixgbe since it will allow it to do GSO partial instead
> of plain GSO but it will hurt performance on i40e since it will drop
> from TSO to plain GSO.

Right.  However the option is currently defaulted to off, and can be
enabled per tunnel endpoint.  So if you had an ixgbe to i40e link you
could enable it on the end with the ixgbe and you should see good
performance in both directions.

> Assuming that's right, it seems like it will make it hard to chose the
> right setting without knowledge of which hardware is in use. I guess
> what we really want is "I care about nicely incrementing IP IDs" vs.
> "I don't care as long as the DF bit is set". That second case is
> really what this flag is trying to say but it seems like it is
> enforcing too much in the i40e case - I don't think anyone wants to go
> out of their way to make IP IDs jump around if incrementing is faster.

Right.  The problem is trying to sort out all the GRO/GSO bits.  I was
probably being a bit too conservative after the last few iterations
for the GRO fixes.

Just a thought.  What if I replaced NETIF_F_TSO_FIXEDID with something
that meant we could mange the IP ID like a NETIF_F_TSO_IPID_MANGLE
(advice for better name welcome).  Instead of the feature flag meaning
we are going to transmit packets with a fixed ID it would mean we
don't care about the ID and are free to mangle it as we see fit.  The
GSO type can retain the same meaning as far as that requiring the same
ID for all, but the feature would mean we will take fixed and convert
it to incrementing, or incrementing and convert it to fixed.

- Alex

[PATCH net-next v2] sock: make lockdep_sock_is_held inline and conditional on LOCKDEP

lockdep_is_held is only specified if CONFIG_LOCKDEP is defined, so make
it depending on it.

Also add the missing inline keyword, so no warnings about unused functions
show up during complilation.

Cc: Eric Dumazet 
Cc: David Miller 
Signed-off-by: Hannes Frederic Sowa 
---
 include/net/sock.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index eb2d7c3e120b25..81d6fecec0a2c0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1360,13 +1360,15 @@ do {
\
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
 } while (0)
 
-static bool lockdep_sock_is_held(const struct sock *csk)
+#ifdef CONFIG_LOCKDEP
+static inline bool lockdep_sock_is_held(const struct sock *csk)
 {
struct sock *sk = (struct sock *)csk;
 
return lockdep_is_held(&sk->sk_lock) ||
   lockdep_is_held(&sk->sk_lock.slock);
 }
+#endif
 
 void lock_sock_nested(struct sock *sk, int subclass);
 
-- 
2.5.5

Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO

2016-04-07 Thread Jesse Gross

On Thu, Apr 7, 2016 at 7:32 PM, Alexander Duyck  wrote:
> This patch adds support for a feature I am calling IP ID mangling.  It is
> basically just another way of saying the IP IDs that are transmitted by the
> tunnel may not match up with what would normally be expected.  Specifically
> what will happen is in the case of TSO the IP IDs on the headers will be a
> fixed value so a given TSO will repeat the same inner IP ID value gso_segs
> number of times.
>
> Signed-off-by: Alexander Duyck 

If I'm understanding this correctly, enabling IP ID mangling will help
performance on ixgbe since it will allow it to do GSO partial instead
of plain GSO but it will hurt performance on i40e since it will drop
from TSO to plain GSO.

Assuming that's right, it seems like it will make it hard to chose the
right setting without knowledge of which hardware is in use. I guess
what we really want is "I care about nicely incrementing IP IDs" vs.
"I don't care as long as the DF bit is set". That second case is
really what this flag is trying to say but it seems like it is
enforcing too much in the i40e case - I don't think anyone wants to go
out of their way to make IP IDs jump around if incrementing is faster.

Re: [PATCH net] lockdep: provide always true lockdep_is_held stub if lockdep disabled

On 08.04.2016 01:12, Hannes Frederic Sowa wrote:

I need this to provide a generic lockdep_sock_is_held function which can
be easily used in the kernel without using ifdef PROVEN macros.

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Eric Dumazet 
Cc: David Miller 
Signed-off-by: Hannes Frederic Sowa 
---
Hello Peter and Ingo,

if it is possible coud this go in via the net-tree, as this problem is
visible there already? Would be happy to get a review.

I take this patch back, as some call sites test if the lock is 
definitely not held. I come up with a better approach.

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline


On 08.04.2016 01:08, Eric Dumazet wrote:

On Fri, 2016-04-08 at 00:47 +0200, Hannes Frederic Sowa wrote:


I see... hmpf.

Wouldn't it be nicer if I include a helper a la:

#define lockdep_is_held(lock)  1

in lockdep.h in case lockdep is globally not enabled? I do actually have
already another user for this outside of PROVE_RCU.


It probably had been discussed on lkml a long time ago.

My guess is that you can not do that, but am too lazy to tell you why ;)


I will simply try it and otherwise go with your solution. My other cases 
would need to be ifdefed then as well, but also possible.


Patch is send out and I will now research after your tip that it was 
already tried.


Thanks,
Hannes

[PATCH net] lockdep: provide always true lockdep_is_held stub if lockdep disabled

I need this to provide a generic lockdep_sock_is_held function which can
be easily used in the kernel without using ifdef PROVEN macros.

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Eric Dumazet 
Cc: David Miller 
Signed-off-by: Hannes Frederic Sowa 
---
Hello Peter and Ingo,

if it is possible coud this go in via the net-tree, as this problem is
visible there already? Would be happy to get a review.

Thanks,
Hannes

 include/linux/lockdep.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d026b190c53066..dc8d447cb3ab1c 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -428,6 +428,8 @@ struct lock_class_key { };
 #define lockdep_pin_lock(l)do { (void)(l); } while 
(0)
 #define lockdep_unpin_lock(l)  do { (void)(l); } while (0)
 
+#define lockdep_is_held(l) ({ (void)(l); (1); })
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
-- 
2.5.5

[PATCH ipoute2] iplink: display number of rx/tx queues

From: Eric Dumazet 

We can set the attributes, so would be nice to display them when
provided by the kernel.

Signed-off-by: Eric Dumazet 
---
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 3998d8c..f7bd1c7 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -894,6 +894,12 @@ int print_linkinfo(const struct sockaddr_nl *who,
if (do_link && tb[IFLA_AF_SPEC] && show_details)
print_af_spec(fp, tb[IFLA_AF_SPEC]);
 
+   if (tb[IFLA_NUM_TX_QUEUES] && show_details)
+   fprintf(fp, "numtxqueues %u ", 
rta_getattr_u32(tb[IFLA_NUM_TX_QUEUES]));
+
+   if (tb[IFLA_NUM_RX_QUEUES] && show_details)
+   fprintf(fp, "numrxqueues %u ", 
rta_getattr_u32(tb[IFLA_NUM_RX_QUEUES]));
+
if ((do_link || show_details) && tb[IFLA_IFALIAS]) {
fprintf(fp, "%salias %s", _SL_,
rta_getattr_str(tb[IFLA_IFALIAS]));

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline

On Fri, 2016-04-08 at 00:47 +0200, Hannes Frederic Sowa wrote:

> I see... hmpf.
> 
> Wouldn't it be nicer if I include a helper a la:
> 
> #define lockdep_is_held(lock)  1
> 
> in lockdep.h in case lockdep is globally not enabled? I do actually have 
> already another user for this outside of PROVE_RCU.

It probably had been discussed on lkml a long time ago.

My guess is that you can not do that, but am too lazy to tell you why ;)

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline


On 08.04.2016 00:37, Eric Dumazet wrote:

On Thu, 2016-04-07 at 15:30 -0700, Eric Dumazet wrote:


But... this wont solve the compiler error ?

include/net/sock.h: In function 'lockdep_sock_is_held':
include/net/sock.h:1367:2: error: implicit declaration of function
'lockdep_is_held' [-Werror=implicit-function-declaration]
In file included from security/lsm_audit.c:20:0:
include/net/sock.h: In function 'lockdep_sock_is_held':
include/net/sock.h:1367:2: error: implicit declaration of function
'lockdep_is_held' [-Werror=implicit-function-declaration]

$ egrep "LOCKDEP|PROVE" .config
CONFIG_LOCKDEP_SUPPORT=y
# CONFIG_PROVE_LOCKING is not set
# CONFIG_PROVE_RCU is not set



Better use something like :

diff --git a/include/net/sock.h b/include/net/sock.h
index eb2d7c3e120b..ab6b6b9469ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1360,13 +1360,15 @@ do {
\
 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
  } while (0)

-static bool lockdep_sock_is_held(const struct sock *csk)
+#ifdef CONFIG_PROVE_RCU
+static inline bool lockdep_sock_is_held(const struct sock *csk)
  {
 struct sock *sk = (struct sock *)csk;

 return lockdep_is_held(&sk->sk_lock) ||
lockdep_is_held(&sk->sk_lock.slock);
  }
+#endif


I see... hmpf.

Wouldn't it be nicer if I include a helper a la:

#define lockdep_is_held(lock)  1

in lockdep.h in case lockdep is globally not enabled? I do actually have 
already another user for this outside of PROVE_RCU.


Thanks,
Hannes

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline

On Thu, 2016-04-07 at 15:30 -0700, Eric Dumazet wrote:

> But... this wont solve the compiler error ?
> 
> include/net/sock.h: In function 'lockdep_sock_is_held':
> include/net/sock.h:1367:2: error: implicit declaration of function
> 'lockdep_is_held' [-Werror=implicit-function-declaration]
> In file included from security/lsm_audit.c:20:0:
> include/net/sock.h: In function 'lockdep_sock_is_held':
> include/net/sock.h:1367:2: error: implicit declaration of function
> 'lockdep_is_held' [-Werror=implicit-function-declaration]
> 
> $ egrep "LOCKDEP|PROVE" .config
> CONFIG_LOCKDEP_SUPPORT=y
> # CONFIG_PROVE_LOCKING is not set
> # CONFIG_PROVE_RCU is not set
> 

Better use something like :

diff --git a/include/net/sock.h b/include/net/sock.h
index eb2d7c3e120b..ab6b6b9469ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1360,13 +1360,15 @@ do {
\
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
 } while (0)
 
-static bool lockdep_sock_is_held(const struct sock *csk)
+#ifdef CONFIG_PROVE_RCU
+static inline bool lockdep_sock_is_held(const struct sock *csk)
 {
struct sock *sk = (struct sock *)csk;
 
return lockdep_is_held(&sk->sk_lock) ||
   lockdep_is_held(&sk->sk_lock.slock);
 }
+#endif
 
 void lock_sock_nested(struct sock *sk, int subclass);

[RFC PATCH 01/11] GRE: Disable segmentation offloads w/ CSUM and we are encapsulated via FOU

This patch fixes an issue I found in which we were dropping frames if we
had enabled checksums on GRE headers that were encapsulated by either FOU
or GUE.  Without this patch I was barely able to get 1 Gb/s of throughput.
With this patch applied I am now at least getting around 6 Gb/s.

The issue is due to the fact that with FOU or GUE applied we do not provide
a transport offset pointing to the GRE header, nor do we offload it in
software as the GRE header is completely skipped by GSO and treated like a
VXLAN or GENEVE type header.  As such we need to prevent the stack from
generating it and also prevent GRE from generating it via any interface we
create.

Fixes: c3483384ee511 ("gro: Allow tunnel stacking in the case of FOU/GUE")
Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |5 -
 net/core/dev.c|1 +
 net/ipv4/fou.c|6 ++
 net/ipv4/gre_offload.c|8 
 net/ipv4/ip_gre.c |   13 ++---
 5 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb0d5d09c2e4..8395308a2445 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2120,7 +2120,10 @@ struct napi_gro_cb {
/* Used in foo-over-udp, set in udp[46]_gro_receive */
u8  is_ipv6:1;
 
-   /* 7 bit hole */
+   /* Used in GRE, set in fou/gue_gro_receive */
+   u8  is_fou:1;
+
+   /* 6 bit hole */
 
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum  csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index 273f10d1e306..d51343a821ed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4439,6 +4439,7 @@ static enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
+   NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
/* Setup for GRO checksum validation */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 5a94aea280d3..a39068b4a4d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -203,6 +203,9 @@ static struct sk_buff **fou_gro_receive(struct sk_buff 
**head,
 */
NAPI_GRO_CB(skb)->encap_mark = 0;
 
+   /* Flag this frame as already having an outer encap header */
+   NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
@@ -368,6 +371,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff 
**head,
 */
NAPI_GRO_CB(skb)->encap_mark = 0;
 
+   /* Flag this frame as already having an outer encap header */
+   NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[guehdr->proto_ctype]);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index c47539d04b88..6a5bd4317866 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -150,6 +150,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff 
**head,
if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
goto out;
 
+   /* We can only support GRE_CSUM if we can track the location of
+* the GRE header.  In the case of FOU/GUE we cannot because the
+* outer UDP header displaces the GRE header leaving us in a state
+* of limbo.
+*/
+   if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou)
+   goto out;
+
type = greh->protocol;
 
rcu_read_lock();
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 31936d387cfd..af5d1f38217f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -862,9 +862,16 @@ static void __gre_tunnel_init(struct net_device *dev)
dev->hw_features|= GRE_FEATURES;
 
if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
-   /* TCP offload with GRE SEQ is not supported. */
-   dev->features|= NETIF_F_GSO_SOFTWARE;
-   dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+   /* TCP offload with GRE SEQ is not supported, nor
+* can we support 2 levels of outer headers requiring
+* an update.
+*/
+   if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
+   (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
+   dev->features|= NETIF_F_GSO_SOFTWARE;
+   dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+   }
+
/* Can use a lockless transmit, unless we generate
 * output sequences
 */

[RFC PATCH 09/11] i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM

This patch makes it so that i40e and i40evf can use GSO_PARTIAL to support
segmentation for frames with checksums enabled in outer headers.  As a
result we can now send data over these types of tunnels at over 20Gb/s
versus the 12Gb/s that was previously possible on my system.

The advantage with the i40e parts is that this offload is mostly
transparent as the hardware still deals with the inner and/or outer IPv4
headers so the IP ID is still incrementing for both when this offload is
performed.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |6 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |7 ++-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   |7 ++-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c |6 +-
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 07a70c4ac49f..6c095b07ce82 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9119,17 +9119,21 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
   NETIF_F_TSO_ECN  |
   NETIF_F_TSO6 |
   NETIF_F_GSO_GRE  |
+  NETIF_F_GSO_GRE_CSUM |
   NETIF_F_GSO_IPIP |
   NETIF_F_GSO_SIT  |
   NETIF_F_GSO_UDP_TUNNEL   |
   NETIF_F_GSO_UDP_TUNNEL_CSUM  |
+  NETIF_F_GSO_PARTIAL  |
   NETIF_F_SCTP_CRC |
   NETIF_F_RXHASH   |
   NETIF_F_RXCSUM   |
   0;
 
if (!(pf->flags & I40E_FLAG_OUTER_UDP_CSUM_CAPABLE))
-   netdev->hw_enc_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+   netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+
+   netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM;
 
/* record features VLANs can make use of */
netdev->vlan_features |= netdev->hw_enc_features;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6e44cf118843..ede4183468b9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2300,11 +2300,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, 
u64 *cd_type_cmd_tso_mss)
}
 
if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+SKB_GSO_GRE_CSUM |
 SKB_GSO_IPIP |
 SKB_GSO_SIT |
 SKB_GSO_UDP_TUNNEL |
 SKB_GSO_UDP_TUNNEL_CSUM)) {
-   if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
+   if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
+   l4.udp->len = 0;
+
/* determine offset of outer transport header */
l4_offset = l4.hdr - skb->data;
 
@@ -2481,6 +2485,7 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 
*tx_flags,
 
/* indicate if we need to offload outer UDP header */
if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
+   !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index f101895ecf4a..6ce00547c13e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1565,11 +1565,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, 
u64 *cd_type_cmd_tso_mss)
}
 
if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+SKB_GSO_GRE_CSUM |
 SKB_GSO_IPIP |
 SKB_GSO_SIT |
 SKB_GSO_UDP_TUNNEL |
 SKB_GSO_UDP_TUNNEL_CSUM)) {
-   if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
+   if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
+   l4.udp->len = 0;
+
/* determine offset of outer transport header */

[RFC PATCH 02/11] ethtool: Add support for toggling any of the GSO offloads

The strings were missing for several of the GSO offloads that are
available.  This patch provides the missing strings so that we can toggle
or query any of them via the ethtool command.

Signed-off-by: Alexander Duyck 
---
 net/core/ethtool.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f426c5ad6149..6a7f99661c2f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -82,9 +82,11 @@ static const char 
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] =  "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] =  "tx-gre-segmentation",
+   [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] =  "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] =   "tx-udp_tnl-segmentation",
+   [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] ="tx-checksum-sctp",

[RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO

This patch adds support for a feature I am calling IP ID mangling.  It is
basically just another way of saying the IP IDs that are transmitted by the
tunnel may not match up with what would normally be expected.  Specifically
what will happen is in the case of TSO the IP IDs on the headers will be a
fixed value so a given TSO will repeat the same inner IP ID value gso_segs
number of times.

Signed-off-by: Alexander Duyck 
---
 drivers/net/geneve.c |   24 ++--
 include/net/udp_tunnel.h |8 
 include/uapi/linux/if_link.h |1 +
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index bc168894bda3..6352223d80c3 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -80,6 +80,7 @@ struct geneve_dev {
 #define GENEVE_F_UDP_ZERO_CSUM_TX  BIT(0)
 #define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1)
 #define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2)
+#define GENEVE_F_TCP_FIXEDID   BIT(3)
 
 struct geneve_sock {
boolcollect_md;
@@ -702,9 +703,14 @@ static int geneve_build_skb(struct rtable *rt, struct 
sk_buff *skb,
int min_headroom;
int err;
bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX);
+   int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
skb_scrub_packet(skb, xnet);
 
+   if ((flags & GENEVE_F_TCP_FIXEDID) && skb_is_gso(skb) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
+   type |= SKB_GSO_TCP_FIXEDID;
+
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr);
err = skb_cow_head(skb, min_headroom);
@@ -713,7 +719,7 @@ static int geneve_build_skb(struct rtable *rt, struct 
sk_buff *skb,
goto free_rt;
}
 
-   skb = udp_tunnel_handle_offloads(skb, udp_sum);
+   skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
goto free_rt;
@@ -739,9 +745,14 @@ static int geneve6_build_skb(struct dst_entry *dst, struct 
sk_buff *skb,
int min_headroom;
int err;
bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX);
+   int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
skb_scrub_packet(skb, xnet);
 
+   if ((flags & GENEVE_F_TCP_FIXEDID) && skb_is_gso(skb) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
+   type |= SKB_GSO_TCP_FIXEDID;
+
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+ GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr);
err = skb_cow_head(skb, min_headroom);
@@ -750,7 +761,7 @@ static int geneve6_build_skb(struct dst_entry *dst, struct 
sk_buff *skb,
goto free_dst;
}
 
-   skb = udp_tunnel_handle_offloads(skb, udp_sum);
+   skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
goto free_dst;
@@ -1249,6 +1260,7 @@ static const struct nla_policy 
geneve_policy[IFLA_GENEVE_MAX + 1] = {
[IFLA_GENEVE_UDP_CSUM]  = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
+   [IFLA_GENEVE_IPID_MANGLE]   = { .type = NLA_FLAG },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1436,6 +1448,9 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
 
+   if (data[IFLA_GENEVE_IPID_MANGLE])
+   flags |= GENEVE_F_TCP_FIXEDID;
+
return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
dst_port, metadata, flags);
 }
@@ -1460,6 +1475,7 @@ static size_t geneve_get_size(const struct net_device 
*dev)
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX 
*/
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX 
*/
+   nla_total_size(0) +  /* IFLA_GENEVE_IPID_MANGLE */
0;
 }
 
@@ -1505,6 +1521,10 @@ static int geneve_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
   !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX)))
goto nla_put_failure;
 
+   if ((geneve->flags & GENEVE_F_TCP_FIXEDID) &&
+   nla_put_flag(skb, IFLA_GENEVE_IPID_MANGLE))
+   goto nla_put_failure;
+
return 0;
 
 nla_put_failure:
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b83114077cee..c44d04259665 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -98,14 +98,6 @@ struct metadata_dst *udp_tun_rx_d

[RFC PATCH 11/11] igb/igbvf: Add support for GSO partial

This patch adds support for partial GSO segmentation in the case of
encapsulated frames.  Specifically with this change the driver can perform
segmentation as long as the type is either SKB_GSO_TCP_FIXEDID or
SKB_GSO_TCPV6.  If neither of these gso types are specified then tunnel
segmentation is not supported and we will default back to GSO.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/igb/igb_main.c |  119 +++
 drivers/net/ethernet/intel/igbvf/netdev.c |  180 ++---
 2 files changed, 205 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 8e96c35307fb..8204ebecd2a5 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2087,6 +2087,52 @@ static int igb_ndo_fdb_add(struct ndmsg *ndm, struct 
nlattr *tb[],
return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
+#define IGB_MAX_MAC_HDR_LEN127
+#define IGB_MAX_NETWORK_HDR_LEN511
+#define IGB_GSO_PARTIAL_FEATURES (NETIF_F_TSO_FIXEDID | \
+ NETIF_F_GSO_GRE | \
+ NETIF_F_GSO_GRE_CSUM | \
+ NETIF_F_GSO_IPIP | \
+ NETIF_F_GSO_SIT | \
+ NETIF_F_GSO_UDP_TUNNEL | \
+ NETIF_F_GSO_UDP_TUNNEL_CSUM)
+static netdev_features_t
+igb_features_check(struct sk_buff *skb, struct net_device *dev,
+  netdev_features_t features)
+{
+   unsigned int network_hdr_len, mac_hdr_len;
+
+   /* Make certain the headers can be described by a context descriptor */
+   mac_hdr_len = skb_network_header(skb) - skb->data;
+   network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+   if (unlikely((mac_hdr_len > IGB_MAX_MAC_HDR_LEN) ||
+(network_hdr_len >  IGB_MAX_NETWORK_HDR_LEN)))
+   return features & ~(NETIF_F_HW_CSUM |
+   NETIF_F_SCTP_CRC |
+   NETIF_F_HW_VLAN_CTAG_TX |
+   NETIF_F_TSO |
+   NETIF_F_TSO_FIXEDID |
+   NETIF_F_TSO6);
+
+   /* We can only support a fixed IPv4 ID or IPv6 header for TSO
+* with tunnels.  So if we aren't using a tunnel, or we aren't
+* performing TSO with a fixed ID we must strip the partial
+* features.
+*/
+   if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+  SKB_GSO_GRE_CSUM |
+  SKB_GSO_IPIP |
+  SKB_GSO_SIT |
+  SKB_GSO_UDP_TUNNEL |
+  SKB_GSO_UDP_TUNNEL_CSUM)) ||
+   !(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID |
+  SKB_GSO_TCPV6)))
+   return features & ~(NETIF_F_GSO_PARTIAL |
+   IGB_GSO_PARTIAL_FEATURES);
+
+   return features;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
.ndo_open   = igb_open,
.ndo_stop   = igb_close,
@@ -2111,7 +2157,7 @@ static const struct net_device_ops igb_netdev_ops = {
.ndo_fix_features   = igb_fix_features,
.ndo_set_features   = igb_set_features,
.ndo_fdb_add= igb_ndo_fdb_add,
-   .ndo_features_check = passthru_features_check,
+   .ndo_features_check = igb_features_check,
 };
 
 /**
@@ -2384,6 +2430,9 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
if (hw->mac.type >= e1000_82576)
netdev->features |= NETIF_F_SCTP_CRC;
 
+   netdev->gso_partial_features = IGB_GSO_PARTIAL_FEATURES;
+   netdev->features |= NETIF_F_GSO_PARTIAL | IGB_GSO_PARTIAL_FEATURES;
+
/* copy netdev features into list of user selectable features */
netdev->hw_features |= netdev->features;
netdev->hw_features |= NETIF_F_RXALL;
@@ -2401,14 +2450,16 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 NETIF_F_SCTP_CRC;
 
netdev->mpls_features |= NETIF_F_HW_CSUM;
-   netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+   netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+  NETIF_F_TSO |
+  NETIF_F_TSO6 |
+  NETIF_F_GSO_PARTIAL |
+  IGB_GSO_PARTIAL_FEATURES;
 
netdev->priv_flags |= IFF_SUPP_NOFCS;
 
-   if (pci_using_dac) {
+   if (pci_using_dac)
netdev->features |= NETIF_F_HIGHDMA;
-   netdev->vlan_features |= NETIF_F_HIGHDMA;
-   }
 
net

[RFC PATCH 05/11] GSO: Support partial segmentation offload

This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_FIXEDID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_FIXEDID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |5 +
 include/linux/netdevice.h   |2 ++
 include/linux/skbuff.h  |9 +++--
 net/core/dev.c  |   31 ++-
 net/core/ethtool.c  |1 +
 net/core/skbuff.c   |   29 -
 net/ipv4/af_inet.c  |   20 
 net/ipv4/gre_offload.c  |   26 +-
 net/ipv4/tcp_offload.c  |   10 --
 net/ipv4/udp_offload.c  |   27 +--
 net/ipv6/ip6_offload.c  |   10 +-
 11 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 5d7da1ac6df5..6ef549ec5b13 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
NETIF_F_GSO_SIT_BIT,/* ... SIT tunnel with TSO */
NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */
NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+   NETIF_F_GSO_PARTIAL_BIT,/* ... Only segment inner-most L4
+* in hardware and all other
+* headers in software.
+*/
NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
/**/NETIF_F_GSO_LAST =  /* last bit, see GSO_MASK */
NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_FIXEDID__NETIF_F(TSO_FIXEDID)
+#define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index abf8cc2d9bfb..36a079598034 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,7 @@ struct net_device {
netdev_features_t   vlan_features;
netdev_features_t   hw_enc_features;
netdev_features_t   mpls_features;
+   netdev_features_t   gso_partial_features;
 
int ifindex;
int group;
@@ -4023,6 +4024,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> 
NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> 
NETIF_F_GSO_SHIFT));
 
return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+   SKB_GSO_PARTIAL = 1 << 13,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct 
sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-   int mac_offset;
+   union {
+   int mac_offset;
+   int data_offset;
+   };
int encap_level;
__wsum  csum;
__u16   csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ed2852b3706..53b216b617c3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buf

[RFC PATCH 04/11] GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID values

This patch does two things.

First it allows TCP to aggregate TCP frames with a fixed IPv4 ID field.  As
a result we should now be able to aggregate flows that were converted from
IPv6 to IPv4.  In addition this allows us more flexibility for future
implementations of segmentation as we may be able to use a fixed IP ID when
segmenting the flow.

The second thing this addresses is that it places limitations on the outer
IPv4 ID header in the case of tunneled frames.  Specifically it forces the
IP ID to be incrementing by 1 unless the DF bit is set in the outer IPv4
header.  This way we can avoid creating overlapping series of IP IDs that
could possibly be fragmented if the frame goes through GRO and is then
resegmented via GSO.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |5 -
 net/core/dev.c|1 +
 net/ipv4/af_inet.c|   35 ---
 net/ipv4/tcp_offload.c|   16 +++-
 net/ipv6/ip6_offload.c|8 ++--
 5 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 38ccc01eb97d..abf8cc2d9bfb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2123,7 +2123,10 @@ struct napi_gro_cb {
/* Used in GRE, set in fou/gue_gro_receive */
u8  is_fou:1;
 
-   /* 6 bit hole */
+   /* Used to determine if flush_id can be ignored */
+   u8  is_atomic:1;
+
+   /* 5 bit hole */
 
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum  csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index d51343a821ed..4ed2852b3706 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4440,6 +4440,7 @@ static enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
+   NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
/* Setup for GRO checksum validation */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 19e9a2c45d71..98fe04b99e01 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,6 +1328,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
 
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+   u16 flush_id;
 
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1351,16 +1352,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
-   /* Save the IP ID check to be included later when we get to
-* the transport layer so only the inner most IP ID is checked.
-* This is because some GSO/TSO implementations do not
-* correctly increment the IP ID for the outer hdrs.
-*/
-   NAPI_GRO_CB(p)->flush_id =
-   ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ 
id);
NAPI_GRO_CB(p)->flush |= flush;
+
+   /* We need to store of the IP ID check to be included later
+* when we can verify that this packet does in fact belong
+* to a given flow.
+*/
+   flush_id = (u16)(id - ntohs(iph2->id));
+
+   /* This bit of code makes it much easier for us to identify
+* the cases where we are doing atomic vs non-atomic IP ID
+* checks.  Specifically an atomic check can return IP ID
+* values 0 - 0x, while a non-atomic check can only
+* return 0 or 0x.
+*/
+   if (!NAPI_GRO_CB(p)->is_atomic ||
+   !(iph->frag_off & htons(IP_DF))) {
+   flush_id ^= NAPI_GRO_CB(p)->count;
+   flush_id = flush_id ? 0x : 0;
+   }
+
+   /* If the previous IP ID value was based on an atomic
+* datagram we can overwrite the value and ignore it.
+*/
+   if (NAPI_GRO_CB(skb)->is_atomic)
+   NAPI_GRO_CB(p)->flush_id = flush_id;
+   else
+   NAPI_GRO_CB(p)->flush_id |= flush_id;
}
 
+   NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 08dd25d835af..d1ffd55289bd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -239,7 +239,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, 
struct sk_buff *skb)
 
 found:

[RFC PATCH 08/11] Documentation: Add documentation for TSO and GSO features

This document is a starting point for defining the TSO and GSO features.
The whole thing is starting to get a bit messy so I wanted to make sure we
have notes somwhere to start describing what does and doesn't work.

Signed-off-by: Alexander Duyck 
---
 Documentation/networking/segmentation-offloads.txt |  127 
 1 file changed, 127 insertions(+)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

diff --git a/Documentation/networking/segmentation-offloads.txt 
b/Documentation/networking/segmentation-offloads.txt
new file mode 100644
index ..b06dd9b65ab3
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -0,0 +1,127 @@
+Segmentation Offloads in the Linux Networking Stack
+
+Introduction
+
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+
+TCP Segmentation Offload
+
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation requested the bit for either SKB_GSO_TCP or
+SKB_GSO_TCP6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload.  For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able determine the offsets of the IP or IPv6 header and the
+TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment.  If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID.
+
+UDP Fragmentation Offload
+=
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments.  Many of the requirements for UDP
+fragmentation offload are the same as TSO.  However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel.  In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPIP, SKB_GSO_SIT, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers.  For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported.  The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers.  Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel:
+   Outer   Inner
+MACskb_mac_header
+Networkskb_network_header  skb_inner_network_header
+Transport  skb_transport_header
+
+UDP/GRE Tunnel:
+   Outer   Inner
+MACskb_mac_header  skb_inner_mac_header
+Networkskb_network_header  skb_inner_network_header
+Transport  skb_transport_headerskb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_REMCSUM which indicates that a given tunnel header
+has requested a remote checksum offload.  In this case the inner headers
+will be left with a partial checksum and only the outer header checksum
+will be computed.
+
+Generic Segmentation Offload
+
+
+Generic segmentation offload is a pure software offload that is meant to
+deal with cases where device drivers cannot perform the offloads described
+above.  What occurs in GSO is that a given skbuff will have its data broken
+out over multipl

[RFC PATCH 10/11] ixgbe/ixgbevf: Add support for GSO partial

This patch adds support for partial GSO segmentation in the case of
encapsulated frames.  Specifically with this change the driver can perform
segmentation as long as the type is either SKB_GSO_TCP_FIXEDID or
SKB_GSO_TCPV6.  If neither of these gso types are specified then tunnel
segmentation is not supported and we will default back to GSO.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  115 +++-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  122 -
 2 files changed, 180 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c6bd3ae5f986..57e083f6c8a9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7195,9 +7195,18 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 struct ixgbe_tx_buffer *first,
 u8 *hdr_len)
 {
+   u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
struct sk_buff *skb = first->skb;
-   u32 vlan_macip_lens, type_tucmd;
-   u32 mss_l4len_idx, l4len;
+   union {
+   struct iphdr *v4;
+   struct ipv6hdr *v6;
+   unsigned char *hdr;
+   } ip;
+   union {
+   struct tcphdr *tcp;
+   unsigned char *hdr;
+   } l4;
+   u32 paylen, l4_offset;
int err;
 
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -7210,46 +7219,52 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
if (err < 0)
return err;
 
+   ip.hdr = skb_network_header(skb);
+   l4.hdr = skb_checksum_start(skb);
+
/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP;
 
-   if (first->protocol == htons(ETH_P_IP)) {
-   struct iphdr *iph = ip_hdr(skb);
-   iph->tot_len = 0;
-   iph->check = 0;
-   tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-iph->daddr, 0,
-IPPROTO_TCP,
-0);
+   /* initialize outer IP header fields */
+   if (ip.v4->version == 4) {
+   /* IP header will have to cancel out any data that
+* is not a part of the outer IP header
+*/
+   ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+ csum_unfold(l4.tcp->check)));
type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
+
+   ip.v4->tot_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
   IXGBE_TX_FLAGS_CSUM |
   IXGBE_TX_FLAGS_IPV4;
-   } else if (skb_is_gso_v6(skb)) {
-   ipv6_hdr(skb)->payload_len = 0;
-   tcp_hdr(skb)->check =
-   ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-&ipv6_hdr(skb)->daddr,
-0, IPPROTO_TCP, 0);
+   } else {
+   ip.v6->payload_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
   IXGBE_TX_FLAGS_CSUM;
}
 
-   /* compute header lengths */
-   l4len = tcp_hdrlen(skb);
-   *hdr_len = skb_transport_offset(skb) + l4len;
+   /* determine offset of inner transport header */
+   l4_offset = l4.hdr - skb->data;
+
+   /* compute length of segmentation header */
+   *hdr_len = (l4.tcp->doff * 4) + l4_offset;
+
+   /* remove payload length from inner checksum */
+   paylen = skb->len - l4_offset;
+   csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
 
/* update gso size and bytecount with header size */
first->gso_segs = skb_shinfo(skb)->gso_segs;
first->bytecount += (first->gso_segs - 1) * *hdr_len;
 
/* mss_l4len_id: use 0 as index for TSO */
-   mss_l4len_idx = l4len << IXGBE_ADVTXD_L4LEN_SHIFT;
+   mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
-   vlan_macip_lens = skb_network_header_len(skb);
-   vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
+   vlan_macip_lens = l4.hdr - ip.hdr;
+   vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
@@ -8906,17 +8921,49 @@ static void ixgbe_fwd_del(struct net_device *pdev, void 
*priv)
kfree(fwd_adapter);
 }
 
-#define IXGBE_MAX_TUNNEL_HDR_LEN 80
+#define IXGBE_MAX_MAC_HDR_LEN  127
+#define IXGBE_MAX_NETWORK_HDR_LE

[RFC PATCH 03/11] GSO: Add GSO type for fixed IPv4 ID

This patch adds support for TSO using IPv4 headers with a fixed IP ID
field.  This is meant to allow us to do a lossless GRO in the case of TCP
flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
headers.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |3 +++
 include/linux/netdevice.h   |1 +
 include/linux/skbuff.h  |   20 +++-
 net/core/ethtool.c  |1 +
 net/ipv4/af_inet.c  |   19 +++
 net/ipv4/gre_offload.c  |1 +
 net/ipv4/tcp_offload.c  |4 +++-
 net/ipv6/ip6_offload.c  |3 ++-
 8 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..5d7da1ac6df5 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
NETIF_F_UFO_BIT,/* ... UDPv4 fragmentation */
NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */
NETIF_F_TSO_ECN_BIT,/* ... TCP ECN support */
+   NETIF_F_TSO_FIXEDID_BIT,/* ... IPV4 header has fixed IP ID */
NETIF_F_TSO6_BIT,   /* ... TCPv6 segmentation */
NETIF_F_FSO_BIT,/* ... FCoE segmentation */
NETIF_F_GSO_GRE_BIT,/* ... GRE with TSO */
@@ -120,6 +121,7 @@ enum {
 #define NETIF_F_GSO_SIT__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_FIXEDID__NETIF_F(TSO_FIXEDID)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +149,7 @@ enum {
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE   (NETIF_F_TSO | NETIF_F_TSO_ECN | \
+NETIF_F_TSO_FIXEDID | \
 NETIF_F_TSO6 | NETIF_F_UFO)
 
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8395308a2445..38ccc01eb97d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4011,6 +4011,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_FIXEDID >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE!= (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..5fba16658f9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,25 @@ enum {
/* This indicates the tcp segment has CWR set. */
SKB_GSO_TCP_ECN = 1 << 3,
 
-   SKB_GSO_TCPV6 = 1 << 4,
+   SKB_GSO_TCP_FIXEDID = 1 << 4,
 
-   SKB_GSO_FCOE = 1 << 5,
+   SKB_GSO_TCPV6 = 1 << 5,
 
-   SKB_GSO_GRE = 1 << 6,
+   SKB_GSO_FCOE = 1 << 6,
 
-   SKB_GSO_GRE_CSUM = 1 << 7,
+   SKB_GSO_GRE = 1 << 7,
 
-   SKB_GSO_IPIP = 1 << 8,
+   SKB_GSO_GRE_CSUM = 1 << 8,
 
-   SKB_GSO_SIT = 1 << 9,
+   SKB_GSO_IPIP = 1 << 9,
 
-   SKB_GSO_UDP_TUNNEL = 1 << 10,
+   SKB_GSO_SIT = 1 << 10,
 
-   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+   SKB_GSO_UDP_TUNNEL = 1 << 11,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6a7f99661c2f..5340c9dbc318 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,6 +79,7 @@ static const char 
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_UFO_BIT] =  "tx-udp-fragmentation",
[NETIF_F_GSO_ROBUST_BIT] =   "tx-gso-robust",
[NETIF_F_TSO_ECN_BIT] =  "tx-tcp-ecn-segmentation",
+   [NETIF_F_TSO_FIXEDID_BIT] =  "tx-tcp-fixedid-segmentation",
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] =  "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] =  "tx-gre-segmentation",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a38b9910af60..19e9a2c45d71 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1195,10 +1195,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 static struct sk_buff *inet_gso_segment(struct sk_buff *s

[RFC PATCH 06/11] VXLAN: Add option to mangle IP IDs on inner headers when using TSO

This patch adds support for a feature I am calling IP ID mangling.  It is
basically just another way of saying the IP IDs that are transmitted by the
tunnel may not match up with what would normally be expected.  Specifically
what will happen is in the case of TSO the IP IDs on the headers will be a
fixed value so a given TSO will repeat the same inner IP ID value gso_segs
number of times.

Signed-off-by: Alexander Duyck 
---
 drivers/net/vxlan.c  |   16 
 include/net/vxlan.h  |1 +
 include/uapi/linux/if_link.h |1 +
 3 files changed, 18 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 51cccddfe403..cc903ab832c2 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1783,6 +1783,10 @@ static int vxlan_build_skb(struct sk_buff *skb, struct 
dst_entry *dst,
type |= SKB_GSO_TUNNEL_REMCSUM;
}
 
+   if ((vxflags & VXLAN_F_TCP_FIXEDID) && skb_is_gso(skb) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
+   type |= SKB_GSO_TCP_FIXEDID;
+
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+ VXLAN_HLEN + iphdr_len
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
@@ -2635,6 +2639,7 @@ static const struct nla_policy 
vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_GBP]= { .type = NLA_FLAG, },
[IFLA_VXLAN_GPE]= { .type = NLA_FLAG, },
[IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
+   [IFLA_VXLAN_IPID_MANGLE]= { .type = NLA_FLAG },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -3092,6 +3097,9 @@ static int vxlan_newlink(struct net *src_net, struct 
net_device *dev,
if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
 
+   if (data[IFLA_VXLAN_IPID_MANGLE])
+   conf.flags |= VXLAN_F_TCP_FIXEDID;
+
err = vxlan_dev_configure(src_net, dev, &conf);
switch (err) {
case -ENODEV:
@@ -3154,6 +3162,10 @@ static size_t vxlan_get_size(const struct net_device 
*dev)
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX 
*/
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
+   nla_total_size(0) +/* IFLA_VXLAN_GBP */
+   nla_total_size(0) +/* IFLA_VXLAN_GPE */
+   nla_total_size(0) +/* IFLA_VXLAN_REMCSUM_NOPARTIAL 
*/
+   nla_total_size(0) +/* IFLA_VXLAN_IPID_MANGLE */
0;
 }
 
@@ -3244,6 +3256,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;
 
+   if (vxlan->flags & VXLAN_F_TCP_FIXEDID &&
+   nla_put_flag(skb, IFLA_VXLAN_IPID_MANGLE))
+   goto nla_put_failure;
+
return 0;
 
 nla_put_failure:
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index dcc6f4057115..5c2dc9ecea59 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -265,6 +265,7 @@ struct vxlan_dev {
 #define VXLAN_F_REMCSUM_NOPARTIAL  0x1000
 #define VXLAN_F_COLLECT_METADATA   0x2000
 #define VXLAN_F_GPE0x4000
+#define VXLAN_F_TCP_FIXEDID0x8000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9427f17d06d6..a3bc3f2a63d3 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -489,6 +489,7 @@ enum {
IFLA_VXLAN_COLLECT_METADATA,
IFLA_VXLAN_LABEL,
IFLA_VXLAN_GPE,
+   IFLA_VXLAN_IPID_MANGLE,
__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)

[RFC PATCH 00/11] GSO partial and TSO FIXEDID support

This patch series represents my respun version of the GSO partial work.
The major changes from the first version is that we are no longer making
the decision to mangle IP IDs transparently at the device.  Instead it is
now pushed up to the tunnel layer itself so that the tunnel is not
responsible for deciding if the IP IDs will be static or fixed for a given
TSO.

I'm a bit jammed up at the moment as I am trying to determine the best spot
to make the same change I currently am for VXLAN and GENEVE with GRE and
IPIP tunnels.  I'm assuming the correct spot would be somewhere near
iptunnel_handle_offload as I did for the other two tunnel types I have
already updated, but the flow based tunnels for GRE seem to be making it a
bit more complicated as I am not sure if a tunnel dev actually exists for
those tunnels.

This patch series is meant to apply to the dev-queue branch Jeff Kirsher's
next-queue tree at:
https://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git

I chose his tree as the Intel driver patches would not apply otherwise.

Patch 1 from the series is a copy of the patch recently accepted for the
net tree.  It is needed in this series to avoid merge conflicts later on as
we were making other changes in the GRO code.

---

Alexander Duyck (11):
  GRE: Disable segmentation offloads w/ CSUM and we are encapsulated via FOU
  ethtool: Add support for toggling any of the GSO offloads
  GSO: Add GSO type for fixed IPv4 ID
  GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID 
values
  GSO: Support partial segmentation offload
  VXLAN: Add option to mangle IP IDs on inner headers when using TSO
  GENEVE: Add option to mangle IP IDs on inner headers when using TSO
  Documentation: Add documentation for TSO and GSO features
  i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM
  ixgbe/ixgbevf: Add support for GSO partial
  igb/igbvf: Add support for GSO partial


 Documentation/networking/segmentation-offloads.txt |  127 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c|6 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|7 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  |7 +
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|6 +
 drivers/net/ethernet/intel/igb/igb_main.c  |  119 ++---
 drivers/net/ethernet/intel/igbvf/netdev.c  |  180 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |  115 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c  |  122 +++---
 drivers/net/geneve.c   |   24 ++-
 drivers/net/vxlan.c|   16 ++
 include/linux/netdev_features.h|8 +
 include/linux/netdevice.h  |   11 +
 include/linux/skbuff.h |   27 ++-
 include/net/udp_tunnel.h   |8 -
 include/net/vxlan.h|1 
 include/uapi/linux/if_link.h   |2 
 net/core/dev.c |   33 
 net/core/ethtool.c |4 
 net/core/skbuff.c  |   29 +++
 net/ipv4/af_inet.c |   70 ++--
 net/ipv4/fou.c |6 +
 net/ipv4/gre_offload.c |   35 +++-
 net/ipv4/ip_gre.c  |   13 +
 net/ipv4/tcp_offload.c |   30 +++
 net/ipv4/udp_offload.c |   27 ++-
 net/ipv6/ip6_offload.c |   21 ++
 27 files changed, 837 insertions(+), 217 deletions(-)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

--

Re: [PATCH net-next] sock: make lockdep_sock_is_held static inline

On Thu, 2016-04-07 at 23:53 +0200, Hannes Frederic Sowa wrote:
> I forgot to add inline to lockdep_sock_is_held, so it generated all
> kinds of build warnings if not build with lockdep support.
> 
> Reported-by: kbuild test robot 
> Signed-off-by: Hannes Frederic Sowa 
> ---
>  include/net/sock.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index eb2d7c3e120b25..46b29374df8ed7 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1360,7 +1360,7 @@ do {
> \
>   lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
>  } while (0)
>  
> -static bool lockdep_sock_is_held(const struct sock *csk)
> +static inline bool lockdep_sock_is_held(const struct sock *csk)
>  {
>   struct sock *sk = (struct sock *)csk;
>  


But... this wont solve the compiler error ?

include/net/sock.h: In function 'lockdep_sock_is_held':
include/net/sock.h:1367:2: error: implicit declaration of function
'lockdep_is_held' [-Werror=implicit-function-declaration]
In file included from security/lsm_audit.c:20:0:
include/net/sock.h: In function 'lockdep_sock_is_held':
include/net/sock.h:1367:2: error: implicit declaration of function
'lockdep_is_held' [-Werror=implicit-function-declaration]

$ egrep "LOCKDEP|PROVE" .config
CONFIG_LOCKDEP_SUPPORT=y
# CONFIG_PROVE_LOCKING is not set
# CONFIG_PROVE_RCU is not set

[PATCH net-next 0/2] fix two more udp pull header issues

2016-04-07 Thread Willem de Bruijn

From: Willem de Bruijn 

Follow up patches to the fixes to RxRPC and SunRPC. A scan of the
code showed two other interfaces that expect UDP packets to have
a udphdr when queued: read packet length with ioctl SIOCINQ and
receive payload checksum with socket option IP_CHECKSUM.

Willem de Bruijn (2):
  udp: do not expect udp headers on ioctl SIOCINQ
  udp: do not expect udp headers in recv cmsg IP_CMSG_CHECKSUM

 net/ipv4/ip_sockglue.c | 3 ++-
 net/ipv4/udp.c | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

[PATCH net-next 1/2] udp: do not expect udp headers on ioctl SIOCINQ

2016-04-07 Thread Willem de Bruijn

From: Willem de Bruijn 

On udp sockets, ioctl SIOCINQ returns the payload size of the first
packet. Since commit e6afc8ace6dd pulled the headers, the result is
incorrect when subtracting header length. Remove that operation.

Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing")

Signed-off-by: Willem de Bruijn 
---
 net/ipv4/udp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3563788d..d2d294b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1282,8 +1282,6 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 * of this packet since that is all
 * that will be read.
 */
-   amount -= sizeof(struct udphdr);
-
return put_user(amount, (int __user *)arg);
}
 
-- 
2.8.0.rc3.226.g39d4020

[PATCH net-next 2/2] udp: do not expect udp headers in recv cmsg IP_CMSG_CHECKSUM

2016-04-07 Thread Willem de Bruijn

From: Willem de Bruijn 

On udp sockets, recv cmsg IP_CMSG_CHECKSUM returns a checksum over
the packet payload. Since commit e6afc8ace6dd pulled the headers,
taking skb->data as the start of transport header is incorrect. Use
the transport header pointer.

Also, when peeking at an offset from the start of the packet, only
return a checksum from the start of the peeked data. Note that the
cmsg does not subtract a tail checkum when reading truncated data.

Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing")

Signed-off-by: Willem de Bruijn 
---
 net/ipv4/ip_sockglue.c | 3 ++-
 net/ipv4/udp.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 89b5f3b..279471c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -106,7 +106,8 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, 
struct sk_buff *skb,
return;
 
if (offset != 0)
-   csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+   csum = csum_sub(csum, csum_partial(skb_transport_header(skb),
+  offset, 0));
 
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d2d294b..f186313 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1375,7 +1375,7 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
-   ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
+   ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off);
 
err = copied;
if (flags & MSG_TRUNC)
-- 
2.8.0.rc3.226.g39d4020

Re: [net-next:master 194/196] include/net/sock.h:1367:9: error: implicit declaration of function 'lockdep_is_held'



On Thu, Apr 7, 2016, at 23:09, David Miller wrote:
> From: kbuild test robot 
> Date: Fri, 8 Apr 2016 05:00:42 +0800
> 
> >include/net/sock.h: In function 'lockdep_sock_is_held':
> >>> include/net/sock.h:1367:9: error: implicit declaration of function 
> >>> 'lockdep_is_held' [-Werror=implicit-function-declaration]
> >  return lockdep_is_held(&sk->sk_lock) ||
> ...
> >   1361  } while (0)
> >   1362  
> >   1363  static bool lockdep_sock_is_held(const struct sock *csk)
> >   1364  {
> >   1365  struct sock *sk = (struct sock *)csk;
> >   1366  
> >> 1367   return lockdep_is_held(&sk->sk_lock) ||
> >   1368 lockdep_is_held(&sk->sk_lock.slock);
> >   1369  }
> 
> Hmmm, Hannes to we need to make this a macro just like lockdep_is_held()
> is?

I think my newest patch should fix it. I simply forgot the inline
keyword. inline functions get invisible if not used by the compiler.

Sorry,
Hannes

[net-next:master 207/208] net/tipc/bearer.c:560:48: sparse: incompatible types in comparison expression (different base types)

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   889750bd2e08a94d52a116056d462b3a8e5616a7
commit: 5b7066c3dd24c7d538e5ee402eb24bb182c16dab [207/208] tipc: stricter 
filtering of packets in bearer layer
reproduce:
# apt-get install sparse
git checkout 5b7066c3dd24c7d538e5ee402eb24bb182c16dab
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   include/linux/compiler.h:232:8: sparse: attribute 'no_sanitize_address': 
unknown attribute
>> net/tipc/bearer.c:560:48: sparse: incompatible types in comparison 
>> expression (different base types)

vim +560 net/tipc/bearer.c

   544   * This function is called by the Ethernet driver in case of link
   545   * change event.
   546   */
   547  static int tipc_l2_device_event(struct notifier_block *nb, unsigned 
long evt,
   548  void *ptr)
   549  {
   550  struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   551  struct net *net = dev_net(dev);
   552  struct tipc_net *tn = tipc_net(net);
   553  struct tipc_bearer *b;
   554  int i;
   555  
   556  b = rtnl_dereference(dev->tipc_ptr);
   557  if (!b) {
   558  for (i = 0; i < MAX_BEARERS; b = NULL, i++) {
   559  b = rtnl_dereference(tn->bearer_list[i]);
 > 560  if (b && (b->media_ptr == dev))
   561  break;
   562  }
   563  }
   564  if (!b)
   565  return NOTIFY_DONE;
   566  
   567  b->mtu = dev->mtu;
   568  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation

Re: [PATCH V2 5/8] net: mediatek: fix mtk_pending_work

Hi John,

[auto build test ERROR on net-next/master]
[also build test ERROR on v4.6-rc2 next-20160407]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improving the system]

url:
https://github.com/0day-ci/linux/commits/John-Crispin/net-mediatek-make-the-driver-pass-stress-tests/20160408-033430
config: arm-allyesconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm 

Note: the 
linux-review/John-Crispin/net-mediatek-make-the-driver-pass-stress-tests/20160408-033430
 HEAD e648090f60723da77108430208b4b957c481048b builds fine.
  It only hurts bisectibility.

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/list.h:8:0,
from include/linux/kobject.h:20,
from include/linux/device.h:17,
from include/linux/node.h:17,
from include/linux/cpu.h:16,
from include/linux/of_device.h:4,
from drivers/net/ethernet/mediatek/mtk_eth_soc.c:15:
   drivers/net/ethernet/mediatek/mtk_eth_soc.c: In function 'mtk_pending_work':
>> include/linux/kernel.h:824:27: error: 'struct mtk_eth' has no member named 
>> 'pending_work'
 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
  ^
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1433:24: note: in expansion of 
>> macro 'container_of'
 struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work);
   ^
   include/linux/kernel.h:824:48: warning: initialization from incompatible 
pointer type
 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
   ^
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1433:24: note: in expansion of 
>> macro 'container_of'
 struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work);
   ^
   include/linux/kernel.h:824:48: warning: (near initialization for 'eth')
 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
   ^
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1433:24: note: in expansion of 
>> macro 'container_of'
 struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work);
   ^
   In file included from include/linux/compiler.h:60:0,
from include/linux/ioport.h:12,
from include/linux/device.h:16,
from include/linux/node.h:17,
from include/linux/cpu.h:16,
from include/linux/of_device.h:4,
from drivers/net/ethernet/mediatek/mtk_eth_soc.c:15:
>> include/linux/compiler-gcc.h:158:2: error: 'struct mtk_eth' has no member 
>> named 'pending_work'
 __builtin_offsetof(a, b)
 ^
   include/linux/stddef.h:16:32: note: in expansion of macro 
'__compiler_offsetof'
#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
   ^
   include/linux/kernel.h:825:29: note: in expansion of macro 'offsetof'
 (type *)( (char *)__mptr - offsetof(type,member) );})
^
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1433:24: note: in expansion of 
>> macro 'container_of'
 struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work);
   ^

vim +824 include/linux/kernel.h

^1da177e Linus Torvalds 2005-04-16  818   * @ptr:   the pointer to the 
member.
^1da177e Linus Torvalds 2005-04-16  819   * @type:  the type of the 
container struct this is embedded in.
^1da177e Linus Torvalds 2005-04-16  820   * @member:the name of the member 
within the struct.
^1da177e Linus Torvalds 2005-04-16  821   *
^1da177e Linus Torvalds 2005-04-16  822   */
^1da177e Linus Torvalds 2005-04-16  823  #define container_of(ptr, type, 
member) ({ \
^1da177e Linus Torvalds 2005-04-16 @824 const typeof( ((type 
*)0)->member ) *__mptr = (ptr);\
^1da177e Linus Torvalds 2005-04-16  825 (type *)( (char *)__mptr - 
offsetof(type,member) );})
^1da177e Linus Torvalds 2005-04-16  826  
b9d4f426 Arnaud Lacombe 2011-07-25  827  /* Rebuild everything on 
CONFIG_FTRACE_MCOUNT_RECORD */

:: The code at line 824 was first introduced by commit
:: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:: TO: Linus Torvalds 
:: CC: Linus Torvalds 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

[PATCH net-next] sock: make lockdep_sock_is_held static inline

I forgot to add inline to lockdep_sock_is_held, so it generated all
kinds of build warnings if not build with lockdep support.

Reported-by: kbuild test robot 
Signed-off-by: Hannes Frederic Sowa 
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index eb2d7c3e120b25..46b29374df8ed7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1360,7 +1360,7 @@ do {  
\
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
 } while (0)
 
-static bool lockdep_sock_is_held(const struct sock *csk)
+static inline bool lockdep_sock_is_held(const struct sock *csk)
 {
struct sock *sk = (struct sock *)csk;
 
-- 
2.5.5

Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-07 Thread Timur Tabi


Andrew Lunn wrote:

I'm back to working on this driver, and I need some more help with
how to handle the phy.  mdio-gpio.txt doesn't really tell me much.
I'm actually working on an ACPI system and not DT.


I can help you with DT, but not ACPI.

The MDIO bus can be a separate Linux device. Since you have GPIO lines
for the MDIO bus, it makes sense for this to be a mdio-gpio device. So
in DT, you would have:

mdio0: mdio {
 compatible = "virtual,mdio-gpio";
 #address-cells = <1>;
 #size-cells = <0>;
 gpios = <&qcomgpio 123 0
  &qcomgpio 124 0>;

phy0: ethernet-phy@8 {
reg = <9>;
};
};

Here i've assumed the PHY is using address 8 on the bus. Change as
needed.

In your MAC DT node, you then have phy-handle pointing to this phy:

   emac0: qcom,emac@feb2 {
cell-index = <0>;
compatible = "qcom,emac";
reg-names = "base", "csr", "ptp", "sgmii";
reg =   <0xfeb2 0x1>,
<0xfeb36000 0x1000>,
<0xfeb3c000 0x4000>,
<0xfeb38000 0x400>;
#address-cells = <0>;
interrupt-parent = <&emac0>;
#interrupt-cells = <1>;
interrupts = <0 1>;
interrupt-map-mask = <0x>;
interrupt-map = <0 &intc 0 76 0
 1 &intc 0 80 0>;
interrupt-names = "emac_core0", "sgmii_irq";
qcom,emac-tstamp-en;
qcom,emac-ptp-frac-ns-adj = <12500 1>;

phy-handle = <&phy0>
}

In the driver, you need to connect the PHY to the MAC. You do this
using something like:

  if (dev->of_node) {
  phy_np = of_parse_phandle(dev->of_node, "phy-handle", 0);
  if (!phy_np) {
  netdev_dbg(ndev, "No phy-handle found in DT\n");
  return -ENODEV;
  }

  phy_dev = of_phy_connect(ndev, phy_np, &_enet_adjust_link,
   0, pdata->phy_mode);
  if (!phy_dev) {
  netdev_err(ndev, "Could not connect to PHY\n");
  return -ENODEV;
  }


Thank you very much.  I'll study this in detail.


Do you have an ACPI table describing this hardware? What does it look
like?


So a little background.  There are several versions of this driver 
floating in Qualcomm, and this is the first serious attempt to upstream 
it.  I'm trying to reconcile Gilad's driver with the one we use 
internally for our ACPI-enabled ARM server platform (the QDF2432).


My goal is to get Gilad's driver accepted upstream with minimal changes 
on my part.  I will then follow up with several patches that enable ACPI 
and our SOC, as well as adding missing parts like ethtool and 1588 support.


On my platform, firmware (UEFI) configures all of the GPIOs.  I need to 
get confirmation, but it appears that we don't actually make any GPIO 
calls at all.  I see code that looks like this:


for (i = 0; (!adpt->no_mdio_gpio) && i < EMAC_NUM_GPIO; i++) {
gpio_info = &adpt->gpio_info[i];
retval = of_get_named_gpio(node, gpio_info->name, 0);
if (retval < 0)
return retval;

And on our ACPI system, adpt->no_mdio_gpio is always true:

/* Assume GPIOs required for MDC/MDIO are enabled in firmware */
adpt->no_mdio_gpio = true;

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.

[net-next:master 194/208] include/net/sock.h:1363:13: error: 'lockdep_sock_is_held' defined but not used

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   889750bd2e08a94d52a116056d462b3a8e5616a7
commit: 1e1d04e678cf72442f57ce82803c7a407769135f [194/208] net: introduce 
lockdep_is_held and update various places to use it
config: sparc64-allnoconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 1e1d04e678cf72442f57ce82803c7a407769135f
# save the attached .config to linux build tree
make.cross ARCH=sparc64 

All errors (new ones prefixed by >>):

   In file included from include/linux/tcp.h:22:0,
from include/linux/ipv6.h:75,
from include/net/ipv6.h:16,
from include/linux/sunrpc/clnt.h:27,
from include/linux/nfs_fs.h:30,
from arch/sparc/kernel/sys_sparc32.c:24:
   include/net/sock.h: In function 'lockdep_sock_is_held':
   include/net/sock.h:1367:2: error: implicit declaration of function 
'lockdep_is_held' [-Werror=implicit-function-declaration]
 return lockdep_is_held(&sk->sk_lock) ||
 ^
   arch/sparc/kernel/sys_sparc32.c: At top level:
>> include/net/sock.h:1363:13: error: 'lockdep_sock_is_held' defined but not 
>> used [-Werror=unused-function]
static bool lockdep_sock_is_held(const struct sock *csk)
^
   cc1: all warnings being treated as errors

vim +/lockdep_sock_is_held +1363 include/net/sock.h

  1357  sizeof((sk)->sk_lock)); 
\
  1358  lockdep_set_class_and_name(&(sk)->sk_lock.slock,
\
  1359  (skey), (sname));   
\
  1360  lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); 
\
  1361  } while (0)
  1362  
> 1363  static bool lockdep_sock_is_held(const struct sock *csk)
  1364  {
  1365  struct sock *sk = (struct sock *)csk;
  1366  
> 1367  return lockdep_is_held(&sk->sk_lock) ||
  1368 lockdep_is_held(&sk->sk_lock.slock);
  1369  }
  1370  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

David Miller  wrote:

> As you can with the function tracer and tracepoints.

I've had experience with tracepoints before (i2c and smbus).  It wasn't
particularly fun.  There's got to be some easier way to write them.

Hmmm...

Of the _enter() and _leave() macros in my tree at the moment, I have 261 that
record more information than just the name of the function.  I presume each
would need to be converted to a TRACE_EVENT*() macro as, as far as I can see
with a cursory examination, the function tracer doesn't record arguments or the
things arguments point to (which I do a lot of).  There are also another 84 of
these macros that only record the name of the function which are simpler
propositions.

Add to that 286 _debug(), _net() and _proto() macros, all of which record more
information than just the name of the calling function and don't have anything
particularly to do with function trace.

Now, I have been considering that the _net() and _proto() macros would probably
work well as TRACE_EVENT()-type constructs, allowing the tracing of protocol
flow.

I don't suppose you have a script for automatically converting something like
a printk()-type thing into a TRACE_EVENT()?

David

Re: [RFC PATCH] possible bug in handling of ipv4 route caching

2016-04-07 Thread Julian Anastasov


Hello,

On Thu, 7 Apr 2016, Chris Friesen wrote:

> Hi,
> 
> We think we may have found a bug in the handling of ipv4 route caching,
> and are curious what you think.
> 
> For local routes that require a particular output interface we do not
> want to cache the result.  Caching the result causes incorrect behaviour
> when there are multiple source addresses on the interface.  The end
> result being that if the intended recipient is waiting on that interface
> for the packet he won't receive it because it will be delivered on the
> loopback interface and the IP_PKTINFO ipi_ifindex will be set to the
> loopback interface as well.

So, if we store some orig_oif in rt_iif that
is specific (cached in nexthop) to some device, this
orig_oif should match the device. The only case when
this is violated is when we redirect traffic to lo.
Any attempts to store eth1 in rt_iif for nexthop
cache for lo should be avoided.

> This can be tested by running a program such as "dhcp_release" which
> attempts to inject a packet on a particular interface so that it is
> received by another program on the same board.  The receiving process
> should see an IP_PKTINFO ipi_ifndex value of the source interface
> (e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
> will still appear on the loopback interface in tcpdump but the important
> aspect is that the CMSG info is correct.
> 
> For what it's worth, here's a patch that we've applied locally to deal
> with the issue.
> 
> Chris
> 
> 
> 
> Signed-off-by: Allain Legacy 
> Signed-off-by: Chris Friesen 
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..e965d4b 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,17 @@ static struct rtable *__mkroute_output(const struct 
> fib_result *res,
>*/
>   if (fi && res->prefixlen < 4)
>   fi = NULL;
> + } else if ((type == RTN_LOCAL) && (orig_oif != 0)) {

So, we can be more specific. Can this work?:

} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
   (orig_oif != dev_out->ifindex)) {

I.e. we should allow to cache orig_oif=LOOPBACK_IFINDEX
but eth1 should not be cached.

> + /* For local routes that require a particular output interface
> + * we do not want to cache the result.  Caching the result
> + * causes incorrect behaviour when there are multiple source
> + * addresses on the interface, the end result being that if 
> the
> + * intended recipient is waiting on that interface for the
> + * packet he won't receive it because it will be delivered on
> + * the loopback interface and the IP_PKTINFO ipi_ifindex will
> + * be set to the loopback interface as well.
> +  */
> + fi = NULL;
>   }
>  
>   fnhe = NULL;

Regards

--
Julian Anastasov

Re: [net-next:master 194/196] include/net/sock.h:1367:9: error: implicit declaration of function 'lockdep_is_held'

From: kbuild test robot 
Date: Fri, 8 Apr 2016 05:00:42 +0800

>include/net/sock.h: In function 'lockdep_sock_is_held':
>>> include/net/sock.h:1367:9: error: implicit declaration of function 
>>> 'lockdep_is_held' [-Werror=implicit-function-declaration]
>  return lockdep_is_held(&sk->sk_lock) ||
...
>   1361} while (0)
>   1362
>   1363static bool lockdep_sock_is_held(const struct sock *csk)
>   1364{
>   1365struct sock *sk = (struct sock *)csk;
>   1366
>> 1367 return lockdep_is_held(&sk->sk_lock) ||
>   1368   lockdep_is_held(&sk->sk_lock.slock);
>   1369}

Hmmm, Hannes to we need to make this a macro just like lockdep_is_held() is?

[net-next:master 194/196] include/net/sock.h:1367:9: error: implicit declaration of function 'lockdep_is_held'

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   1fbbe1a8a9b195c4ac856540dfaef49d663c2e91
commit: 1e1d04e678cf72442f57ce82803c7a407769135f [194/196] net: introduce 
lockdep_is_held and update various places to use it
config: i386-tinyconfig (attached as .config)
reproduce:
git checkout 1e1d04e678cf72442f57ce82803c7a407769135f
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from include/linux/tcp.h:22:0,
from include/linux/ipv6.h:75,
from include/net/ipv6.h:16,
from include/linux/sunrpc/clnt.h:27,
from include/linux/nfs_fs.h:30,
from init/do_mounts.c:32:
   include/net/sock.h: In function 'lockdep_sock_is_held':
>> include/net/sock.h:1367:9: error: implicit declaration of function 
>> 'lockdep_is_held' [-Werror=implicit-function-declaration]
 return lockdep_is_held(&sk->sk_lock) ||
^
   init/do_mounts.c: At top level:
   include/net/sock.h:1363:13: warning: 'lockdep_sock_is_held' defined but not 
used [-Wunused-function]
static bool lockdep_sock_is_held(const struct sock *csk)
^
   cc1: some warnings being treated as errors

vim +/lockdep_is_held +1367 include/net/sock.h

  1361  } while (0)
  1362  
  1363  static bool lockdep_sock_is_held(const struct sock *csk)
  1364  {
  1365  struct sock *sk = (struct sock *)csk;
  1366  
> 1367  return lockdep_is_held(&sk->sk_lock) ||
  1368 lockdep_is_held(&sk->sk_lock.slock);
  1369  }
  1370  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

Re: [PATCH v2 net-next 04/10] perf, bpf: allow bpf programs attach to tracepoints

2016-04-07 Thread Peter Zijlstra

On Wed, Apr 06, 2016 at 06:43:25PM -0700, Alexei Starovoitov wrote:
> introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be attached
> to the perf tracepoint handler, which will copy the arguments into
> the per-cpu buffer and pass it to the bpf program as its first argument.
> The layout of the fields can be discovered by doing
> 'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format'
> prior to the compilation of the program with exception that first 8 bytes
> are reserved and not accessible to the program. This area is used to store
> the pointer to 'struct pt_regs' which some of the bpf helpers will use:
> +-+
> | 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
> +-+
> | N bytes | static tracepoint fields defined in tracepoint/format (bpf 
> readonly)
> +-+
> | dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
> +-+
> 
> Not that all of the fields are already dumped to user space via perf ring 
> buffer
> and broken application access it directly without consulting 
> tracepoint/format.
> Same rule applies here: static tracepoint fields should only be accessed
> in a format defined in tracepoint/format. The order of fields and
> field sizes are not an ABI.
> 
> Signed-off-by: Alexei Starovoitov 

Acked-by: Peter Zijlstra (Intel)

Re: [PATCH net-next v3 0/2] tipc: some small fixes

From: Jon Maloy 
Date: Thu,  7 Apr 2016 10:09:12 -0400

> When fix a minor buffer leak, and ensure that bearers filter packets
> correctly while they are being shut down.
> 
> v2: Corrected typos in commit #3, as per feedback from S. Shtylyov
> v3: Removed commit #3 from the series. Improved version will be 
> re-submitted later.

Series applied, thanks Jon.

Re: [PATCH v2 net-next 03/10] perf: split perf_trace_buf_prepare into alloc and update parts

2016-04-07 Thread Peter Zijlstra

On Wed, Apr 06, 2016 at 06:43:24PM -0700, Alexei Starovoitov wrote:
> split allows to move expensive update of 'struct trace_entry' to later phase.
> Repurpose unused 1st argument of perf_tp_event() to indicate event type.
> 
> While splitting use temp variable 'rctx' instead of '*rctx' to avoid
> unnecessary loads done by the compiler due to -fno-strict-aliasing
> 
> Signed-off-by: Alexei Starovoitov 

Acked-by: Peter Zijlstra (Intel)

Re: [net PATCH v3] GRE: Disable segmentation offloads w/ CSUM and we are encapsulated via FOU

From: Alexander Duyck 
Date: Tue, 05 Apr 2016 09:13:39 -0700

> This patch fixes an issue I found in which we were dropping frames if we
> had enabled checksums on GRE headers that were encapsulated by either FOU
> or GUE.  Without this patch I was barely able to get 1 Gb/s of throughput.
> With this patch applied I am now at least getting around 6 Gb/s.
> 
> The issue is due to the fact that with FOU or GUE applied we do not provide
> a transport offset pointing to the GRE header, nor do we offload it in
> software as the GRE header is completely skipped by GSO and treated like a
> VXLAN or GENEVE type header.  As such we need to prevent the stack from
> generating it and also prevent GRE from generating it via any interface we
> create.
> 
> Fixes: c3483384ee511 ("gro: Allow tunnel stacking in the case of FOU/GUE")
> Signed-off-by: Alexander Duyck 
> ---
> 
> v3: Basically the same patch as v1 and v2, but I am cutting it loose from
> the IPv4 ID patch as that one will likely need to be resolved in
> net-next.

Applied, thanks Alexander.

Re: [PATCH v2 net-next 02/10] perf: remove unused __addr variable

2016-04-07 Thread Peter Zijlstra

On Wed, Apr 06, 2016 at 06:43:23PM -0700, Alexei Starovoitov wrote:
> now all calls to perf_trace_buf_submit() pass 0 as 4th
> argument which will be repurposed in the next patch which will
> change the meaning of 1st arg of perf_tp_event() to event_type
> 
> Signed-off-by: Alexei Starovoitov 
> ---
>  include/trace/perf.h | 7 ++-
>  include/trace/trace_events.h | 3 ---
>  2 files changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/include/trace/perf.h b/include/trace/perf.h
> index 26486fcd74ce..6f7e37869065 100644
> --- a/include/trace/perf.h
> +++ b/include/trace/perf.h
> @@ -20,9 +20,6 @@
>  #undef __get_bitmask
>  #define __get_bitmask(field) (char *)__get_dynamic_array(field)
>  
> -#undef __perf_addr
> -#define __perf_addr(a)   (__addr = (a))
> -

Ah, we're not using that anylonger..

Acked-by: Peter Zijlstra (Intel)

Re: [PATCH net-next 0/8] udp: GRO in UDP sockets

From: Tom Herbert 
Date: Tue, 5 Apr 2016 08:22:48 -0700

> This patch set adds GRO functions (gro_receive and gro_complete) to UDP
> sockets and removes udp_offload infrastructure.
> 
> Add GRO functions (gro_receive and gro_complete) to UDP sockets. In
> udp_gro_receive and udp_gro_complete a socket lookup is done instead of
> looking up the port number in udp_offloads.  If a socket is found and
> there are GRO functions for it then those are called. This feature
> allows binding GRO functions to more than just a port number.
> Eventually, we will be able to use this technique to allow application
> defined GRO for an application protocol by attaching BPF porgrams to UDP
> sockets for doing GRO.
> 
> In order to implement these functions, we added exported
> udp6_lib_lookup_skb and udp4_lib_lookup_skb functions in ipv4/udp.c and
> ipv6/udp.c. Also, inet_iif and references to skb_dst() were changed to
> check that dst is set in skbuf before derefencing. In the GRO path there
> is now a UDP socket lookup performed before dst is set, to the get the
> device in that case we simply use skb->dev.
> 
> Tested:
> 
> Ran various combinations of VXLAN and GUE TCP_STREAM and TCP_RR tests.
> Did not see any material regression.

Series applied thanks Tom.

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

From: David Howells 
Date: Thu, 07 Apr 2016 21:47:19 +0100

> David Miller  wrote:
> 
>> > Excellent only if I can get at it to find out why the something went wrong.
>> > If it's lost because the machine panics, then it is worthless.
>> 
>> If you're ok with these kenter things spewing into the logs with the
>> current facility, you can run the function tracer and have it record
>> into the logs all the time too.
> 
> They don't do that unless you enable them.  And I can enable them individually
> at compile time or by set at runtime.

As you can with the function tracer and tracepoints.

We're not telling you to kill this now, we're just saying that you should
thinking about tossing this custom stuff when you have a chance.

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

David Miller  wrote:

> > Excellent only if I can get at it to find out why the something went wrong.
> > If it's lost because the machine panics, then it is worthless.
> 
> If you're ok with these kenter things spewing into the logs with the
> current facility, you can run the function tracer and have it record
> into the logs all the time too.

They don't do that unless you enable them.  And I can enable them individually
at compile time or by set at runtime.

> I don't see any argument which is appropriate for keeping this stuff
> around.

Sigh.

As I said to Joe: I don't want to deal with converting to using ftrace at the
moment whilst I'm towards the end of reworking the rewrite of the code.  I
have a stack of patches that would need reworking yet again.

David

Re: [PATCH v2 net-next 00/10] allow bpf attach to tracepoints

From: Alexei Starovoitov 
Date: Wed, 6 Apr 2016 18:43:21 -0700

> Hi Steven, Peter,

Steven/Peter, can you give this series a review?

Thanks!

Re: [PATCH net-next 0/3] sock: lockdep tightening

From: Hannes Frederic Sowa 
Date: Tue,  5 Apr 2016 17:10:13 +0200

> First patch is from Eric Dumazet and improves lockdep accuracy for
> socket locks. After that, second patch introduces lockdep_sock_is_held
> and uses it. Final patch reverts and reworks the lockdep fix from Daniel
> in the filter code, as we now have tighter lockdep support.

Series applied, thanks Hannes.

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

Joe Perches  wrote:

> > Let's see...  If the machine panics whilst I'm developing stuff (quite
> > likely if something goes wrong in BH context), how do I get at the
> > function tracing log to find out why it panicked if the log is then
> > lost?  With the serial console, at least I automatically capture the
> > output of kenter and co..
> 
> But you also don't get a bunch of other useful stuff.
> 
> btw: there is ftrace_dump_on_oops
> 
> https://lwn.net/Articles/366796/

Cute.

How do I get at the argument values and suchlike that kenter() and co print?
I presume I would need to replace any kenter() or suchlike that printed more
than just the address with a tracing hook?

David

Re: [Lsf] [LSF/MM TOPIC] Generic page-pool recycle facility?

2016-04-07 Thread Jesper Dangaard Brouer

On Thu, 7 Apr 2016 19:48:50 +
"Waskiewicz, PJ"  wrote:

> On Thu, 2016-04-07 at 16:17 +0200, Jesper Dangaard Brouer wrote:
> > (Topic proposal for MM-summit)
> > 
> > Network Interface Cards (NIC) drivers, and increasing speeds stress
> > the page-allocator (and DMA APIs).  A number of driver specific
> > open-coded approaches exists that work-around these bottlenecks in
> > the
> > page allocator and DMA APIs. E.g. open-coded recycle mechanisms, and
> > allocating larger pages and handing-out page "fragments".
> > 
> > I'm proposing a generic page-pool recycle facility, that can cover
> > the
> > driver use-cases, increase performance and open up for zero-copy RX.  
> 
> Is this based on the page recycle stuff from ixgbe that used to be in
> the driver?  If so I'd really like to be part of the discussion.

Okay, so it is not part of the driver any-longer?  I've studied the
current ixgbe driver (and other NIC drivers) closely.  Do you have some
code pointers, to this older code?

The likely-fastest recycle code I've see is in the bnx2x driver.  If
you are interested see: bnx2x_reuse_rx_data().  Again is it a bit
open-coded produce/consumer ring queue (which would be nice to also
cleanup).

To amortize the cost of allocating a single page, most other drivers
use the trick of allocating a larger (compound) page, and partition
this page into smaller "fragments".  Which also amortize the cost of
dma_map/unmap (important on non-x86).

This is actually problematic performance wise, because packet-data
(in these page fragments) only get DMA_sync'ed, and is thus considered
"read-only".  As netstack need to write packet headers, yet-another
(writable) memory area is allocated per packet (plus the SKB meta-data
struct).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

From: David Howells 
Date: Thu, 07 Apr 2016 20:45:08 +0100

> David Miller  wrote:
> 
>> Yeah this custom stuff is really inappropriate given the excellent
>> infrastructure we have these days...
> 
> Excellent only if I can get at it to find out why the something went wrong.
> If it's lost because the machine panics, then it is worthless.

If you're ok with these kenter things spewing into the logs with the
current facility, you can run the function tracer and have it record
into the logs all the time too.

I don't see any argument which is appropriate for keeping this stuff
around.

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

2016-04-07 Thread Joe Perches

On Thu, 2016-04-07 at 20:43 +0100, David Howells wrote:
> Joe Perches  wrote:
> > > Joe Perches  wrote:
> > > > It might be better to remove kenter and _enter
> > > > altogether and use function tracing instead.
> > > Possibly - but not at this time.
> > Swell.
> I didn't say I wouldn't do it - it's just that I'm trying to fix other stuff
> at the moment and don't particularly want to add that to the list just now.
> kenter, _enter and co. are serving me very well.

No worries, I didn't mean to imply you wouldn't.
It's your code, do what and when you want.

> > > Besides, isn't the function tracing log lost
> > > if the machine crashes?
> > I believe yes, but would it matter?
> Let's see...  If the machine panics whilst I'm developing stuff (quite likely
> if something goes wrong in BH context), how do I get at the function tracing
> log to find out why it panicked if the log is then lost?  With the serial
> console, at least I automatically capture the output of kenter and co..

But you also don't get a bunch of other useful stuff.

btw: there is ftrace_dump_on_oops

https://lwn.net/Articles/366796/

Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-07 Thread Andrew Lunn

> I'm back to working on this driver, and I need some more help with
> how to handle the phy.  mdio-gpio.txt doesn't really tell me much.
> I'm actually working on an ACPI system and not DT.

I can help you with DT, but not ACPI.

The MDIO bus can be a separate Linux device. Since you have GPIO lines
for the MDIO bus, it makes sense for this to be a mdio-gpio device. So
in DT, you would have:

mdio0: mdio {
compatible = "virtual,mdio-gpio";
#address-cells = <1>;
#size-cells = <0>;
gpios = <&qcomgpio 123 0
 &qcomgpio 124 0>;

phy0: ethernet-phy@8 {
reg = <9>;
};
};

Here i've assumed the PHY is using address 8 on the bus. Change as
needed.

In your MAC DT node, you then have phy-handle pointing to this phy:

  emac0: qcom,emac@feb2 {
cell-index = <0>;
compatible = "qcom,emac";
reg-names = "base", "csr", "ptp", "sgmii";
reg =   <0xfeb2 0x1>,
<0xfeb36000 0x1000>,
<0xfeb3c000 0x4000>,
<0xfeb38000 0x400>;
#address-cells = <0>;
interrupt-parent = <&emac0>;
#interrupt-cells = <1>;
interrupts = <0 1>;
interrupt-map-mask = <0x>;
interrupt-map = <0 &intc 0 76 0
 1 &intc 0 80 0>;
interrupt-names = "emac_core0", "sgmii_irq";
qcom,emac-tstamp-en;
qcom,emac-ptp-frac-ns-adj = <12500 1>;

phy-handle = <&phy0>
}

In the driver, you need to connect the PHY to the MAC. You do this
using something like:

 if (dev->of_node) {
 phy_np = of_parse_phandle(dev->of_node, "phy-handle", 0);
 if (!phy_np) {
 netdev_dbg(ndev, "No phy-handle found in DT\n");
 return -ENODEV;
 }
 
 phy_dev = of_phy_connect(ndev, phy_np, &_enet_adjust_link,
  0, pdata->phy_mode);
 if (!phy_dev) {
 netdev_err(ndev, "Could not connect to PHY\n");
 return -ENODEV;
 }

Do you have an ACPI table describing this hardware? What does it look
like?

Andrew

Re: [Lsf] [LSF/MM TOPIC] Generic page-pool recycle facility?

2016-04-07 Thread Waskiewicz, PJ

On Thu, 2016-04-07 at 16:17 +0200, Jesper Dangaard Brouer wrote:
> (Topic proposal for MM-summit)
> 
> Network Interface Cards (NIC) drivers, and increasing speeds stress
> the page-allocator (and DMA APIs).  A number of driver specific
> open-coded approaches exists that work-around these bottlenecks in
> the
> page allocator and DMA APIs. E.g. open-coded recycle mechanisms, and
> allocating larger pages and handing-out page "fragments".
> 
> I'm proposing a generic page-pool recycle facility, that can cover
> the
> driver use-cases, increase performance and open up for zero-copy RX.

Is this based on the page recycle stuff from ixgbe that used to be in
the driver?  If so I'd really like to be part of the discussion.

-PJ


-- 
PJ Waskiewicz
Principal Engineer, NetApp
e: pj.waskiew...@netapp.com
d: 503.961.3705

Re: [PATCH V2 6/8] net: mediatek: fix TX locking

2016-04-07 Thread Sergei Shtylyov


Hello.

On 04/07/2016 10:26 PM, John Crispin wrote:


Inside the TX path there is a lock inside the tx_map function. This is
however too late. The patch moves the lock to the start of the xmit
function right before the free count check of the DMA ring happens.
If we do not do this, the code becomes racy leading to TX stalls and
dropped packets. This happens as there are 2 netdevs running on the
same physical DMA ring.

Signed-off-by: John Crispin 
---
  drivers/net/ethernet/mediatek/mtk_eth_soc.c |   20 ++--
  1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c 
b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 60b66ab..8434355 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c

[...]

@@ -712,14 +702,22 @@ static int mtk_start_xmit(struct sk_buff *skb, struct 
net_device *dev)
struct mtk_eth *eth = mac->hw;
struct mtk_tx_ring *ring = ð->tx_ring;
struct net_device_stats *stats = &dev->stats;
+   unsigned long flags;
bool gso = false;
int tx_num;

+   /* normally we can rely on the stack not calling this more than once,
+* however we have 2 queues running ont he same ring so we need to lock


   s/ont he/ on the/, perhaps a good chance to fix the comment?


+* the ring access
+*/
+   spin_lock_irqsave(ð->page_lock, flags);
+
tx_num = mtk_cal_txd_req(skb);
if (unlikely(atomic_read(&ring->free_count) <= tx_num)) {
mtk_stop_queue(eth);
netif_err(eth, tx_queued, dev,
  "Tx Ring full when queue awake!\n");
+   spin_unlock_irqrestore(ð->page_lock, flags);
return NETDEV_TX_BUSY;
}


[...]

MBR, Sergei

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.

David Miller  wrote:

> Yeah this custom stuff is really inappropriate given the excellent
> infrastructure we have these days...

Excellent only if I can get at it to find out why the something went wrong.
If it's lost because the machine panics, then it is worthless.

David

Re: [PATCH 2/9] rxrpc: Disable a debugging statement that has been left enabled.