Re: [bpf-next RFC 2/3] flow_dissector: implements eBPF parser

2018-08-18 Thread Tom Herbert
  FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
> +   FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs 
> */
> +   FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
> +   FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
> +   FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
> +   FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags 
> */
> +   FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
> +   FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
> +   FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
> +   FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct 
> flow_dissector_key_ipv4_addrs */
> +   FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct 
> flow_dissector_key_ipv6_addrs */
> +   FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control 
> */
> +   FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
> +   FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
> +   FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
> +   FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
> +   FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */
> +
> +   FLOW_DISSECTOR_KEY_MAX,
> +};
> +
> +struct flow_dissector_key_control {
> +   __u16   thoff;
> +   __u16   addr_type;
> +   __u32   flags;
> +};
> +
> +#define FLOW_DIS_IS_FRAGMENT   (1 << 0)
> +#define FLOW_DIS_FIRST_FRAG(1 << 1)
> +#define FLOW_DIS_ENCAPSULATION (1 << 2)
> +
> +struct flow_dissector_key_basic {
> +   __be16  n_proto;
> +   __u8ip_proto;
> +   __u8padding;
> +};
> +
> +struct flow_dissector_key_ipv4_addrs {
> +   __be32 src;
> +   __be32 dst;
> +};
> +
> +struct flow_dissector_key_ipv6_addrs {
> +   struct in6_addr src;
> +   struct in6_addr dst;
> +};
> +
> +struct flow_dissector_key_addrs {
> +   union {
> +   struct flow_dissector_key_ipv4_addrs v4addrs;
> +   struct flow_dissector_key_ipv6_addrs v6addrs;
> +   };
> +};
> +
> +struct flow_dissector_key_ports {
> +   union {
> +   __be32 ports;
> +   struct {
> +   __be16 src;
> +   __be16 dst;
> +   };
> +   };
> +};
> +
> +struct bpf_map_def SEC("maps") jmp_table = {
> +   .type = BPF_MAP_TYPE_PROG_ARRAY,
> +   .key_size = sizeof(__u32),
> +   .value_size = sizeof(__u32),
> +   .max_entries = 8
> +};
> +
> +struct bpf_dissect_cb {
> +   __u16 nhoff;
> +   __u16 flags;
> +};
> +
> +/* Dispatches on ETHERTYPE */
> +static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 
> proto)
> +{
> +   switch (proto) {
> +   case bpf_htons(ETH_P_IP):
> +   bpf_tail_call(skb, &jmp_table, IP);
> +   break;
> +   case bpf_htons(ETH_P_IPV6):
> +   bpf_tail_call(skb, &jmp_table, IPV6);
> +   break;
> +   case bpf_htons(ETH_P_MPLS_MC):
> +   case bpf_htons(ETH_P_MPLS_UC):
> +   bpf_tail_call(skb, &jmp_table, MPLS);
> +   break;
> +   case bpf_htons(ETH_P_8021Q):
> +   case bpf_htons(ETH_P_8021AD):
> +   bpf_tail_call(skb, &jmp_table, VLAN);
> +   break;
> +   default:
> +   /* Protocol not supported */
> +   return BPF_DROP;
> +   }
> +
> +   return BPF_DROP;
> +}
> +
> +static __always_inline int write_ports(struct __sk_buff *skb, __u8 proto)
> +{
> +   struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +   struct flow_dissector_key_ports ports;
> +
> +   /* The supported protocols always start with the ports */
> +   if (bpf_skb_load_bytes(skb, cb->nhoff, &ports, sizeof(ports)))
> +   return BPF_DROP;
> +
> +   if (proto == IPPROTO_UDP && ports.dst == bpf_htons(GUE_PORT)) {
> +   /* GUE encapsulation */
> +   cb->nhoff += sizeof(struct udphdr);
> +   bpf_tail_call(skb, &jmp_table, GUE);
> +       return BPF_DROP;

It's a nice sentiment to support GUE, but this really isn't the right
way to do it. What would be much better is a means to generically
support all the various UDP encapsulations like GUE, VXLAN, Geneve,
GRE/UDP, MPLS/UDP, etc. I think there's two ways to do that:

1) A UDP socket lookup that returns an encapsulation socket containing
a flow dissector function that can be called. This is the 

Re: C45 support and mdiobus_scan

2018-08-10 Thread Tom Lendacky
On 8/9/2018 10:25 AM, Andrew Lunn wrote:
>>> The PCIe core will look in the device tree and when it creates the
>>> platform device for the i210 on the pcie bus, it points
>>> pdev->dev.of_node at this node. So long as you are using a platform
>>> with DT, you can do this. I hope you are not using x86..
>>
>> Yes I am :( Any possible solution for this?

I haven't looked too closely, but maybe you can add a new mdiobus_scan
function for 10G that attempts get_phy_device() with is_c45 set to true
and if nothing is found falls back to get_phy_device() with is_c45 set to
false.  I don't know what would happen if you have a non-c45 phy attached,
but it's worth a shot to try it and see for each situation.

Thanks,
Tom

> 
> Well, DT can be used with x86. I think Edison did that. But i assume
> your PCIe host is in ACPI, not DT. So getting this linking working
> will not be easy.
> 
> There has been some work to add an ACPI binding for PHYs. I don't know
> if it actually got far enough that you can hack your DSDT to add a
> PHY. But i'm sure it did not get far enough that you can describe an
> MDIO bus in DSDT, so it probably is not going to help you.
> 
>> I guess in ultimate case I will have to switch to ARM based setup.
> 
> Yes, or MIPS.
> 
>  Andrew
> 


Re: KCM - recvmsg() mangles packets?

2018-08-09 Thread Tom Herbert
On Sun, Aug 5, 2018 at 4:39 PM, Dominique Martinet
 wrote:
> Dominique Martinet wrote on Sun, Aug 05, 2018:
>> It's getting late but I'll try adding a pskb_pull in there tomorrow, it
>> would be better to make the bpf program start with an offset but I don't
>> think that'll be easy to change...
>
> I can confirm the following patch fixes the issue for me:
> -8<-
> diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
> index 625acb27efcc..348ff5945591 100644
> --- a/net/strparser/strparser.c
> +++ b/net/strparser/strparser.c
> @@ -222,6 +222,16 @@ static int __strp_recv(read_descriptor_t *desc, struct 
> sk_buff *orig_skb,
> if (!stm->strp.full_len) {
> ssize_t len;
>
> +   /* Can only parse if there is no offset */
> +   if (unlikely(stm->strp.offset)) {
> +   if (!pskb_pull(skb, stm->strp.offset)) {
> +   STRP_STATS_INCR(strp->stats.mem_fail);
> +   strp_parser_err(strp, -ENOMEM, desc);
> +   break;
> +   }
> +   stm->strp.offset = 0;
> +   }
> +

Seems okay to me for a fix. Looks like strp.offset is only set in one
place and read in one place. With this pull maybe that just can go
away?

Tom


> len = (*strp->cb.parse_msg)(strp, head);
>
> if (!len) {
> 8<--
>
> Now, I was looking at other users of strparser (I see sockmap, kcm and
> tls) and it looks like sockmap does not handle offsets either but tls
> does by using skb_copy_bits -- they're copying the tls header to a
> buffer on the stack.
>
> kcm cannot do that because we do not know how much data the user expects
> to read, and I'm not comfortable doing pskb_pull in the kcm callback
> either, but the cost of this pull is probably non-negligible if some
> user can make do without it...
>
> On the other hand, I do not see how to make the bpf program handle an
> offset in the skb as that offset is strparser-specific.
>
> Maybe add a flag in the cb that specifies whether the callback allows
> non-zero offset?
>
>
> I'll let you see if you can reproduce this and will wait for advices on
> how to solve this properly so we can work on a proper fix.
>
>
> Thanks,
> --
> Dominique


Re: KCM - recvmsg() mangles packets?

2018-08-03 Thread Tom Herbert
On Fri, Aug 3, 2018 at 4:20 PM, Dominique Martinet
 wrote:
> Tom Herbert wrote on Fri, Aug 03, 2018:
>> struct my_proto {
>>struct _hdr {
>>uint32_t len;
>> } hdr;
>> char data[32];
>> } __attribute__((packed));
>>
>> // use htons to use LE header size, since load_half does a first conversion
>> // from network byte order
>> const char *bpf_prog_string = " \
>> ssize_t bpf_prog1(struct __sk_buff *skb) \
>> { \
>> return bpf_htons(load_half(skb, 0)) + 4; \
>> }";
>>
>> The length in hdr is uint32_t above, but this looks like it's being
>> read as a short.
>
> Err, I agree this is obviously wrong here (I can blame my lack of
> attention to this and the example I used), but this isn't the problem as
> the actual size is between 0 and 32 -- I could use any size I want here
> and the result would be the same.
>
> A "real" problem with the conversion program would mean that my example
> would not work if I slow it down, but I can send as many packet as I
> want if I uncomment the usleep() on the client side or if I just
> throttle the network stack with a loud tcpdump writing to stdout -- that
> means the algorithm is working even if it's making some badly-sized
> conversions.
>
> (Just to make sure I did fix it to htonl(load_word()) and I can confirm
> there is no difference)
>

You also need to htonl for

my_msg.hdr.len = (i++ * 1312739ULL) % 31 + 1;


>
> Thanks,
> --
> Dominique Martinet


Re: KCM - recvmsg() mangles packets?

2018-08-03 Thread Tom Herbert
struct my_proto {
   struct _hdr {
   uint32_t len;
} hdr;
char data[32];
} __attribute__((packed));

// use htons to use LE header size, since load_half does a first conversion
// from network byte order
const char *bpf_prog_string = " \
ssize_t bpf_prog1(struct __sk_buff *skb) \
{ \
return bpf_htons(load_half(skb, 0)) + 4; \
}";


On Fri, Aug 3, 2018 at 11:28 AM, Dominique Martinet
 wrote:
> I've been playing with KCM on a 4.18.0-rc7 kernel and I'm running in a
> problem where the iovec filled by recvmsg() is mangled up: it is filled
> by the length of one packet, but contains (truncated) data from another
> packet, rendering KCM unuseable.
>
> (I haven't tried old kernels to see for how long this is broken/try to
> bisect; I might if there's no progress but this might be simpler than I
> think)
>
>
> I've attached a reproducer, a simple program that forks, creates a tcp
> server/client, attach the server socket to a kcm socket, and in an
> infinite loop sends varying-length messages from the client to the
> server.
> The loop stops when the server gets a message which length is not the
> length indicated in the packet header, rather fast (I can make it run
> for a while if I slow down emission, or if I run a verbose tcpdump for
> example)
>
>From the reproducer:

struct my_proto {
   struct _hdr {
   uint32_t len;
} hdr;
char data[32];
} __attribute__((packed));

// use htons to use LE header size, since load_half does a first conversion
// from network byte order
const char *bpf_prog_string = " \
ssize_t bpf_prog1(struct __sk_buff *skb) \
{ \
return bpf_htons(load_half(skb, 0)) + 4; \
}";

The length in hdr is uint32_t above, but this looks like it's being
read as a short.

Tom

> In the quiet version on a VM on my laptop, I get this output:
> [root@f2 ~]# gcc -g -l bcc -o kcm kcm.c
> [root@f2 ~]# ./kcm
> client is starting
> server is starting
> server is receiving data
> Got 14, expected 27 on 1th message: 22; flags: 80
>
> The client sends messages deterministically, first one is 14 bytes filled
> with 1, second one is 27 bytes filled with 2, third one is 9 bytes
> filled with 3 etc (final digit is actually a \0 instead)
>
> As we can see, the server received 14 '2', and the header size matches
> the second message header, so something went wrong™.
> Flags 0x80 is MSG_EOR meaning recvmsg copied the full message.
>
>
>
> This happens even if I reduce the VMs CPU to 1, so I was thinking some
> irq messes with the sock between skb_peek and the actual copy of the
data (as this does work if I send slowly!), but even disabling
> irq/preempt doesn't seem to help so I'm not sure what to try next.
>
> Any idea?
>
>
> Thanks,
> --
> Dominique Martinet


[PATCH net-next 1/4] ila: Fix use of rhashtable walk in ila_xlat.c

2018-06-27 Thread Tom Herbert
Perform better EAGAIN handling, handle case where ila_dump_info
fails and we missed objects in the dump, and add a skip index
to skip over ila entries in a list on a rhashtable node that have
already been visited (by a previous call to ila_nl_dump).

Signed-off-by: Tom Herbert 
---
 net/ipv6/ila/ila_xlat.c | 70 ++---
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 10ae135..40f3f64 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -475,24 +475,31 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, 
struct genl_info *info)
 
 struct ila_dump_iter {
struct rhashtable_iter rhiter;
+   int skip;
 };
 
 static int ila_nl_dump_start(struct netlink_callback *cb)
 {
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
-   struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+   struct ila_dump_iter *iter;
+   int ret;
 
-   if (!iter) {
-   iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-   if (!iter)
-   return -ENOMEM;
+   iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+   if (!iter)
+   return -ENOMEM;
 
-   cb->args[0] = (long)iter;
> +   ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+  GFP_KERNEL);
+   if (ret) {
+   kfree(iter);
+   return ret;
}
 
> -   return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
-   GFP_KERNEL);
+   iter->skip = 0;
+   cb->args[0] = (long)iter;
+
+   return ret;
 }
 
 static int ila_nl_dump_done(struct netlink_callback *cb)
@@ -510,20 +517,45 @@ static int ila_nl_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
 {
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
struct rhashtable_iter *rhiter = &iter->rhiter;
+   int skip = iter->skip;
struct ila_map *ila;
int ret;
 
rhashtable_walk_start(rhiter);
 
-   for (;;) {
-   ila = rhashtable_walk_next(rhiter);
+   /* Get first entry */
+   ila = rhashtable_walk_peek(rhiter);
+
+   if (ila && !IS_ERR(ila) && skip) {
+   /* Skip over visited entries */
+
+   while (ila && skip) {
+   /* Skip over any ila entries in this list that we
+* have already dumped.
+*/
+   ila = rcu_access_pointer(ila->next);
+   skip--;
+   }
+   }
 
+   skip = 0;
+
+   for (;;) {
if (IS_ERR(ila)) {
-   if (PTR_ERR(ila) == -EAGAIN)
-   continue;
ret = PTR_ERR(ila);
-   goto done;
+   if (ret == -EAGAIN) {
+   /* Table has changed and iter has reset. Return
+* -EAGAIN to the application even if we have
+* written data to the skb. The application
+* needs to deal with this.
+*/
+
+   goto out_ret;
+   } else {
+   break;
+   }
} else if (!ila) {
+   ret = 0;
break;
}
 
@@ -532,15 +564,21 @@ static int ila_nl_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
 cb->nlh->nlmsg_seq, NLM_F_MULTI,
 skb, ILA_CMD_GET);
if (ret)
-   goto done;
+   goto out;
 
+   skip++;
ila = rcu_access_pointer(ila->next);
}
+
+   skip = 0;
+   ila = rhashtable_walk_next(rhiter);
}
 
-   ret = skb->len;
+out:
+   iter->skip = skip;
+   ret = (skb->len ? : ret);
 
-done:
+out_ret:
rhashtable_walk_stop(rhiter);
return ret;
 }
-- 
2.7.4



[PATCH net-next 4/4] ila: Flush netlink command to clear xlat table

2018-06-27 Thread Tom Herbert
Add ILA_CMD_FLUSH netlink command to clear the ILA translation table.

Signed-off-by: Tom Herbert 
---
 include/uapi/linux/ila.h |  1 +
 net/ipv6/ila/ila.h   |  1 +
 net/ipv6/ila/ila_main.c  |  6 +
 net/ipv6/ila/ila_xlat.c  | 62 ++--
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 483b77af..db45d3e 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -30,6 +30,7 @@ enum {
ILA_CMD_ADD,
ILA_CMD_DEL,
ILA_CMD_GET,
+   ILA_CMD_FLUSH,
 
__ILA_CMD_MAX,
 };
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index faba782..1f747bc 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -123,6 +123,7 @@ void ila_xlat_exit_net(struct net *net);
 int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_dump_start(struct netlink_callback *cb);
 int ila_xlat_nl_dump_done(struct netlink_callback *cb);
 int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
index f6ac6b1..18fac76 100644
--- a/net/ipv6/ila/ila_main.c
+++ b/net/ipv6/ila/ila_main.c
@@ -27,6 +27,12 @@ static const struct genl_ops ila_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
},
{
+   .cmd = ILA_CMD_FLUSH,
+   .doit = ila_xlat_nl_cmd_flush,
+   .policy = ila_nl_policy,
+   .flags = GENL_ADMIN_PERM,
+   },
+   {
.cmd = ILA_CMD_GET,
.doit = ila_xlat_nl_cmd_get_mapping,
.start = ila_xlat_nl_dump_start,
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index d05de89..51a15ce 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -164,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
kfree_rcu(ila, rcu);
 }
 
-static void ila_free_cb(void *ptr, void *arg)
+static void ila_free_node(struct ila_map *ila)
 {
-   struct ila_map *ila = (struct ila_map *)ptr, *next;
+   struct ila_map *next;
 
/* Assume rcu_readlock held */
while (ila) {
@@ -176,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
}
 }
 
+static void ila_free_cb(void *ptr, void *arg)
+{
+   ila_free_node((struct ila_map *)ptr);
+}
+
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
 
 static unsigned int
@@ -365,6 +370,59 @@ int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, 
struct genl_info *info)
return 0;
 }
 
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+   struct ila_map *ila)
+{
+   return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+   struct net *net = genl_info_net(info);
+   struct ila_net *ilan = net_generic(net, ila_net_id);
+   struct rhashtable_iter iter;
+   struct ila_map *ila;
+   spinlock_t *lock;
+   int ret;
+
> +   ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
+   if (ret)
+   goto done;
+
> +   rhashtable_walk_start(&iter);
+
+   for (;;) {
> +   ila = rhashtable_walk_next(&iter);
+
+   if (IS_ERR(ila)) {
+   if (PTR_ERR(ila) == -EAGAIN)
+   continue;
+   ret = PTR_ERR(ila);
+   goto done;
+   } else if (!ila) {
+   break;
+   }
+
+   lock = lock_from_ila_map(ilan, ila);
+
+   spin_lock(lock);
+
> +   ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
> +&ila->node, rht_params);
+   if (!ret)
+   ila_free_node(ila);
+
+   spin_unlock(lock);
+
+   if (ret)
+   break;
+   }
+
+done:
> +   rhashtable_walk_stop(&iter);
+   return ret;
+}
+
 static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
 {
if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
-- 
2.7.4



[PATCH net-next 3/4] ila: Create main ila source file

2018-06-27 Thread Tom Herbert
Create a main ila file that contains the module initialization functions
as well as netlink definitions. Previously these were defined in
ila_xlat and ila_common. This approach allows better extensibility.

Signed-off-by: Tom Herbert 
---
 net/ipv6/ila/Makefile |   2 +-
 net/ipv6/ila/ila.h|  26 -
 net/ipv6/ila/ila_common.c |  30 --
 net/ipv6/ila/ila_main.c   | 115 +
 net/ipv6/ila/ila_xlat.c   | 142 +-
 5 files changed, 168 insertions(+), 147 deletions(-)
 create mode 100644 net/ipv6/ila/ila_main.c

diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e59..b7739ab 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_IPV6_ILA) += ila.o
 
-ila-objs := ila_common.o ila_lwt.o ila_xlat.o
+ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 3c7a11b..faba782 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -104,9 +105,30 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct 
ila_params *p,
 
 void ila_init_saved_csum(struct ila_params *p);
 
+struct ila_net {
+   struct {
+   struct rhashtable rhash_table;
+   spinlock_t *locks; /* Bucket locks for entry manipulation */
+   unsigned int locks_mask;
+   bool hooks_registered;
+   } xlat;
+};
+
 int ila_lwt_init(void);
 void ila_lwt_fini(void);
-int ila_xlat_init(void);
-void ila_xlat_fini(void);
+
+int ila_xlat_init_net(struct net *net);
+void ila_xlat_exit_net(struct net *net);
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_dump_start(struct netlink_callback *cb);
+int ila_xlat_nl_dump_done(struct netlink_callback *cb);
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern unsigned int ila_net_id;
+
+extern struct genl_family ila_nl_family;
 
 #endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 8c88ecf..5793104 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -154,33 +154,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct 
ila_params *p,
iaddr->loc = p->locator;
 }
 
-static int __init ila_init(void)
-{
-   int ret;
-
-   ret = ila_lwt_init();
-
-   if (ret)
-   goto fail_lwt;
-
-   ret = ila_xlat_init();
-   if (ret)
-   goto fail_xlat;
-
-   return 0;
-fail_xlat:
-   ila_lwt_fini();
-fail_lwt:
-   return ret;
-}
-
-static void __exit ila_fini(void)
-{
-   ila_xlat_fini();
-   ila_lwt_fini();
-}
-
-module_init(ila_init);
-module_exit(ila_fini);
-MODULE_AUTHOR("Tom Herbert ");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000..f6ac6b1
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include "ila.h"
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+   [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+   [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+   [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+   [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+   [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+};
+
+static const struct genl_ops ila_nl_ops[] = {
+   {
+   .cmd = ILA_CMD_ADD,
+   .doit = ila_xlat_nl_cmd_add_mapping,
+   .policy = ila_nl_policy,
+   .flags = GENL_ADMIN_PERM,
+   },
+   {
+   .cmd = ILA_CMD_DEL,
+   .doit = ila_xlat_nl_cmd_del_mapping,
+   .policy = ila_nl_policy,
+   .flags = GENL_ADMIN_PERM,
+   },
+   {
+   .cmd = ILA_CMD_GET,
+   .doit = ila_xlat_nl_cmd_get_mapping,
+   .start = ila_xlat_nl_dump_start,
+   .dumpit = ila_xlat_nl_dump,
+   .done = ila_xlat_nl_dump_done,
+   .policy = ila_nl_policy,
+   },
+};
+
+unsigned int ila_net_id;
+
+struct genl_family ila_nl_family __ro_after_init = {
+   .hdrsize= 0,
+   .name   = ILA_GENL_NAME,
+   .version= ILA_GENL_VERSION,
+   .maxattr= ILA_ATTR_MAX,
+   .netnsok= true,
+   .parallel_ops   = true,
+   .module = THIS_MODULE,
+   .ops= ila_nl_ops,
+   .n_ops  = ARRAY_SIZE(ila_nl_ops),
+};
+
+static __net_init int ila_init_net(struct net *net)
+{
+   int err;
+
+   err = ila_xlat_init_net(net);
+

[PATCH net-next 2/4] ila: Call library function alloc_bucket_locks

2018-06-27 Thread Tom Herbert
To allocate the array of bucket locks for the hash table we now
call library function alloc_bucket_spinlocks.

Signed-off-by: Tom Herbert 
---
 net/ipv6/ila/ila_xlat.c | 23 +--
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 40f3f64..9cc8bee 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -31,27 +31,14 @@ struct ila_net {
bool hooks_registered;
 };
 
+#define MAX_LOCKS 1024
 #defineLOCKS_PER_CPU 10
 
 static int alloc_ila_locks(struct ila_net *ilan)
 {
-   unsigned int i, size;
-   unsigned int nr_pcpus = num_possible_cpus();
-
-   nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
-   size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
-
-   if (sizeof(spinlock_t) != 0) {
-   ilan->locks = kvmalloc_array(size, sizeof(spinlock_t),
-GFP_KERNEL);
-   if (!ilan->locks)
-   return -ENOMEM;
-   for (i = 0; i < size; i++)
> -   spin_lock_init(&ilan->locks[i]);
-   }
-   ilan->locks_mask = size - 1;
-
-   return 0;
> +   return alloc_bucket_spinlocks(&ilan->locks, &ilan->locks_mask,
+ MAX_LOCKS, LOCKS_PER_CPU,
+ GFP_KERNEL);
 }
 
 static u32 hashrnd __read_mostly;
@@ -640,7 +627,7 @@ static __net_exit void ila_exit_net(struct net *net)
 
rhashtable_free_and_destroy(>rhash_table, ila_free_cb, NULL);
 
-   kvfree(ilan->locks);
+   free_bucket_spinlocks(ilan->locks);
 
if (ilan->hooks_registered)
nf_unregister_net_hooks(net, ila_nf_hook_ops,
-- 
2.7.4



[PATCH net-next 0/4] ila: Cleanup

2018-06-27 Thread Tom Herbert
Perform some cleanup in ILA code. This includes:

- Fix rhashtable walk for cases where nl dumps are done with multiple
  function calls. Add a skip index to skip over entries in
  a node that have been previously visited. Call rhashtable_walk_peek
  to avoid dropping items between calls to ila_nl_dump.
- Call alloc_bucket_spinlocks to create bucket locks.
- Split out module initialization and netlink definitions into
  separate files.
- Add ILA_CMD_FLUSH netlink command to clear the ILA translation table.


Tom Herbert (4):
  ila: Fix use of rhashtable walk in ila_xlat.c
  ila: Call library function alloc_bucket_locks
  ila: Create main ila source file
  ila: Flush netlink command to clear xlat table

 include/uapi/linux/ila.h  |   1 +
 net/ipv6/ila/Makefile |   2 +-
 net/ipv6/ila/ila.h|  27 -
 net/ipv6/ila/ila_common.c |  30 -
 net/ipv6/ila/ila_main.c   | 121 +++
 net/ipv6/ila/ila_xlat.c   | 291 +++---
 6 files changed, 292 insertions(+), 180 deletions(-)
 create mode 100644 net/ipv6/ila/ila_main.c

-- 
2.7.4



Re: [net-next PATCH v4 7/7] Documentation: Add explanation for XPS using Rx-queue(s) map

2018-06-26 Thread Tom Herbert
On Mon, Jun 25, 2018 at 11:04 AM, Amritha Nambiar
 wrote:
> Signed-off-by: Amritha Nambiar 

Acked-by: Tom Herbert 

> ---
>  Documentation/ABI/testing/sysfs-class-net-queues |   11 
>  Documentation/networking/scaling.txt |   57 
> ++
>  2 files changed, 58 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-class-net-queues 
> b/Documentation/ABI/testing/sysfs-class-net-queues
> index 0c0df91..978b763 100644
> --- a/Documentation/ABI/testing/sysfs-class-net-queues
> +++ b/Documentation/ABI/testing/sysfs-class-net-queues
> @@ -42,6 +42,17 @@ Description:
> network device transmit queue. Possible values depend on the
> number of available CPU(s) in the system.
>
> +What:  /sys/class//queues/tx-/xps_rxqs
> +Date:  June 2018
> +KernelVersion: 4.18.0
> +Contact:   netdev@vger.kernel.org
> +Description:
> +   Mask of the receive queue(s) currently enabled to participate
> +   into the Transmit Packet Steering packet processing flow for 
> this
> +   network device transmit queue. Possible values depend on the
> +   number of available receive queue(s) in the network device.
> +   Default is disabled.
> +
>  What:  
> /sys/class//queues/tx-/byte_queue_limits/hold_time
>  Date:  November 2011
>  KernelVersion: 3.3
> diff --git a/Documentation/networking/scaling.txt 
> b/Documentation/networking/scaling.txt
> index f55639d..8336116 100644
> --- a/Documentation/networking/scaling.txt
> +++ b/Documentation/networking/scaling.txt
> @@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
>
>  Transmit Packet Steering is a mechanism for intelligently selecting
>  which transmit queue to use when transmitting a packet on a multi-queue
> -device. To accomplish this, a mapping from CPU to hardware queue(s) is
> -recorded. The goal of this mapping is usually to assign queues
> +device. This can be accomplished by recording two kinds of maps, either
> +a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
> +to hardware transmit queue(s).
> +
> +1. XPS using CPUs map
> +
> +The goal of this mapping is usually to assign queues
>  exclusively to a subset of CPUs, where the transmit completions for
>  these queues are processed on a CPU within this set. This choice
>  provides two benefits. First, contention on the device queue lock is
> @@ -377,12 +382,35 @@ transmit queue). Secondly, cache miss rate on transmit 
> completion is
>  reduced, in particular for data cache lines that hold the sk_buff
>  structures.
>
> -XPS is configured per transmit queue by setting a bitmap of CPUs that
> -may use that queue to transmit. The reverse mapping, from CPUs to
> -transmit queues, is computed and maintained for each network device.
> -When transmitting the first packet in a flow, the function
> -get_xps_queue() is called to select a queue. This function uses the ID
> -of the running CPU as a key into the CPU-to-queue lookup table. If the
> +2. XPS using receive queues map
> +
> +This mapping is used to pick transmit queue based on the receive
> +queue(s) map configuration set by the administrator. A set of receive
> +queues can be mapped to a set of transmit queues (many:many), although
> +the common use case is a 1:1 mapping. This will enable sending packets
> +on the same queue associations for transmit and receive. This is useful for
> +busy polling multi-threaded workloads where there are challenges in
> +associating a given CPU to a given application thread. The application
> +threads are not pinned to CPUs and each thread handles packets
> +received on a single queue. The receive queue number is cached in the
> +socket for the connection. In this model, sending the packets on the same
> +transmit queue corresponding to the associated receive queue has benefits
> +in keeping the CPU overhead low. Transmit completion work is locked into
> +the same queue-association that a given application is polling on. This
> +avoids the overhead of triggering an interrupt on another CPU. When the
> +application cleans up the packets during the busy poll, transmit completion
> +may be processed along with it in the same thread context and so result in
> +reduced latency.
> +
> +XPS is configured per transmit queue by setting a bitmap of
> +CPUs/receive-queues that may use that queue to transmit. The reverse
> +mapping, from CPUs to transmit queues or from receive-queues to transmit
> +queues, is computed and maintained for each network device. When
> +transmitting the first packet in a flow, the function get_xps_queue() is
> +called to select a queue. This function uses the ID

Re: [net-next PATCH v4 2/7] net: Use static_key for XPS maps

2018-06-26 Thread Tom Herbert
On Mon, Jun 25, 2018 at 11:04 AM, Amritha Nambiar
 wrote:
> Use static_key for XPS maps to reduce the cost of extra map checks,
> similar to how it is used for RPS and RFS. This includes static_key
> 'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using
> Rx queues map.
>

Acked-by: Tom Herbert 

> Signed-off-by: Amritha Nambiar 
> ---
>  net/core/dev.c |   26 --
>  1 file changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2552556..df2a78d 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
> int txq)
>  EXPORT_SYMBOL(netdev_txq_to_tc);
>
>  #ifdef CONFIG_XPS
> +struct static_key xps_needed __read_mostly;
> +EXPORT_SYMBOL(xps_needed);
> +struct static_key xps_rxqs_needed __read_mostly;
> +EXPORT_SYMBOL(xps_rxqs_needed);
>  static DEFINE_MUTEX(xps_map_mutex);
>  #define xmap_dereference(P)\
> rcu_dereference_protected((P), lockdep_is_held(_map_mutex))
> @@ -2170,12 +2174,14 @@ static void netif_reset_xps_queues(struct net_device 
> *dev, u16 offset,
>
> mutex_lock(_map_mutex);
>
> -   dev_maps = xmap_dereference(dev->xps_rxqs_map);
> -   if (dev_maps) {
> -   nr_ids = dev->num_rx_queues;
> -   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
> -  count, true);
> -
> +   if (static_key_false(_rxqs_needed)) {
> +   dev_maps = xmap_dereference(dev->xps_rxqs_map);
> +   if (dev_maps) {
> +   nr_ids = dev->num_rx_queues;
> +   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
> +  offset, count, true);
> +   }
> +   static_key_slow_dec(_rxqs_needed);
> }
>
> dev_maps = xmap_dereference(dev->xps_cpus_map);
> @@ -2189,6 +2195,7 @@ static void netif_reset_xps_queues(struct net_device 
> *dev, u16 offset,
>false);
>
>  out_no_maps:
> +   static_key_slow_dec(_needed);
> mutex_unlock(_map_mutex);
>  }
>
> @@ -2297,6 +2304,10 @@ int __netif_set_xps_queue(struct net_device *dev, 
> const unsigned long *mask,
> if (!new_dev_maps)
> goto out_no_new_maps;
>
> +   static_key_slow_inc(_needed);
> +   if (is_rxqs_map)
> +   static_key_slow_inc(_rxqs_needed);
> +
> for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>  j < nr_ids;) {
> /* copy maps belonging to foreign traffic classes */
> @@ -3450,6 +3461,9 @@ static inline int get_xps_queue(struct net_device *dev, 
> struct sk_buff *skb)
> struct xps_map *map;
> int queue_index = -1;
>
> +   if (!static_key_false(_needed))
> +   return -1;
> +
> rcu_read_lock();
> dev_maps = rcu_dereference(dev->xps_cpus_map);
> if (dev_maps) {
>


Re: [net-next PATCH v4 1/7] net: Refactor XPS for CPUs and Rx queues

2018-06-26 Thread Tom Herbert
>xps_cpus_map, new_dev_maps);
>
> /* Cleanup old maps */
> if (!dev_maps)
> goto out_no_old_maps;
>
> -   for_each_possible_cpu(cpu) {
> -   for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
> -   new_map = 
> xmap_dereference(new_dev_maps->cpu_map[tci]);
> -   map = xmap_dereference(dev_maps->cpu_map[tci]);
> +   for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +j < nr_ids;) {
> +   for (i = num_tc, tci = j * num_tc; i--; tci++) {
> +   new_map = 
> xmap_dereference(new_dev_maps->attr_map[tci]);
> +   map = xmap_dereference(dev_maps->attr_map[tci]);
> if (map && map != new_map)
> kfree_rcu(map, rcu);
> }
> @@ -2317,19 +2370,23 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
> active = true;
>
>  out_no_new_maps:
> -   /* update Tx queue numa node */
> -   netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
> -(numa_node_id >= 0) ? numa_node_id :
> -NUMA_NO_NODE);
> +   if (!is_rxqs_map) {
> +   /* update Tx queue numa node */
> +   netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
> +(numa_node_id >= 0) ?
> +numa_node_id : NUMA_NO_NODE);
> +   }
>
> if (!dev_maps)
> goto out_no_maps;
>
> -   /* removes queue from unused CPUs */
> -   for_each_possible_cpu(cpu) {
> -   for (i = tc, tci = cpu * num_tc; i--; tci++)
> +   /* removes tx-queue from unused CPUs/rx-queues */
> +   for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +j < nr_ids;) {
> +   for (i = tc, tci = j * num_tc; i--; tci++)
> active |= remove_xps_queue(dev_maps, tci, index);
> -   if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
> +   if (!attr_test_mask(j, mask, nr_ids) ||
> +   !attr_test_online(j, online_mask, nr_ids))
> active |= remove_xps_queue(dev_maps, tci, index);
> for (i = num_tc - tc, tci++; --i; tci++)
> active |= remove_xps_queue(dev_maps, tci, index);
> @@ -2337,7 +2394,10 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
>
> /* free map if not active */
> if (!active) {
> -   RCU_INIT_POINTER(dev->xps_maps, NULL);
> +   if (is_rxqs_map)
> +   RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
> +   else
> +   RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
> kfree_rcu(dev_maps, rcu);
> }
>
> @@ -2347,11 +2407,12 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
> return 0;
>  error:
> /* remove any maps that we added */
> -   for_each_possible_cpu(cpu) {
> -   for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
> -   new_map = 
> xmap_dereference(new_dev_maps->cpu_map[tci]);
> +   for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +j < nr_ids;) {
> +   for (i = num_tc, tci = j * num_tc; i--; tci++) {
> +   new_map = 
> xmap_dereference(new_dev_maps->attr_map[tci]);
> map = dev_maps ?
> - xmap_dereference(dev_maps->cpu_map[tci]) :
> + xmap_dereference(dev_maps->attr_map[tci]) :
>   NULL;
> if (new_map && new_map != map)
> kfree(new_map);
> @@ -2363,6 +2424,12 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
> kfree(new_dev_maps);
> return -ENOMEM;
>  }
> +
> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> +   u16 index)
> +{
> +   return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
> +}
>  EXPORT_SYMBOL(netif_set_xps_queue);
>
>  #endif
> @@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, 
> struct sk_buff *skb)
> int queue_index = -1;
>
> rcu_read_lock();
> -   dev_maps = rcu_dereference(dev->xps_maps);
> +   dev_maps = rcu_dereference(dev->xps_cpus_map);
> if (dev_maps) {
> unsigned int tci = skb->sender_cpu - 1;
>
> @@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, 
> struct sk_buff *skb)
> tci += netdev_get_prio_tc_map(dev, skb->priority);
> }
>
> -   map = rcu_dereference(dev_maps->cpu_map[tci]);
> +   map = rcu_dereference(dev_maps->attr_map[tci]);
> if (map) {
> if (map->len == 1)
> queue_index = map->queues[0];
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index bb7e80f..b39987c 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue 
> *queue,
> return -ENOMEM;
>
> rcu_read_lock();
> -   dev_maps = rcu_dereference(dev->xps_maps);
> +   dev_maps = rcu_dereference(dev->xps_cpus_map);
> if (dev_maps) {
> for_each_possible_cpu(cpu) {
> int i, tci = cpu * num_tc + tc;
> struct xps_map *map;
>
> -   map = rcu_dereference(dev_maps->cpu_map[tci]);
> +   map = rcu_dereference(dev_maps->attr_map[tci]);
> if (!map)
> continue;
>
>

Acked-by: Tom Herbert 


Re: [RFC PATCH v2 net-next 00/12] Handle multiple received packets at each stage

2018-06-26 Thread Tom Herbert
them very readable. I do think that XDP
related patches at the end of the set should be separated out.

I suspect the effects will vary a lot between architectures and
configurations, so I'm not too worried about the variance mentioned in
the performance numbers. For future work, it might also be worth it to
compare to techniques done in VPP.

Tom

>
> I also performed tests with Generic XDP enabled (using a simple map-based
>  UDP port drop program with no entries in the map), both with and without
>  the eBPF JIT enabled.
> No JIT:
> net-next: 3.52 Mb/s (datum)
>  after 9: 4.91 Mb/s (datum + 39.5%)
> after 12: 4.83 Mb/s (datum + 37.3%)
>
> With JIT:
> net-next: 5.23 Mb/s (datum)
>  after 9: 6.64 Mb/s (datum + 27.0%)
> after 12: 6.46 Mb/s (datum + 23.6%)
>
> Another test variation was the use of software filtering/firewall rules.
>  Adding a single iptables rule (a UDP port drop on a port range not
>  matching the test traffic), thus making the netfilter hook have work to
>  do, reduced baseline performance but showed a similar delta from the
>  patches.  Similarly, testing with a set of TC flower filters (kindly
>  supplied by Cong Wang) in the single-RXQ setup (that previously gave 4%)
>  slowed down the baseline but not the patched performance, giving a 5.7%
>  performance delta.  These data suggest that the batching approach
>  remains effective in the presence of software switching rules.
>
> Changes from v1 (see [3]):
> * Rebased across 2 years' net-next movement (surprisingly straightforward).
>   - Added Generic XDP handling to netif_receive_skb_list_internal()
>   - Dealt with changes to PFMEMALLOC setting APIs
> * General cleanup of code and comments.
> * Skipped function calls for empty lists at various points in the stack
>   (patch #9).
> * Added listified Generic XDP handling (patches 10-12), though it doesn't
>   seem to help (see above).
> * Extended testing to cover software firewalls / netfilter etc.
>
> [1] http://vger.kernel.org/netconf2018_files/DavidMiller_netconf2018.pdf
> [2] http://vger.kernel.org/netconf2018_files/EdwardCree_netconf2018.pdf
> [3] http://lists.openwall.net/netdev/2016/04/19/89
>
> Edward Cree (12):
>   net: core: trivial netif_receive_skb_list() entry point
>   sfc: batch up RX delivery
>   net: core: unwrap skb list receive slightly further
>   net: core: Another step of skb receive list processing
>   net: core: another layer of lists, around PF_MEMALLOC skb handling
>   net: core: propagate SKB lists through packet_type lookup
>   net: ipv4: listified version of ip_rcv
>   net: ipv4: listify ip_rcv_finish
>   net: don't bother calling list RX functions on empty lists
>   net: listify Generic XDP processing, part 1
>   net: listify Generic XDP processing, part 2
>   net: listify jited Generic XDP processing on x86_64
>
>  arch/x86/net/bpf_jit_comp.c   | 164 ++
>  drivers/net/ethernet/sfc/efx.c|  12 +
>  drivers/net/ethernet/sfc/net_driver.h |   3 +
>  drivers/net/ethernet/sfc/rx.c |   7 +-
>  include/linux/filter.h|  43 +++-
>  include/linux/netdevice.h |   4 +
>  include/linux/netfilter.h |  27 +++
>  include/linux/skbuff.h|  16 ++
>  include/net/ip.h  |   2 +
>  include/trace/events/net.h|  14 ++
>  kernel/bpf/core.c |  38 +++-
>  net/core/dev.c| 415 
> +-
>  net/core/filter.c |  10 +-
>  net/ipv4/af_inet.c|   1 +
>  net/ipv4/ip_input.c   | 129 ++-
>  15 files changed, 810 insertions(+), 75 deletions(-)
>


Re: 答复: ANNOUNCE: Enhanced IP v1.4

2018-06-04 Thread Tom Herbert
On Mon, Jun 4, 2018 at 6:02 AM, Eric Dumazet  wrote:
>
>
> On 06/03/2018 10:58 PM, PKU.孙斌 wrote:
>> On Sun, Jun 03, 2018 at 03:41:08PM -0700, Eric Dumazet wrote:
>>>
>>>
>>> On 06/03/2018 01:37 PM, Tom Herbert wrote:
>>>
>>>> This is not an inconsequential mechanism that is being proposed. It's
>>>> a modification to IP protocol that is intended to work on the
>>>> Internet, but it looks like the draft hasn't been updated for two
>>>> years and it is not adopted by any IETF working group. I don't see how
>>>> this can go anywhere without IETF support. Also, I suggest that you
>>>> look at the IPv10 proposal since that was very similar in intent. One
>>>> of the reasons that IPv10 shot down was because protocol transition
>>>> mechanisms were more interesting ten years ago than today. IPv6 has
>>>> good traction now. In fact, it's probably the case that it's now
>>>> easier to bring up IPv6 than to try to make IPv4 options work over the
>>>> Internet.
>>>
>>> +1
>>>
>>> Many hosts do not use IPv4 anymore.
>>>
>>> We even have the project making IPv4 support in linux optional.
>>
>> I guess then Linux kernel wouldn't be able to boot itself without IPv4 built 
>> in, e.g., when we only have old L2 links (without the IPv6 frame type)...
>
>
>
> *Optional* means that a CONFIG_IPV4 would be there, and some people could 
> build a kernel with CONFIG_IPV4=n,
>
> Like IPv6 is optional today.
>
> Of course, most distros will select CONFIG_IPV4=y  (as they probably select 
> CONFIG_IPV6=y today)
>
> Do not worry, IPv4 is not dead, but I doubt Enhanced IP v1.4 has any chance,
> it is at least 10 years too late.

There's also 
https://www.theregister.co.uk/2018/05/30/internet_engineers_united_nations_ipv6/.
We're reaching the point where it's the transition mechanisms that
are hampering IPv6 adoption.

Tom


Re: ANNOUNCE: Enhanced IP v1.4

2018-06-03 Thread Tom Herbert
On Sat, Jun 2, 2018 at 9:17 AM, Sam Patton  wrote:
> Hello Willy, netdev,
>
> Thank you for your reply and advice.  I couldn't agree more with you
> about containers and the exciting prospects there,
>
> as well as the ADSL scenario you mention.
>
> As far as application examples, check out this simple netcat-like
> program I use for testing:
>
> https://github.com/EnIP/enhancedip/blob/master/userspace/netcat/netcat.c
>
> Lines 61-67 show how to connect directly via an EnIP address.  The
> netcat-like application uses
>
> a header file called eip.h.  You can look at it here:
>
> https://github.com/EnIP/enhancedip/blob/master/userspace/include/eip.h
>
> EnIP makes use of IPv6  records for DNS lookup.  We simply put
> 2001:0101 (which is an IPv6 experimental prefix) and
>
> then we put the 64-bit EnIP address into the next 8 bytes of the
> address.  The remaining bytes are set to zero.
>
> In the kernel, if you want to see how we convert the IPv6 DNS lookup
> into something connect() can manage,
>
> check out the add_enhanced_ip() routine found here:
>
> https://github.com/EnIP/enhancedip/blob/master/kernel/4.9.28/socket.c
>
> The reason we had to do changes for openssh and not other applications
> (that use DNS) is openssh has a check to
>
> see if the socket is using IP options.  If the socket does, sshd drops
> the connection.  I had to work around that to get openssh working
>
> with EnIP.  The result: if you want to connect the netcat-like program
> with IP addresses you'll end up doing something like the
>
> example above.  If you're using DNS (getaddrinfo) to connect(), it
> should just work (except for sshd as noted).
>
> Here's the draft experimental RFC:
> https://tools.ietf.org/html/draft-chimiak-enhanced-ipv4-03
> I'll also note that I am doing this code part time as a hobby for a long
> time so I appreciate your help and support.  It would be really
>
> great if the kernel community decided to pick this up, but if it's not a
> reality please let me know soonest so I can move on to a
>
Hi Sam,

This is not an inconsequential mechanism that is being proposed. It's
a modification to IP protocol that is intended to work on the
Internet, but it looks like the draft hasn't been updated for two
years and it is not adopted by any IETF working group. I don't see how
this can go anywhere without IETF support. Also, I suggest that you
look at the IPv10 proposal since that was very similar in intent. One
of the reasons that IPv10 shot down was because protocol transition
mechanisms were more interesting ten years ago than today. IPv6 has
good traction now. In fact, it's probably the case that it's now
easier to bring up IPv6 than to try to make IPv4 options work over the
Internet.

Tom


> different hobby.  :)
>
> Thank you.
>
> Sam Patton
>
> On 6/2/18 1:57 AM, Willy Tarreau wrote:
>> Hello Sam,
>>
>> On Fri, Jun 01, 2018 at 09:48:28PM -0400, Sam Patton wrote:
>>> Hello!
>>>
>>> If you do not know what Enhanced IP is, read this post on netdev first:
>>>
>>> https://www.spinics.net/lists/netdev/msg327242.html
>>>
>>>
>>> The Enhanced IP project presents:
>>>
>>>  Enhanced IP v1.4
>>>
>>> The Enhanced IP (EnIP) code has been updated.  It now builds with OpenWRT 
>>> barrier breaker (for 148 different devices). We've been testing with the 
>>> Western Digital N600 and N750 wireless home routers.
>> (...) First note, please think about breaking your lines if you want your
>> mails to be read by the widest audience, as for some of us here, reading
>> lines wider than a terminal is really annoying, and often not considered
>> worth spending time on them considering there are so many easier ones
>> left to read.
>>
>>> Interested in seeing Enhanced IP in the Linux kernel, read on.  Not
>>> interested in seeing Enhanced IP in the Linux kernel read on.
>> (...)
>>
>> So I personally find the concept quite interesting. It reminds me of the
>> previous IPv5/IPv7/IPv8 initiatives, which in my opinion were a bit hopeless.
>> Here the fact that you decide to consider the IPv4 address as a network opens
>> new perspectives. For containerized environments it could be considered that
>> each server, with one IPv4, can host 2^32 guests and that NAT is not needed
>> anymore for example. It could also open the possibility that enthusiasts
>> can more easily host some services at home behind their ADSL line without
>> having to run on strange ports.
>>
>> However I think your approach is not the most efficient to encourage 
>> adoption.
>> It's important to 

[PATCH net-next v2 08/12] amd-xgbe: Add ethtool show/set channels support

2018-05-23 Thread Tom Lendacky
Add ethtool support to show and set the device channel configuration.
Changing the channel configuration will result in a device restart.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |   25 +
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  134 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |4 +
 3 files changed, 163 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 397e3a0..24f1053 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1329,6 +1329,17 @@ static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
struct net_device *netdev = pdata->netdev;
int ret;
 
+   if (pdata->new_tx_ring_count) {
+   pdata->tx_ring_count = pdata->new_tx_ring_count;
+   pdata->tx_q_count = pdata->tx_ring_count;
+   pdata->new_tx_ring_count = 0;
+   }
+
+   if (pdata->new_rx_ring_count) {
+   pdata->rx_ring_count = pdata->new_rx_ring_count;
+   pdata->new_rx_ring_count = 0;
+   }
+
/* Calculate the Rx buffer size before allocating rings */
pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
 
@@ -1482,6 +1493,20 @@ static void xgbe_stopdev(struct work_struct *work)
netdev_alert(pdata->netdev, "device stopped\n");
 }
 
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata)
+{
+   /* If not running, "restart" will happen on open */
+   if (!netif_running(pdata->netdev))
+   return;
+
+   xgbe_stop(pdata);
+
+   xgbe_free_memory(pdata);
+   xgbe_alloc_memory(pdata);
+
+   xgbe_start(pdata);
+}
+
 void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
/* If not running, "restart" will happen on open */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index d12f982..a880f10 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -705,6 +705,138 @@ static int xgbe_set_ringparam(struct net_device *netdev,
return 0;
 }
 
+static void xgbe_get_channels(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, tx, combined;
+
+   /* Calculate maximums allowed:
+*   - Take into account the number of available IRQs
+*   - Do not take into account the number of online CPUs so that
+* the user can over-subscribe if desired
+*   - Tx is additionally limited by the number of hardware queues
+*/
+   rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+   rx = min(rx, pdata->channel_irq_count);
+   tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+   tx = min(tx, pdata->channel_irq_count);
+   tx = min(tx, pdata->tx_max_q_count);
+
+   combined = min(rx, tx);
+
+   channels->max_combined = combined;
+   channels->max_rx = rx ? rx - 1 : 0;
+   channels->max_tx = tx ? tx - 1 : 0;
+
+   /* Get current settings based on device state */
+   rx = pdata->new_rx_ring_count ? : pdata->rx_ring_count;
+   tx = pdata->new_tx_ring_count ? : pdata->tx_ring_count;
+
+   combined = min(rx, tx);
+   rx -= combined;
+   tx -= combined;
+
+   channels->combined_count = combined;
+   channels->rx_count = rx;
+   channels->tx_count = tx;
+}
+
+static void xgbe_print_set_channels_input(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+   netdev_err(netdev, "channel inputs: combined=%u, rx-only=%u, 
tx-only=%u\n",
+  channels->combined_count, channels->rx_count,
+  channels->tx_count);
+}
+
+static int xgbe_set_channels(struct net_device *netdev,
+struct ethtool_channels *channels)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, rx_curr, tx, tx_curr, combined;
+
+   /* Calculate maximums allowed:
+*   - Take into account the number of available IRQs
+*   - Do not take into account the number of online CPUs so that
+* the user can over-subscribe if desired
+*   - Tx is additionally limited by the number of hardware queues
+*/
+   rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+   rx = min(rx, pdata->channel_irq_count);
+   tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+   tx = min(tx, pdata->tx_max_q_count);
+   tx = min(tx, pdata->channel_irq_count);
+
+   combine

[PATCH net-next v2 12/12] amd-xgbe: Improve SFP 100Mbps auto-negotiation

2018-05-23 Thread Tom Lendacky
After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data.  These
SFPs required an extra auto-negotiation (AN) after the speed change in
order to operate properly.  Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |   77 +++
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |6 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h|1 
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89c..4b5d625 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
 }
 
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
  enum xgbe_mode mode)
 {
if (mode == xgbe_cur_mode(pdata))
-   return;
+   return false;
 
xgbe_change_mode(pdata, mode);
+
+   return true;
 }
 
 static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data 
*pdata)
return 0;
 }
 
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
 {
int ret;
 
+   mutex_lock(>an_mutex);
+
set_bit(XGBE_LINK_INIT, >dev_state);
pdata->link_check = jiffies;
 
ret = pdata->phy_if.phy_impl.an_config(pdata);
if (ret)
-   return ret;
+   goto out;
 
if (pdata->phy.autoneg != AUTONEG_ENABLE) {
ret = xgbe_phy_config_fixed(pdata);
if (ret || !pdata->kr_redrv)
-   return ret;
+   goto out;
 
netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data 
*pdata)
/* Disable auto-negotiation interrupt */
disable_irq(pdata->an_irq);
 
-   /* Start auto-negotiation in a supported mode */
-   if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KR);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SFI);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
-   xgbe_set_mode(pdata, XGBE_MODE_X);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
-   } else {
-   enable_irq(pdata->an_irq);
-   return -EINVAL;
+   if (set_mode) {
+   /* Start auto-negotiation in a supported mode */
+   if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KR);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SFI);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+   xgbe_set_mode(pdata, XGBE_MODE_X);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+   } else {
+   enable_irq(pdata->an_irq);
+   ret = -EINVAL;
+   goto out;
+   }
}
 
/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data 
*pdata)
xgbe_an_init(pdata);
xgbe_an_restart(pdata);
 
-   return 0;
-}
-
-static int xgbe_phy_config_ane

[PATCH net-next v2 09/12] amd-xgbe: Always attempt link training in KR mode

2018-05-23 Thread Tom Lendacky
Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it.  Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   69 +++--
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72..450b89c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct 
xgbe_prv_data *pdata)
xgbe_an37_clear_interrupts(pdata);
 }
 
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
-   unsigned int reg;
-
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-   reg |= XGBE_KR_TRAINING_ENABLE;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
-   unsigned int reg;
-
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-   reg &= ~XGBE_KR_TRAINING_ENABLE;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
-   /* Enable KR training */
-   xgbe_an73_enable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 2.5G speed */
pdata->hw_if.set_speed(pdata, SPEED_2500);
 
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
if (pdata->kr_redrv)
return xgbe_kr_mode(pdata);
 
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data 
*pdata)
 
 static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, 
bool enable,
 {
unsigned int reg;
 
+   /* Disable KR training for now */
+   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+   reg &= ~XGBE_KR_TRAINING_ENABLE;
+   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+   /* Update AN settings */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
reg &= ~MDIO_AN_CTRL1_ENABLE;
 
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct 
xgbe_prv_data *pdata,
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
 
/* Start KR training */
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-   if (reg & XGBE_KR_TRAINING_ENABLE) {
-   if (pdata->phy_if.phy_impl.kr_training_pre)
-   pdata->phy_if.phy_impl.kr_training_pre(pdata);
+   if (pdata->phy_if.phy_impl.kr_training_pre)
+   pdata->phy_if.phy_impl.kr_training_pre(pdata);
 
-   reg |= XGBE_KR_TRAINING_START;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
-   reg);
+   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+   reg |= XGBE_KR_TRAINING_ENABLE;
+   reg |= XGBE_KR_TRAINING_START;
+   XMDIO_

[PATCH net-next v2 11/12] amd-xgbe: Update the BelFuse quirk to support SGMII

2018-05-23 Thread Tom Lendacky
Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  109 +++
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6..194a569 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct 
xgbe_prv_data *pdata)
struct xgbe_phy_data *phy_data = pdata->phy_data;
unsigned int phy_id = phy_data->phydev->phy_id;
 
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+   return false;
+
if ((phy_id & 0xfff0) != 0x01ff0cc0)
return false;
 
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct 
xgbe_prv_data *pdata)
return true;
 }
 
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+   struct xgbe_phy_data *phy_data = pdata->phy_data;
+   struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+   unsigned int phy_id = phy_data->phydev->phy_id;
+   int reg;
+
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+   return false;
+
+   if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+  XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+   return false;
+
+   if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+  XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+   return false;
+
+   if ((phy_id & 0xfff0) != 0x03625d10)
+   return false;
+
+   /* Disable RGMII mode */
+   phy_write(phy_data->phydev, 0x18, 0x7007);
+   reg = phy_read(phy_data->phydev, 0x18);
+   phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+   /* Enable fiber register bank */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0001;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+   /* Power down SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+   /* Configure SGMII-to-Copper mode */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0006;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+   /* Power up SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+   /* Enable copper register bank */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0001;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+   /* Power up SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+   phy_data->phydev->supported = PHY_GBIT_FEATURES;
+   phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+   phy_data->phydev->advertising = phy_data->phydev->supported;
+
+   netif_dbg(pdata, drv, pdata->netdev,
+ "BelFuse PHY quirk in place\n");
+
+   return true;
+}
+
 static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
 {
+   if (xgbe_phy_belfuse_phy_quirks(pdata))
+   return;
+
if (xgbe_phy_finisar_phy_quirks(pdata))
return;
 }
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct 
xgbe_phy_data *phy_data)
return false;
 }
 
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-   struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
-
-   if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
-  XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
-   return false;
-
-   if (!memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
-   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
-   phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
-   phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
-   phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
-   if (phy_data->sfp_changed)
-   netif_dbg(pdata, drv, pdata->netdev,
- "Bel-Fuse SFP quirk in place\n");
-   return 

[PATCH net-next v2 10/12] amd-xgbe: Advertise FEC support with the KR re-driver

2018-05-23 Thread Tom Lendacky
When a KR re-driver is present, indicate the FEC support is available
during auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13..dd747f6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data 
*pdata,
XGBE_CLR_ADV(dlks, 1000baseKX_Full);
XGBE_CLR_ADV(dlks, 10000baseKR_Full);
 
+   /* Advertise FEC support is present */
+   if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+   XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
switch (phy_data->port_mode) {
case XGBE_PORT_MODE_BACKPLANE:
XGBE_SET_ADV(dlks, 10000baseKR_Full);



[PATCH net-next v2 07/12] amd-xgbe: Prepare for ethtool set-channel support

2018-05-23 Thread Tom Lendacky
In order to support being able to dynamically set/change the number of
Rx and Tx channels, update the code to:
 - Move alloc and free of device memory into callable functions
 - Move setting of the real number of Rx and Tx channels to device startup
 - Move mapping of the RSS channels to device startup

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |  108 ++---
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |   20 -
 2 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2646c08..397e3a0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1312,14 +1312,72 @@ int xgbe_powerup(struct net_device *netdev, unsigned 
int caller)
return 0;
 }
 
+static void xgbe_free_memory(struct xgbe_prv_data *pdata)
+{
+   struct xgbe_desc_if *desc_if = &pdata->desc_if;
+
+   /* Free the ring descriptors and buffers */
+   desc_if->free_ring_resources(pdata);
+
+   /* Free the channel and ring structures */
+   xgbe_free_channels(pdata);
+}
+
+static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
+{
+   struct xgbe_desc_if *desc_if = &pdata->desc_if;
+   struct net_device *netdev = pdata->netdev;
+   int ret;
+
+   /* Calculate the Rx buffer size before allocating rings */
+   pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
+
+   /* Allocate the channel and ring structures */
+   ret = xgbe_alloc_channels(pdata);
+   if (ret)
+   return ret;
+
+   /* Allocate the ring descriptors and buffers */
+   ret = desc_if->alloc_ring_resources(pdata);
+   if (ret)
+   goto err_channels;
+
+   /* Initialize the service and Tx timers */
+   xgbe_init_timers(pdata);
+
+   return 0;
+
+err_channels:
+   xgbe_free_memory(pdata);
+
+   return ret;
+}
+
 static int xgbe_start(struct xgbe_prv_data *pdata)
 {
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_phy_if *phy_if = &pdata->phy_if;
struct net_device *netdev = pdata->netdev;
+   unsigned int i;
int ret;
 
-   DBGPR("-->xgbe_start\n");
+   /* Set the number of queues */
+   ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
+   if (ret) {
+   netdev_err(netdev, "error setting real tx queue count\n");
+   return ret;
+   }
+
+   ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
+   if (ret) {
+   netdev_err(netdev, "error setting real rx queue count\n");
+   return ret;
+   }
+
+   /* Set RSS lookup table data for programming */
+   for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
+   XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
+  i % pdata->rx_ring_count);
 
ret = hw_if->init(pdata);
if (ret)
@@ -1347,8 +1405,6 @@ static int xgbe_start(struct xgbe_prv_data *pdata)
 
clear_bit(XGBE_STOPPED, &pdata->dev_state);
 
-   DBGPR("<--xgbe_start\n");
-
return 0;
 
 err_irqs:
@@ -1823,11 +1879,8 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata,
 static int xgbe_open(struct net_device *netdev)
 {
struct xgbe_prv_data *pdata = netdev_priv(netdev);
-   struct xgbe_desc_if *desc_if = &pdata->desc_if;
int ret;
 
-   DBGPR("-->xgbe_open\n");
-
/* Create the various names based on netdev name */
snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
 netdev_name(netdev));
@@ -1872,43 +1925,25 @@ static int xgbe_open(struct net_device *netdev)
goto err_sysclk;
}
 
-   /* Calculate the Rx buffer size before allocating rings */
-   ret = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
-   if (ret < 0)
-   goto err_ptpclk;
-   pdata->rx_buf_size = ret;
-
-   /* Allocate the channel and ring structures */
-   ret = xgbe_alloc_channels(pdata);
-   if (ret)
-   goto err_ptpclk;
-
-   /* Allocate the ring descriptors and buffers */
-   ret = desc_if->alloc_ring_resources(pdata);
-   if (ret)
-   goto err_channels;
-
INIT_WORK(&pdata->service_work, xgbe_service);
INIT_WORK(&pdata->restart_work, xgbe_restart);
INIT_WORK(&pdata->stopdev_work, xgbe_stopdev);
INIT_WORK(&pdata->tx_tstamp_work, xgbe_tx_tstamp);
-   xgbe_init_timers(pdata);
+
+   ret = xgbe_alloc_memory(pdata);
+   if (ret)
+   goto err_ptpclk;
 
ret = xgbe_start(pdata);
if (ret)
-   goto err_rings;
+   goto err_mem;
 
clear_bit(XGBE_DOWN, &pdata->dev_state);
 
-   DBGPR("<--xgbe_open\

[PATCH net-next v2 05/12] amd-xgbe: Add ethtool support to retrieve SFP module info

2018-05-23 Thread Tom Lendacky
Add support to get SFP module information using ethtool.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   18 +++
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|   21 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  137 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |   13 ++
 4 files changed, 189 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index ff397bb..57394b77 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -626,6 +626,22 @@ static int xgbe_get_ts_info(struct net_device *netdev,
return 0;
 }
 
+static int xgbe_get_module_info(struct net_device *netdev,
+   struct ethtool_modinfo *modinfo)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   return pdata->phy_if.module_info(pdata, modinfo);
+}
+
+static int xgbe_get_module_eeprom(struct net_device *netdev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   return pdata->phy_if.module_eeprom(pdata, eeprom, data);
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
.get_drvinfo = xgbe_get_drvinfo,
.get_msglevel = xgbe_get_msglevel,
@@ -646,6 +662,8 @@ static int xgbe_get_ts_info(struct net_device *netdev,
.get_ts_info = xgbe_get_ts_info,
.get_link_ksettings = xgbe_get_link_ksettings,
.set_link_ksettings = xgbe_set_link_ksettings,
+   .get_module_info = xgbe_get_module_info,
+   .get_module_eeprom = xgbe_get_module_eeprom,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 1b45cd7..9c39c72 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -126,6 +126,24 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   if (!pdata->phy_if.phy_impl.module_eeprom)
+   return -ENXIO;
+
+   return pdata->phy_if.phy_impl.module_eeprom(pdata, eeprom, data);
+}
+
+static int xgbe_phy_module_info(struct xgbe_prv_data *pdata,
+   struct ethtool_modinfo *modinfo)
+{
+   if (!pdata->phy_if.phy_impl.module_info)
+   return -ENXIO;
+
+   return pdata->phy_if.phy_impl.module_info(pdata, modinfo);
+}
+
 static void xgbe_an37_clear_interrupts(struct xgbe_prv_data *pdata)
 {
int reg;
@@ -1639,4 +1657,7 @@ void xgbe_init_function_ptrs_phy(struct xgbe_phy_if 
*phy_if)
phy_if->phy_valid_speed = xgbe_phy_valid_speed;
 
phy_if->an_isr  = xgbe_an_combined_isr;
+
+   phy_if->module_info = xgbe_phy_module_info;
+   phy_if->module_eeprom   = xgbe_phy_module_eeprom;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index cb15caf..141bb13 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -119,6 +119,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "xgbe.h"
 #include "xgbe-common.h"
@@ -270,6 +271,15 @@ struct xgbe_sfp_eeprom {
u8 vendor[32];
 };
 
+#define XGBE_SFP_DIAGS_SUPPORTED(_x)   \
+   ((_x)->extd[XGBE_SFP_EXTD_SFF_8472] &&  \
+!((_x)->extd[XGBE_SFP_EXTD_DIAG] & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
+
+#define XGBE_SFP_EEPROM_BASE_LEN   256
+#define XGBE_SFP_EEPROM_DIAG_LEN   256
+#define XGBE_SFP_EEPROM_MAX(XGBE_SFP_EEPROM_BASE_LEN + \
+XGBE_SFP_EEPROM_DIAG_LEN)
+
 #define XGBE_BEL_FUSE_VENDOR   "BEL-FUSE"
 #define XGBE_BEL_FUSE_PARTNO   "1GBT-SFP06  "
 
@@ -1301,6 +1311,130 @@ static void xgbe_phy_sfp_detect(struct xgbe_prv_data 
*pdata)
xgbe_phy_put_comm_ownership(pdata);
 }
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   struct xgbe_phy_data *phy_data = pdata->phy_data;
+   u8 eeprom_addr, eeprom_data[XGBE_SFP_EEPROM_MAX];
+   struct xgbe_sfp_eeprom *sfp_eeprom;
+   unsigned int i, j, rem;
+   int ret;
+
+   rem = eeprom->len;
+
+   if (!eeprom->len) {
+   ret = -EINVAL;
+   goto done;
+   }
+
+   if ((eeprom->offset + eeprom->len) > XGBE_SFP_EEPROM_MAX) {
+   ret = -EINVAL;
+   goto done;
+   }
+
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP) {
+

[PATCH net-next v2 06/12] amd-xgbe: Add ethtool show/set ring parameter support

2018-05-23 Thread Tom Lendacky
Add ethtool support to show and set the number of the Rx and Tx ring
descriptors.  Changing the ring configuration will result in a device
restart.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |6 --
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   65 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |6 ++
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 7c204f0..2646c08 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1426,10 +1426,8 @@ static void xgbe_stopdev(struct work_struct *work)
netdev_alert(pdata->netdev, "device stopped\n");
 }
 
-static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
+void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
-   DBGPR("-->xgbe_restart_dev\n");
-
/* If not running, "restart" will happen on open */
if (!netif_running(pdata->netdev))
return;
@@ -1440,8 +1438,6 @@ static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
xgbe_free_rx_data(pdata);
 
xgbe_start(pdata);
-
-   DBGPR("<--xgbe_restart_dev\n");
 }
 
 static void xgbe_restart(struct work_struct *work)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 57394b77..d12f982 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -642,6 +642,69 @@ static int xgbe_get_module_eeprom(struct net_device 
*netdev,
return pdata->phy_if.module_eeprom(pdata, eeprom, data);
 }
 
+static void xgbe_get_ringparam(struct net_device *netdev,
+  struct ethtool_ringparam *ringparam)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   ringparam->rx_max_pending = XGBE_RX_DESC_CNT_MAX;
+   ringparam->tx_max_pending = XGBE_TX_DESC_CNT_MAX;
+   ringparam->rx_pending = pdata->rx_desc_count;
+   ringparam->tx_pending = pdata->tx_desc_count;
+}
+
+static int xgbe_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ringparam)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, tx;
+
+   if (ringparam->rx_mini_pending || ringparam->rx_jumbo_pending) {
+   netdev_err(netdev, "unsupported ring parameter\n");
+   return -EINVAL;
+   }
+
+   if ((ringparam->rx_pending < XGBE_RX_DESC_CNT_MIN) ||
+   (ringparam->rx_pending > XGBE_RX_DESC_CNT_MAX)) {
+   netdev_err(netdev,
+  "rx ring parameter must be between %u and %u\n",
+  XGBE_RX_DESC_CNT_MIN, XGBE_RX_DESC_CNT_MAX);
+   return -EINVAL;
+   }
+
+   if ((ringparam->tx_pending < XGBE_TX_DESC_CNT_MIN) ||
+   (ringparam->tx_pending > XGBE_TX_DESC_CNT_MAX)) {
+   netdev_err(netdev,
+  "tx ring parameter must be between %u and %u\n",
+  XGBE_TX_DESC_CNT_MIN, XGBE_TX_DESC_CNT_MAX);
+   return -EINVAL;
+   }
+
+   rx = __rounddown_pow_of_two(ringparam->rx_pending);
+   if (rx != ringparam->rx_pending)
+   netdev_notice(netdev,
+ "rx ring parameter rounded to power of two: %u\n",
+ rx);
+
+   tx = __rounddown_pow_of_two(ringparam->tx_pending);
+   if (tx != ringparam->tx_pending)
+   netdev_notice(netdev,
+ "tx ring parameter rounded to power of two: %u\n",
+ tx);
+
+   if ((rx == pdata->rx_desc_count) &&
+   (tx == pdata->tx_desc_count))
+   goto out;
+
+   pdata->rx_desc_count = rx;
+   pdata->tx_desc_count = tx;
+
+   xgbe_restart_dev(pdata);
+
+out:
+   return 0;
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
.get_drvinfo = xgbe_get_drvinfo,
.get_msglevel = xgbe_get_msglevel,
@@ -664,6 +727,8 @@ static int xgbe_get_module_eeprom(struct net_device *netdev,
.set_link_ksettings = xgbe_set_link_ksettings,
.get_module_info = xgbe_get_module_info,
.get_module_eeprom = xgbe_get_module_eeprom,
+   .get_ringparam = xgbe_get_ringparam,
+   .set_ringparam = xgbe_set_ringparam,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h 
b/drivers/net/ethernet/amd/xgbe/xgbe.h
index f0f455b..7dc0fac 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -144,6 +144,11 @@
 #define XGBE_TX_DESC_MAX_PROC  (XGBE_T

[PATCH net-next v2 03/12] amd-xgbe: Remove use of comm_owned field

2018-05-23 Thread Tom Lendacky
The comm_owned field can hide logic where double locking is attempted
and prevent multiple threads for the same device from accessing the
mutex properly.  Remove the comm_owned field and use the mutex API
exclusively for gaining ownership.  The current driver has been audited
and is obtaining communications ownership properly.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 123ceb0..05003be 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -327,8 +327,6 @@ struct xgbe_phy_data {
 
unsigned int mdio_addr;
 
-   unsigned int comm_owned;
-
/* SFP Support */
enum xgbe_sfp_comm sfp_comm;
unsigned int sfp_mux_address;
@@ -382,12 +380,6 @@ struct xgbe_phy_data {
 static int xgbe_phy_i2c_xfer(struct xgbe_prv_data *pdata,
 struct xgbe_i2c_op *i2c_op)
 {
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-   /* Be sure we own the bus */
-   if (WARN_ON(!phy_data->comm_owned))
-   return -EIO;
-
return pdata->i2c_if.i2c_xfer(pdata, i2c_op);
 }
 
@@ -549,10 +541,6 @@ static int xgbe_phy_sfp_get_mux(struct xgbe_prv_data 
*pdata)
 
 static void xgbe_phy_put_comm_ownership(struct xgbe_prv_data *pdata)
 {
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-   phy_data->comm_owned = 0;
-
mutex_unlock(&xgbe_phy_comm_lock);
 }
 
@@ -562,9 +550,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data 
*pdata)
unsigned long timeout;
unsigned int mutex_id;
 
-   if (phy_data->comm_owned)
-   return 0;
-
/* The I2C and MDIO/GPIO bus is multiplexed between multiple devices,
 * the driver needs to take the software mutex and then the hardware
 * mutexes before being able to use the busses.
@@ -593,7 +578,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data 
*pdata)
XP_IOWRITE(pdata, XP_I2C_MUTEX, mutex_id);
XP_IOWRITE(pdata, XP_MDIO_MUTEX, mutex_id);
 
-   phy_data->comm_owned = 1;
return 0;
}
 



[PATCH net-next v2 04/12] amd-xgbe: Remove field that indicates SFP diagnostic support

2018-05-23 Thread Tom Lendacky
The driver currently sets an indication of whether the SFP supports, and
that the driver can obtain, diagnostics data.  This isn't currently used
by the driver and the logic to set this indicator is flawed because the
field is cleared each time the SFP is checked and only set when a new SFP
is detected.  Remove this field and the logic supporting it.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 05003be..cb15caf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -343,7 +343,6 @@ struct xgbe_phy_data {
unsigned int sfp_rx_los;
unsigned int sfp_tx_fault;
unsigned int sfp_mod_absent;
-   unsigned int sfp_diags;
unsigned int sfp_changed;
unsigned int sfp_phy_avail;
unsigned int sfp_cable_len;
@@ -1211,13 +1210,6 @@ static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data 
*pdata)
 
memcpy(&phy_data->sfp_eeprom, &sfp_eeprom, sizeof(sfp_eeprom));
 
-   if (sfp_eeprom.extd[XGBE_SFP_EXTD_SFF_8472]) {
-   u8 diag_type = sfp_eeprom.extd[XGBE_SFP_EXTD_DIAG];
-
-   if (!(diag_type & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
-   phy_data->sfp_diags = 1;
-   }
-
xgbe_phy_free_phy_device(pdata);
} else {
phy_data->sfp_changed = 0;
@@ -1267,7 +1259,6 @@ static void xgbe_phy_sfp_reset(struct xgbe_phy_data 
*phy_data)
phy_data->sfp_rx_los = 0;
phy_data->sfp_tx_fault = 0;
phy_data->sfp_mod_absent = 1;
-   phy_data->sfp_diags = 0;
phy_data->sfp_base = XGBE_SFP_BASE_UNKNOWN;
phy_data->sfp_cable = XGBE_SFP_CABLE_UNKNOWN;
phy_data->sfp_speed = XGBE_SFP_SPEED_UNKNOWN;



[PATCH net-next v2 02/12] amd-xgbe: Read and save the port property registers during probe

2018-05-23 Thread Tom Lendacky
Read and save the port property registers once during the device probe
and then use the saved values as they are needed.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c|   34 ++
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   68 ---
 drivers/net/ethernet/amd/xgbe/xgbe.h|7 +++
 3 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 7b63521..7b86240 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -335,12 +335,29 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
pdata->awcr = XGBE_DMA_PCI_AWCR;
pdata->awarcr = XGBE_DMA_PCI_AWARCR;
 
+   /* Read the port property registers */
+   pdata->pp0 = XP_IOREAD(pdata, XP_PROP_0);
+   pdata->pp1 = XP_IOREAD(pdata, XP_PROP_1);
+   pdata->pp2 = XP_IOREAD(pdata, XP_PROP_2);
+   pdata->pp3 = XP_IOREAD(pdata, XP_PROP_3);
+   pdata->pp4 = XP_IOREAD(pdata, XP_PROP_4);
+   if (netif_msg_probe(pdata)) {
+   dev_dbg(dev, "port property 0 = %#010x\n", pdata->pp0);
+   dev_dbg(dev, "port property 1 = %#010x\n", pdata->pp1);
+   dev_dbg(dev, "port property 2 = %#010x\n", pdata->pp2);
+   dev_dbg(dev, "port property 3 = %#010x\n", pdata->pp3);
+   dev_dbg(dev, "port property 4 = %#010x\n", pdata->pp4);
+   }
+
/* Set the maximum channels and queues */
-   reg = XP_IOREAD(pdata, XP_PROP_1);
-   pdata->tx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_DMA);
-   pdata->rx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_DMA);
-   pdata->tx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_QUEUES);
-   pdata->rx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_QUEUES);
+   pdata->tx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+ MAX_TX_DMA);
+   pdata->rx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+ MAX_RX_DMA);
+   pdata->tx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+   MAX_TX_QUEUES);
+   pdata->rx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+   MAX_RX_QUEUES);
if (netif_msg_probe(pdata)) {
dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
pdata->tx_max_channel_count,
@@ -353,12 +370,13 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
xgbe_set_counts(pdata);
 
/* Set the maximum fifo amounts */
-   reg = XP_IOREAD(pdata, XP_PROP_2);
-   pdata->tx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, TX_FIFO_SIZE);
+   pdata->tx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+ TX_FIFO_SIZE);
pdata->tx_max_fifo_size *= 16384;
pdata->tx_max_fifo_size = min(pdata->tx_max_fifo_size,
  pdata->vdata->tx_max_fifo_size);
-   pdata->rx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, RX_FIFO_SIZE);
+   pdata->rx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+ RX_FIFO_SIZE);
pdata->rx_max_fifo_size *= 16384;
pdata->rx_max_fifo_size = min(pdata->rx_max_fifo_size,
  pdata->vdata->rx_max_fifo_size);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index aac8843..123ceb0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -2421,22 +2421,21 @@ static int xgbe_phy_link_status(struct xgbe_prv_data 
*pdata, int *an_restart)
 static void xgbe_phy_sfp_gpio_setup(struct xgbe_prv_data *pdata)
 {
struct xgbe_phy_data *phy_data = pdata->phy_data;
-   unsigned int reg;
-
-   reg = XP_IOREAD(pdata, XP_PROP_3);
 
phy_data->sfp_gpio_address = XGBE_GPIO_ADDRESS_PCA9555 +
-XP_GET_BITS(reg, XP_PROP_3, GPIO_ADDR);
+XP_GET_BITS(pdata->pp3, XP_PROP_3,
+GPIO_ADDR);
 
-   phy_data->sfp_gpio_mask = XP_GET_BITS(reg, XP_PROP_3, GPIO_MASK);
+   phy_data->sfp_gpio_mask = XP_GET_BITS(pdata->pp3, XP_PROP_3,
+ GPIO_MASK);
 
-   phy_data->sfp_gpio_rx_los = XP_GET_BITS(reg, XP_PROP_3,
+   phy_data->sfp_gpio_rx_los = XP_GET_BITS(pdata->

[PATCH net-next v2 00/12] amd-xgbe: AMD XGBE driver updates 2018-05-21

2018-05-23 Thread Tom Lendacky
The following updates are included in this driver update series:

- Fix the debug output for the max channels count
- Read (once) and save the port property registers during probe
- Remove the use of the comm_owned field
- Remove unused SFP diagnostic support indicator field
- Add ethtool --module-info support
- Add ethtool --show-ring/--set-ring support
- Update the driver in preparation for ethtool --set-channels support
- Add ethtool --show-channels/--set-channels support
- Update the driver to always perform link training in KR mode
- Advertise FEC support when using a KR re-driver
- Update the BelFuse quirk to now support SGMII
- Improve 100Mbps auto-negotiation for BelFuse parts

This patch series is based on net-next.

---

Changes since v1:
- Update the --set-channels support to the use of the combined, rx and
  tx options as specified in the ethtool man page (in other words, don't
  create combined channels based on the min of the tx and rx channels
  specified).

Tom Lendacky (12):
  amd-xgbe: Fix debug output of max channel counts
  amd-xgbe: Read and save the port property registers during probe
  amd-xgbe: Remove use of comm_owned field
  amd-xgbe: Remove field that indicates SFP diagnostic support
  amd-xgbe: Add ethtool support to retrieve SFP module info
  amd-xgbe: Add ethtool show/set ring parameter support
  amd-xgbe: Prepare for ethtool set-channel support
  amd-xgbe: Add ethtool show/set channels support
  amd-xgbe: Always attempt link training in KR mode
  amd-xgbe: Advertise FEC support with the KR re-driver
  amd-xgbe: Update the BelFuse quirk to support SGMII
  amd-xgbe: Improve SFP 100Mbps auto-negotiation


 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |  137 +++---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  217 
 drivers/net/ethernet/amd/xgbe/xgbe-main.c|   20 -
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|  167 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |   36 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  349 +++---
 drivers/net/ethernet/amd/xgbe/xgbe.h |   31 ++
 7 files changed, 699 insertions(+), 258 deletions(-)

-- 
Tom Lendacky


[PATCH net-next v2 01/12] amd-xgbe: Fix debug output of max channel counts

2018-05-23 Thread Tom Lendacky
A debug output print statement uses the wrong variable to output the
maximum Rx channel count (cut and paste error, basically).  Fix the
statement to use the proper variable.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 82d1f41..7b63521 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -344,7 +344,7 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
if (netif_msg_probe(pdata)) {
dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
pdata->tx_max_channel_count,
-   pdata->tx_max_channel_count);
+   pdata->rx_max_channel_count);
dev_dbg(dev, "max tx/rx hw queue count = %u/%u\n",
pdata->tx_max_q_count, pdata->rx_max_q_count);
}



Re: [PATCH net-next 08/12] amd-xgbe: Add ethtool show/set channels support

2018-05-22 Thread Tom Lendacky
On 5/22/2018 8:29 AM, Edward Cree wrote:
> On 22/05/18 14:24, Tom Lendacky wrote:
>> The amd-xgbe driver is not designed to allocate separate IRQs for Rx and
>> Tx.  In general, there is one IRQ for a channel of which Tx and Rx are
>> shared.  You can have more Tx channels than Rx channels or vice-versa, but
>> the min() of those numbers will be a combined Tx/Rx with the excess being
>> Tx or Rx only: e.g. combined 0 tx 8 rx 10 results in 8 combined channels
>> plus two Rx only channels.
> If you cannot allocate the channels requested by the user, surely the correct
>  thing is not to fudge it into something similar, but rather to return an
>  error from the ethtool set_channels() op.

Ok, another vote on changing the logic.  I'll update it and submit a v2.

Thanks,
Tom

> 
> -Ed
> 


Re: [net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues

2018-05-22 Thread Tom Herbert
On Mon, May 21, 2018 at 8:12 AM, Willem de Bruijn
<willemdebruijn.ker...@gmail.com> wrote:
> On Mon, May 21, 2018 at 10:51 AM, Tom Herbert <t...@herbertland.com> wrote:
>> On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
>> <willemdebruijn.ker...@gmail.com> wrote:
>>> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
>>> <willemdebruijn.ker...@gmail.com> wrote:
>>>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <t...@herbertland.com> wrote:
>>>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>>>> <amritha.namb...@intel.com> wrote:
>>>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>>>> configuration set by the admin through the sysfs attribute
>>>>>> for each Tx queue. If the user configuration for receive
>>>>>> queue map does not apply, then the Tx queue selection falls back
>>>>>> to CPU map based selection and finally to hashing.
>>>>>>
>>>>>> Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
>>>>>> ---
>>>
>>>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>>> +{
>>>>>> +#ifdef CONFIG_XPS
>>>>>> +   enum xps_map_type i = XPS_MAP_RXQS;
>>>>>> +   struct xps_dev_maps *dev_maps;
>>>>>> +   struct sock *sk = skb->sk;
>>>>>> +   int queue_index = -1;
>>>>>> +   unsigned int tci = 0;
>>>>>> +
>>>>>> +   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>>>> +   dev->ifindex == sk->sk_rx_ifindex)
>>>>>> +   tci = sk->sk_rx_queue_mapping;
>>>>>> +
>>>>>> +   rcu_read_lock();
>>>>>> +   while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>>>> +   if (i == XPS_MAP_CPUS)
>>>>>
>>>>> This while loop typifies exactly why I don't think the XPS maps should
>>>>> be an array.
>>>>
>>>> +1
>>>
>>> as a matter of fact, as enabling both cpu and rxqueue map at the same
>>> time makes no sense, only one map is needed at any one time. The
>>> only difference is in how it is indexed. It should probably not be possible
>>> to configure both at the same time. Keeping a single map probably also
>>> significantly simplifies patch 1/4.
>>
>> Willem,
>>
>> I think it might makes sense to have them both. Maybe one application
>> is spin polling that needs this, where others might be happy with
>> normal CPU mappings as default.
>
> Some entries in the rx_queue table have queue_pair affinity
> configured, the others return -1 to fall through to the cpu
> affinity table?
>
Right, that's the intent of the while loop.

> I guess that implies flow steering to those special purpose
> queues. I wonder whether this would be used this in practice.
> I does make the code more complex by having to duplicate
> the map lookup logic (mostly, patch 1/4).

That's a good point. I think we need more information on how the
feature is going to be used in practice. My assumption is that there
are some number of "special" queues for which spin polling is being
done.

Tom


Re: [PATCH net-next 08/12] amd-xgbe: Add ethtool show/set channels support

2018-05-22 Thread Tom Lendacky
On 5/22/2018 12:35 AM, Jakub Kicinski wrote:
> On Mon, 21 May 2018 16:59:37 -0500, Tom Lendacky wrote:
>> +rx = combined + channels->rx_count;
>> +tx = combined + channels->tx_count;
>> +netdev_notice(netdev, "final channel count assignment: combined=%u, 
>> rx-only=%u, tx-only=%u\n",
>> +  min(rx, tx), rx - min(rx, tx), tx - min(rx, tx));
> 
> If user requests combined 0 rx 8 tx 8 they will end up with combined 8
> rx 0 tx 0.  Is that expected?

Yes, which is the reason that I issue the final channel count message. I
debated on how to do all this and looked at other drivers as well as the
ethtool man page and decided on this logic.

> 
> The man page clearly sayeth:
> 
>-L --set-channels
>   Changes the numbers of channels of the specified network device.
> 
>rx N   Changes the number of channels with only receive queues.
> 
>tx N   Changes the number of channels with only transmit queues.
> 
>other N
>   Changes the number of channels used only for other  purposes
>   e.g. link interrupts or SR-IOV co-ordination.
> 
>combined N
>   Changes the number of multi-purpose channels.
> 
> Note the use of word *only*.  There are drivers in tree which adhere to
> this interpretation and dutifully allocate separate IRQs if RX and TX
> channels are requested separately.

The amd-xgbe driver is not designed to allocate separate IRQs for Rx and
Tx.  In general, there is one IRQ for a channel of which Tx and Rx are
shared.  You can have more Tx channels than Rx channels or vice-versa, but
the min() of those numbers will be a combined Tx/Rx with the excess being
Tx or Rx only: e.g. combined 0 tx 8 rx 10 results in 8 combined channels
plus two Rx only channels.

I thought this was the most reasonable way to do this, please let me know
if there's a strong objection to this.

Thanks,
Tom

> 
> Which is not to claim that majority of existing drivers adhere to this
> wording :)
> 


[PATCH net-next 09/12] amd-xgbe: Always attempt link training in KR mode

2018-05-21 Thread Tom Lendacky
Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it.  Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   69 +++--
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72..450b89c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct 
xgbe_prv_data *pdata)
xgbe_an37_clear_interrupts(pdata);
 }
 
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
-   unsigned int reg;
-
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-   reg |= XGBE_KR_TRAINING_ENABLE;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
-   unsigned int reg;
-
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-   reg &= ~XGBE_KR_TRAINING_ENABLE;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
-   /* Enable KR training */
-   xgbe_an73_enable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 2.5G speed */
pdata->hw_if.set_speed(pdata, SPEED_2500);
 
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
if (pdata->kr_redrv)
return xgbe_kr_mode(pdata);
 
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data 
*pdata)
 
 static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
 {
-   /* Disable KR training */
-   xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, 
bool enable,
 {
unsigned int reg;
 
+   /* Disable KR training for now */
+   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+   reg &= ~XGBE_KR_TRAINING_ENABLE;
+   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+   /* Update AN settings */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
reg &= ~MDIO_AN_CTRL1_ENABLE;
 
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct 
xgbe_prv_data *pdata,
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
 
/* Start KR training */
-   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-   if (reg & XGBE_KR_TRAINING_ENABLE) {
-   if (pdata->phy_if.phy_impl.kr_training_pre)
-   pdata->phy_if.phy_impl.kr_training_pre(pdata);
+   if (pdata->phy_if.phy_impl.kr_training_pre)
+   pdata->phy_if.phy_impl.kr_training_pre(pdata);
 
-   reg |= XGBE_KR_TRAINING_START;
-   XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
-   reg);
+   reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+   reg |= XGBE_KR_TRAINING_ENABLE;
+   reg |= XGBE_KR_TRAINING_START;
+   XMDIO_

[PATCH net-next 11/12] amd-xgbe: Update the BelFuse quirk to support SGMII

2018-05-21 Thread Tom Lendacky
Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  109 +++
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6..194a569 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct 
xgbe_prv_data *pdata)
struct xgbe_phy_data *phy_data = pdata->phy_data;
unsigned int phy_id = phy_data->phydev->phy_id;
 
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+   return false;
+
if ((phy_id & 0xfff0) != 0x01ff0cc0)
return false;
 
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct 
xgbe_prv_data *pdata)
return true;
 }
 
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+   struct xgbe_phy_data *phy_data = pdata->phy_data;
+   struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+   unsigned int phy_id = phy_data->phydev->phy_id;
+   int reg;
+
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+   return false;
+
+   if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+  XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+   return false;
+
+   if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+  XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+   return false;
+
+   if ((phy_id & 0xfff0) != 0x03625d10)
+   return false;
+
+   /* Disable RGMII mode */
+   phy_write(phy_data->phydev, 0x18, 0x7007);
+   reg = phy_read(phy_data->phydev, 0x18);
+   phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+   /* Enable fiber register bank */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0001;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+   /* Power down SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+   /* Configure SGMII-to-Copper mode */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0006;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+   /* Power up SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+   /* Enable copper register bank */
+   phy_write(phy_data->phydev, 0x1c, 0x7c00);
+   reg = phy_read(phy_data->phydev, 0x1c);
+   reg &= 0x03ff;
+   reg &= ~0x0001;
+   phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+   /* Power up SerDes */
+   reg = phy_read(phy_data->phydev, 0x00);
+   phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+   phy_data->phydev->supported = PHY_GBIT_FEATURES;
+   phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+   phy_data->phydev->advertising = phy_data->phydev->supported;
+
+   netif_dbg(pdata, drv, pdata->netdev,
+ "BelFuse PHY quirk in place\n");
+
+   return true;
+}
+
 static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
 {
+   if (xgbe_phy_belfuse_phy_quirks(pdata))
+   return;
+
if (xgbe_phy_finisar_phy_quirks(pdata))
return;
 }
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct 
xgbe_phy_data *phy_data)
return false;
 }
 
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-   struct xgbe_sfp_eeprom *sfp_eeprom = _data->sfp_eeprom;
-
-   if (memcmp(_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
-  XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
-   return false;
-
-   if (!memcmp(_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
-   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
-   phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
-   phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
-   phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
-   if (phy_data->sfp_changed)
-   netif_dbg(pdata, drv, pdata->netdev,
- "Bel-Fuse SFP quirk in place\n");
-   return 

[PATCH net-next 12/12] amd-xgbe: Improve SFP 100Mbps auto-negotiation

2018-05-21 Thread Tom Lendacky
After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data.  These
SFPs required an extra auto-negotiation (AN) after the speed change in
order to operate properly.  Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |   77 +++
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |6 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h|1 
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89c..4b5d625 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
 }
 
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
  enum xgbe_mode mode)
 {
if (mode == xgbe_cur_mode(pdata))
-   return;
+   return false;
 
xgbe_change_mode(pdata, mode);
+
+   return true;
 }
 
 static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data 
*pdata)
return 0;
 }
 
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
 {
int ret;
 
+   mutex_lock(&pdata->an_mutex);
+
set_bit(XGBE_LINK_INIT, >dev_state);
pdata->link_check = jiffies;
 
ret = pdata->phy_if.phy_impl.an_config(pdata);
if (ret)
-   return ret;
+   goto out;
 
if (pdata->phy.autoneg != AUTONEG_ENABLE) {
ret = xgbe_phy_config_fixed(pdata);
if (ret || !pdata->kr_redrv)
-   return ret;
+   goto out;
 
netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data 
*pdata)
/* Disable auto-negotiation interrupt */
disable_irq(pdata->an_irq);
 
-   /* Start auto-negotiation in a supported mode */
-   if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KR);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
-   xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SFI);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
-   xgbe_set_mode(pdata, XGBE_MODE_X);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
-   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
-   xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
-   } else {
-   enable_irq(pdata->an_irq);
-   return -EINVAL;
+   if (set_mode) {
+   /* Start auto-negotiation in a supported mode */
+   if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KR);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+   xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SFI);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+   xgbe_set_mode(pdata, XGBE_MODE_X);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+   } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+   xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+   } else {
+   enable_irq(pdata->an_irq);
+   ret = -EINVAL;
+   goto out;
+   }
}
 
/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data 
*pdata)
xgbe_an_init(pdata);
xgbe_an_restart(pdata);
 
-   return 0;
-}
-
-static int xgbe_phy_config_ane

[PATCH net-next 10/12] amd-xgbe: Advertise FEC support with the KR re-driver

2018-05-21 Thread Tom Lendacky
When a KR re-driver is present, indicate the FEC support is available
during auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13..dd747f6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data 
*pdata,
XGBE_CLR_ADV(dlks, 1000baseKX_Full);
XGBE_CLR_ADV(dlks, 10000baseKR_Full);
 
+   /* Advertise FEC support is present */
+   if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+   XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
switch (phy_data->port_mode) {
case XGBE_PORT_MODE_BACKPLANE:
XGBE_SET_ADV(dlks, 10000baseKR_Full);



[PATCH net-next 08/12] amd-xgbe: Add ethtool show/set channels support

2018-05-21 Thread Tom Lendacky
Add ethtool support to show and set the device channel configuration.
Changing the channel configuration will result in a device restart.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |   25 +
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  131 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |4 +
 3 files changed, 160 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 397e3a0..24f1053 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1329,6 +1329,17 @@ static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
struct net_device *netdev = pdata->netdev;
int ret;
 
+   if (pdata->new_tx_ring_count) {
+   pdata->tx_ring_count = pdata->new_tx_ring_count;
+   pdata->tx_q_count = pdata->tx_ring_count;
+   pdata->new_tx_ring_count = 0;
+   }
+
+   if (pdata->new_rx_ring_count) {
+   pdata->rx_ring_count = pdata->new_rx_ring_count;
+   pdata->new_rx_ring_count = 0;
+   }
+
/* Calculate the Rx buffer size before allocating rings */
pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
 
@@ -1482,6 +1493,20 @@ static void xgbe_stopdev(struct work_struct *work)
netdev_alert(pdata->netdev, "device stopped\n");
 }
 
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata)
+{
+   /* If not running, "restart" will happen on open */
+   if (!netif_running(pdata->netdev))
+   return;
+
+   xgbe_stop(pdata);
+
+   xgbe_free_memory(pdata);
+   xgbe_alloc_memory(pdata);
+
+   xgbe_start(pdata);
+}
+
 void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
/* If not running, "restart" will happen on open */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index d12f982..d26fd95 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -705,6 +705,135 @@ static int xgbe_set_ringparam(struct net_device *netdev,
return 0;
 }
 
+static void xgbe_get_channels(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, tx, combined;
+
+   /* Calculate maximums allowed:
+*   - Take into account the number of available IRQs
+*   - Do not take into account the number of online CPUs so that
+* the user can over-subscribe if desired
+*   - Tx is additionally limited by the number of hardware queues
+*/
+   rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+   rx = min(rx, pdata->channel_irq_count);
+   tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+   tx = min(tx, pdata->channel_irq_count);
+   tx = min(tx, pdata->tx_max_q_count);
+
+   combined = min(rx, tx);
+
+   channels->max_combined = combined;
+   channels->max_rx = rx;
+   channels->max_tx = tx;
+
+   /* Current running settings */
+   rx = pdata->rx_ring_count;
+   tx = pdata->tx_ring_count;
+
+   combined = min(rx, tx);
+   rx -= combined;
+   tx -= combined;
+
+   channels->combined_count = combined;
+   channels->rx_count = rx;
+   channels->tx_count = tx;
+}
+
+static void xgbe_print_set_channels_input(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+   netdev_err(netdev, "channel inputs: combined=%u, rx-only=%u, 
tx-only=%u\n",
+  channels->combined_count, channels->rx_count,
+  channels->tx_count);
+}
+
+static int xgbe_set_channels(struct net_device *netdev,
+struct ethtool_channels *channels)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, tx, combined;
+
+   /* Calculate maximums allowed:
+*   - Take into account the number of available IRQs
+*   - Do not take into account the number of online CPUs so that
+* the user can over-subscribe if desired
+*   - Tx is additionally limited by the number of hardware queues
+*/
+   rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+   rx = min(rx, pdata->channel_irq_count);
+   tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+   tx = min(tx, pdata->tx_max_q_count);
+   tx = min(tx, pdata->channel_irq_count);
+
+   combined = min(rx, tx);
+
+   /* Should not be setting other count */
+   if (channels->other_count) {
+   n

[PATCH net-next 05/12] amd-xgbe: Add ethtool support to retrieve SFP module info

2018-05-21 Thread Tom Lendacky
Add support to get SFP module information using ethtool.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   18 +++
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|   21 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  137 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |   13 ++
 4 files changed, 189 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index ff397bb..57394b77 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -626,6 +626,22 @@ static int xgbe_get_ts_info(struct net_device *netdev,
return 0;
 }
 
+static int xgbe_get_module_info(struct net_device *netdev,
+   struct ethtool_modinfo *modinfo)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   return pdata->phy_if.module_info(pdata, modinfo);
+}
+
+static int xgbe_get_module_eeprom(struct net_device *netdev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   return pdata->phy_if.module_eeprom(pdata, eeprom, data);
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
.get_drvinfo = xgbe_get_drvinfo,
.get_msglevel = xgbe_get_msglevel,
@@ -646,6 +662,8 @@ static int xgbe_get_ts_info(struct net_device *netdev,
.get_ts_info = xgbe_get_ts_info,
.get_link_ksettings = xgbe_get_link_ksettings,
.set_link_ksettings = xgbe_set_link_ksettings,
+   .get_module_info = xgbe_get_module_info,
+   .get_module_eeprom = xgbe_get_module_eeprom,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 1b45cd7..9c39c72 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -126,6 +126,24 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   if (!pdata->phy_if.phy_impl.module_eeprom)
+   return -ENXIO;
+
+   return pdata->phy_if.phy_impl.module_eeprom(pdata, eeprom, data);
+}
+
+static int xgbe_phy_module_info(struct xgbe_prv_data *pdata,
+   struct ethtool_modinfo *modinfo)
+{
+   if (!pdata->phy_if.phy_impl.module_info)
+   return -ENXIO;
+
+   return pdata->phy_if.phy_impl.module_info(pdata, modinfo);
+}
+
 static void xgbe_an37_clear_interrupts(struct xgbe_prv_data *pdata)
 {
int reg;
@@ -1639,4 +1657,7 @@ void xgbe_init_function_ptrs_phy(struct xgbe_phy_if 
*phy_if)
phy_if->phy_valid_speed = xgbe_phy_valid_speed;
 
phy_if->an_isr  = xgbe_an_combined_isr;
+
+   phy_if->module_info = xgbe_phy_module_info;
+   phy_if->module_eeprom   = xgbe_phy_module_eeprom;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index cb15caf..141bb13 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -119,6 +119,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "xgbe.h"
 #include "xgbe-common.h"
@@ -270,6 +271,15 @@ struct xgbe_sfp_eeprom {
u8 vendor[32];
 };
 
+#define XGBE_SFP_DIAGS_SUPPORTED(_x)   \
+   ((_x)->extd[XGBE_SFP_EXTD_SFF_8472] &&  \
+!((_x)->extd[XGBE_SFP_EXTD_DIAG] & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
+
+#define XGBE_SFP_EEPROM_BASE_LEN   256
+#define XGBE_SFP_EEPROM_DIAG_LEN   256
+#define XGBE_SFP_EEPROM_MAX(XGBE_SFP_EEPROM_BASE_LEN + \
+XGBE_SFP_EEPROM_DIAG_LEN)
+
 #define XGBE_BEL_FUSE_VENDOR   "BEL-FUSE"
 #define XGBE_BEL_FUSE_PARTNO   "1GBT-SFP06  "
 
@@ -1301,6 +1311,130 @@ static void xgbe_phy_sfp_detect(struct xgbe_prv_data 
*pdata)
xgbe_phy_put_comm_ownership(pdata);
 }
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+   struct xgbe_phy_data *phy_data = pdata->phy_data;
+   u8 eeprom_addr, eeprom_data[XGBE_SFP_EEPROM_MAX];
+   struct xgbe_sfp_eeprom *sfp_eeprom;
+   unsigned int i, j, rem;
+   int ret;
+
+   rem = eeprom->len;
+
+   if (!eeprom->len) {
+   ret = -EINVAL;
+   goto done;
+   }
+
+   if ((eeprom->offset + eeprom->len) > XGBE_SFP_EEPROM_MAX) {
+   ret = -EINVAL;
+   goto done;
+   }
+
+   if (phy_data->port_mode != XGBE_PORT_MODE_SFP) {
+

[PATCH net-next 07/12] amd-xgbe: Prepare for ethtool set-channel support

2018-05-21 Thread Tom Lendacky
In order to support being able to dynamically set/change the number of
Rx and Tx channels, update the code to:
 - Move alloc and free of device memory into callable functions
 - Move setting of the real number of Rx and Tx channels to device startup
 - Move mapping of the RSS channels to device startup

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |  108 ++---
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |   20 -
 2 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2646c08..397e3a0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1312,14 +1312,72 @@ int xgbe_powerup(struct net_device *netdev, unsigned 
int caller)
return 0;
 }
 
+static void xgbe_free_memory(struct xgbe_prv_data *pdata)
+{
   struct xgbe_desc_if *desc_if = &pdata->desc_if;
+
+   /* Free the ring descriptors and buffers */
+   desc_if->free_ring_resources(pdata);
+
+   /* Free the channel and ring structures */
+   xgbe_free_channels(pdata);
+}
+
+static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
+{
   struct xgbe_desc_if *desc_if = &pdata->desc_if;
+   struct net_device *netdev = pdata->netdev;
+   int ret;
+
+   /* Calculate the Rx buffer size before allocating rings */
+   pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
+
+   /* Allocate the channel and ring structures */
+   ret = xgbe_alloc_channels(pdata);
+   if (ret)
+   return ret;
+
+   /* Allocate the ring descriptors and buffers */
+   ret = desc_if->alloc_ring_resources(pdata);
+   if (ret)
+   goto err_channels;
+
+   /* Initialize the service and Tx timers */
+   xgbe_init_timers(pdata);
+
+   return 0;
+
+err_channels:
+   xgbe_free_memory(pdata);
+
+   return ret;
+}
+
 static int xgbe_start(struct xgbe_prv_data *pdata)
 {
	struct xgbe_hw_if *hw_if = &pdata->hw_if;
	struct xgbe_phy_if *phy_if = &pdata->phy_if;
struct net_device *netdev = pdata->netdev;
+   unsigned int i;
int ret;
 
-   DBGPR("-->xgbe_start\n");
+   /* Set the number of queues */
+   ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
+   if (ret) {
+   netdev_err(netdev, "error setting real tx queue count\n");
+   return ret;
+   }
+
+   ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
+   if (ret) {
+   netdev_err(netdev, "error setting real rx queue count\n");
+   return ret;
+   }
+
+   /* Set RSS lookup table data for programming */
+   for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
+   XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
+  i % pdata->rx_ring_count);
 
ret = hw_if->init(pdata);
if (ret)
@@ -1347,8 +1405,6 @@ static int xgbe_start(struct xgbe_prv_data *pdata)
 
	clear_bit(XGBE_STOPPED, &pdata->dev_state);
 
-   DBGPR("<--xgbe_start\n");
-
return 0;
 
 err_irqs:
@@ -1823,11 +1879,8 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata,
 static int xgbe_open(struct net_device *netdev)
 {
struct xgbe_prv_data *pdata = netdev_priv(netdev);
-   struct xgbe_desc_if *desc_if = &pdata->desc_if;
int ret;
 
-   DBGPR("-->xgbe_open\n");
-
/* Create the various names based on netdev name */
snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
 netdev_name(netdev));
@@ -1872,43 +1925,25 @@ static int xgbe_open(struct net_device *netdev)
goto err_sysclk;
}
 
-   /* Calculate the Rx buffer size before allocating rings */
-   ret = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
-   if (ret < 0)
-   goto err_ptpclk;
-   pdata->rx_buf_size = ret;
-
-   /* Allocate the channel and ring structures */
-   ret = xgbe_alloc_channels(pdata);
-   if (ret)
-   goto err_ptpclk;
-
-   /* Allocate the ring descriptors and buffers */
-   ret = desc_if->alloc_ring_resources(pdata);
-   if (ret)
-   goto err_channels;
-
	INIT_WORK(&pdata->service_work, xgbe_service);
	INIT_WORK(&pdata->restart_work, xgbe_restart);
	INIT_WORK(&pdata->stopdev_work, xgbe_stopdev);
	INIT_WORK(&pdata->tx_tstamp_work, xgbe_tx_tstamp);
-   xgbe_init_timers(pdata);
+
+   ret = xgbe_alloc_memory(pdata);
+   if (ret)
+   goto err_ptpclk;
 
ret = xgbe_start(pdata);
if (ret)
-   goto err_rings;
+   goto err_mem;
 
	clear_bit(XGBE_DOWN, &pdata->dev_state);
 
-   DBGPR("<--xgbe_open\

[PATCH net-next 06/12] amd-xgbe: Add ethtool show/set ring parameter support

2018-05-21 Thread Tom Lendacky
Add ethtool support to show and set the number of the Rx and Tx ring
descriptors.  Changing the ring configuration will result in a device
restart.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |6 --
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   65 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |6 ++
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 7c204f0..2646c08 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1426,10 +1426,8 @@ static void xgbe_stopdev(struct work_struct *work)
netdev_alert(pdata->netdev, "device stopped\n");
 }
 
-static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
+void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
-   DBGPR("-->xgbe_restart_dev\n");
-
/* If not running, "restart" will happen on open */
if (!netif_running(pdata->netdev))
return;
@@ -1440,8 +1438,6 @@ static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
xgbe_free_rx_data(pdata);
 
xgbe_start(pdata);
-
-   DBGPR("<--xgbe_restart_dev\n");
 }
 
 static void xgbe_restart(struct work_struct *work)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 57394b77..d12f982 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -642,6 +642,69 @@ static int xgbe_get_module_eeprom(struct net_device 
*netdev,
return pdata->phy_if.module_eeprom(pdata, eeprom, data);
 }
 
+static void xgbe_get_ringparam(struct net_device *netdev,
+  struct ethtool_ringparam *ringparam)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+   ringparam->rx_max_pending = XGBE_RX_DESC_CNT_MAX;
+   ringparam->tx_max_pending = XGBE_TX_DESC_CNT_MAX;
+   ringparam->rx_pending = pdata->rx_desc_count;
+   ringparam->tx_pending = pdata->tx_desc_count;
+}
+
+static int xgbe_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ringparam)
+{
+   struct xgbe_prv_data *pdata = netdev_priv(netdev);
+   unsigned int rx, tx;
+
+   if (ringparam->rx_mini_pending || ringparam->rx_jumbo_pending) {
+   netdev_err(netdev, "unsupported ring parameter\n");
+   return -EINVAL;
+   }
+
+   if ((ringparam->rx_pending < XGBE_RX_DESC_CNT_MIN) ||
+   (ringparam->rx_pending > XGBE_RX_DESC_CNT_MAX)) {
+   netdev_err(netdev,
+  "rx ring parameter must be between %u and %u\n",
+  XGBE_RX_DESC_CNT_MIN, XGBE_RX_DESC_CNT_MAX);
+   return -EINVAL;
+   }
+
+   if ((ringparam->tx_pending < XGBE_TX_DESC_CNT_MIN) ||
+   (ringparam->tx_pending > XGBE_TX_DESC_CNT_MAX)) {
+   netdev_err(netdev,
+  "tx ring parameter must be between %u and %u\n",
+  XGBE_TX_DESC_CNT_MIN, XGBE_TX_DESC_CNT_MAX);
+   return -EINVAL;
+   }
+
+   rx = __rounddown_pow_of_two(ringparam->rx_pending);
+   if (rx != ringparam->rx_pending)
+   netdev_notice(netdev,
+ "rx ring parameter rounded to power of two: %u\n",
+ rx);
+
+   tx = __rounddown_pow_of_two(ringparam->tx_pending);
+   if (tx != ringparam->tx_pending)
+   netdev_notice(netdev,
+ "tx ring parameter rounded to power of two: %u\n",
+ tx);
+
+   if ((rx == pdata->rx_desc_count) &&
+   (tx == pdata->tx_desc_count))
+   goto out;
+
+   pdata->rx_desc_count = rx;
+   pdata->tx_desc_count = tx;
+
+   xgbe_restart_dev(pdata);
+
+out:
+   return 0;
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
.get_drvinfo = xgbe_get_drvinfo,
.get_msglevel = xgbe_get_msglevel,
@@ -664,6 +727,8 @@ static int xgbe_get_module_eeprom(struct net_device *netdev,
.set_link_ksettings = xgbe_set_link_ksettings,
.get_module_info = xgbe_get_module_info,
.get_module_eeprom = xgbe_get_module_eeprom,
+   .get_ringparam = xgbe_get_ringparam,
+   .set_ringparam = xgbe_set_ringparam,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h 
b/drivers/net/ethernet/amd/xgbe/xgbe.h
index f0f455b..7dc0fac 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -144,6 +144,11 @@
 #define XGBE_TX_DESC_MAX_PROC  (XGBE_T

[PATCH net-next 04/12] amd-xgbe: Remove field that indicates SFP diagnostic support

2018-05-21 Thread Tom Lendacky
The driver currently sets an indication of whether the SFP supports, and
that the driver can obtain, diagnostics data.  This isn't currently used
by the driver and the logic to set this indicator is flawed because the
field is cleared each time the SFP is checked and only set when a new SFP
is detected.  Remove this field and the logic supporting it.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 05003be..cb15caf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -343,7 +343,6 @@ struct xgbe_phy_data {
unsigned int sfp_rx_los;
unsigned int sfp_tx_fault;
unsigned int sfp_mod_absent;
-   unsigned int sfp_diags;
unsigned int sfp_changed;
unsigned int sfp_phy_avail;
unsigned int sfp_cable_len;
@@ -1211,13 +1210,6 @@ static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data 
*pdata)
 
	memcpy(&phy_data->sfp_eeprom, &sfp_eeprom, sizeof(sfp_eeprom));
 
-   if (sfp_eeprom.extd[XGBE_SFP_EXTD_SFF_8472]) {
-   u8 diag_type = sfp_eeprom.extd[XGBE_SFP_EXTD_DIAG];
-
-   if (!(diag_type & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
-   phy_data->sfp_diags = 1;
-   }
-
xgbe_phy_free_phy_device(pdata);
} else {
phy_data->sfp_changed = 0;
@@ -1267,7 +1259,6 @@ static void xgbe_phy_sfp_reset(struct xgbe_phy_data 
*phy_data)
phy_data->sfp_rx_los = 0;
phy_data->sfp_tx_fault = 0;
phy_data->sfp_mod_absent = 1;
-   phy_data->sfp_diags = 0;
phy_data->sfp_base = XGBE_SFP_BASE_UNKNOWN;
phy_data->sfp_cable = XGBE_SFP_CABLE_UNKNOWN;
phy_data->sfp_speed = XGBE_SFP_SPEED_UNKNOWN;



[PATCH net-next 03/12] amd-xgbe: Remove use of comm_owned field

2018-05-21 Thread Tom Lendacky
The comm_owned field can hide logic where double locking is attempted
and prevent multiple threads for the same device from accessing the
mutex properly.  Remove the comm_owned field and use the mutex API
exclusively for gaining ownership.  The current driver has been audited
and is obtaining communications ownership properly.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 123ceb0..05003be 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -327,8 +327,6 @@ struct xgbe_phy_data {
 
unsigned int mdio_addr;
 
-   unsigned int comm_owned;
-
/* SFP Support */
enum xgbe_sfp_comm sfp_comm;
unsigned int sfp_mux_address;
@@ -382,12 +380,6 @@ struct xgbe_phy_data {
 static int xgbe_phy_i2c_xfer(struct xgbe_prv_data *pdata,
 struct xgbe_i2c_op *i2c_op)
 {
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-   /* Be sure we own the bus */
-   if (WARN_ON(!phy_data->comm_owned))
-   return -EIO;
-
return pdata->i2c_if.i2c_xfer(pdata, i2c_op);
 }
 
@@ -549,10 +541,6 @@ static int xgbe_phy_sfp_get_mux(struct xgbe_prv_data 
*pdata)
 
 static void xgbe_phy_put_comm_ownership(struct xgbe_prv_data *pdata)
 {
-   struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-   phy_data->comm_owned = 0;
-
	mutex_unlock(&xgbe_phy_comm_lock);
 }
 
@@ -562,9 +550,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data 
*pdata)
unsigned long timeout;
unsigned int mutex_id;
 
-   if (phy_data->comm_owned)
-   return 0;
-
/* The I2C and MDIO/GPIO bus is multiplexed between multiple devices,
 * the driver needs to take the software mutex and then the hardware
 * mutexes before being able to use the busses.
@@ -593,7 +578,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data 
*pdata)
XP_IOWRITE(pdata, XP_I2C_MUTEX, mutex_id);
XP_IOWRITE(pdata, XP_MDIO_MUTEX, mutex_id);
 
-   phy_data->comm_owned = 1;
return 0;
}
 



[PATCH net-next 02/12] amd-xgbe: Read and save the port property registers during probe

2018-05-21 Thread Tom Lendacky
Read and save the port property registers once during the device probe
and then use the saved values as they are needed.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c|   34 ++
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   68 ---
 drivers/net/ethernet/amd/xgbe/xgbe.h|7 +++
 3 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 7b63521..7b86240 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -335,12 +335,29 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
pdata->awcr = XGBE_DMA_PCI_AWCR;
pdata->awarcr = XGBE_DMA_PCI_AWARCR;
 
+   /* Read the port property registers */
+   pdata->pp0 = XP_IOREAD(pdata, XP_PROP_0);
+   pdata->pp1 = XP_IOREAD(pdata, XP_PROP_1);
+   pdata->pp2 = XP_IOREAD(pdata, XP_PROP_2);
+   pdata->pp3 = XP_IOREAD(pdata, XP_PROP_3);
+   pdata->pp4 = XP_IOREAD(pdata, XP_PROP_4);
+   if (netif_msg_probe(pdata)) {
+   dev_dbg(dev, "port property 0 = %#010x\n", pdata->pp0);
+   dev_dbg(dev, "port property 1 = %#010x\n", pdata->pp1);
+   dev_dbg(dev, "port property 2 = %#010x\n", pdata->pp2);
+   dev_dbg(dev, "port property 3 = %#010x\n", pdata->pp3);
+   dev_dbg(dev, "port property 4 = %#010x\n", pdata->pp4);
+   }
+
/* Set the maximum channels and queues */
-   reg = XP_IOREAD(pdata, XP_PROP_1);
-   pdata->tx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_DMA);
-   pdata->rx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_DMA);
-   pdata->tx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_QUEUES);
-   pdata->rx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_QUEUES);
+   pdata->tx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+ MAX_TX_DMA);
+   pdata->rx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+ MAX_RX_DMA);
+   pdata->tx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+   MAX_TX_QUEUES);
+   pdata->rx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+   MAX_RX_QUEUES);
if (netif_msg_probe(pdata)) {
dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
pdata->tx_max_channel_count,
@@ -353,12 +370,13 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
xgbe_set_counts(pdata);
 
/* Set the maximum fifo amounts */
-   reg = XP_IOREAD(pdata, XP_PROP_2);
-   pdata->tx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, TX_FIFO_SIZE);
+   pdata->tx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+ TX_FIFO_SIZE);
pdata->tx_max_fifo_size *= 16384;
pdata->tx_max_fifo_size = min(pdata->tx_max_fifo_size,
  pdata->vdata->tx_max_fifo_size);
-   pdata->rx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, RX_FIFO_SIZE);
+   pdata->rx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+ RX_FIFO_SIZE);
pdata->rx_max_fifo_size *= 16384;
pdata->rx_max_fifo_size = min(pdata->rx_max_fifo_size,
  pdata->vdata->rx_max_fifo_size);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index aac8843..123ceb0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -2421,22 +2421,21 @@ static int xgbe_phy_link_status(struct xgbe_prv_data 
*pdata, int *an_restart)
 static void xgbe_phy_sfp_gpio_setup(struct xgbe_prv_data *pdata)
 {
struct xgbe_phy_data *phy_data = pdata->phy_data;
-   unsigned int reg;
-
-   reg = XP_IOREAD(pdata, XP_PROP_3);
 
phy_data->sfp_gpio_address = XGBE_GPIO_ADDRESS_PCA9555 +
-XP_GET_BITS(reg, XP_PROP_3, GPIO_ADDR);
+XP_GET_BITS(pdata->pp3, XP_PROP_3,
+GPIO_ADDR);
 
-   phy_data->sfp_gpio_mask = XP_GET_BITS(reg, XP_PROP_3, GPIO_MASK);
+   phy_data->sfp_gpio_mask = XP_GET_BITS(pdata->pp3, XP_PROP_3,
+ GPIO_MASK);
 
-   phy_data->sfp_gpio_rx_los = XP_GET_BITS(reg, XP_PROP_3,
+   phy_data->sfp_gpio_rx_los = XP_GET_BITS(pdata->

[PATCH net-next 00/12] amd-xgbe: AMD XGBE driver updates 2018-05-21

2018-05-21 Thread Tom Lendacky
The following updates are included in this driver update series:

- Fix the debug output for the max channels count
- Read (once) and save the port property registers during probe
- Remove the use of the comm_owned field
- Remove unused SFP diagnostic support indicator field
- Add ethtool --module-info support
- Add ethtool --show-ring/--set-ring support
- Update the driver in preparation for ethtool --set-channels support
- Add ethtool --show-channels/--set-channels support
- Update the driver to always perform link training in KR mode
- Advertise FEC support when using a KR re-driver
- Update the BelFuse quirk to now support SGMII
- Improve 100Mbps auto-negotiation for BelFuse parts

This patch series is based on net-next.

---

Tom Lendacky (12):
  amd-xgbe: Fix debug output of max channel counts
  amd-xgbe: Read and save the port property registers during probe
  amd-xgbe: Remove use of comm_owned field
  amd-xgbe: Remove field that indicates SFP diagnostic support
  amd-xgbe: Add ethtool support to retrieve SFP module info
  amd-xgbe: Add ethtool show/set ring parameter support
  amd-xgbe: Prepare for ethtool set-channel support
  amd-xgbe: Add ethtool show/set channels support
  amd-xgbe: Always attempt link training in KR mode
  amd-xgbe: Advertise FEC support with the KR re-driver
  amd-xgbe: Update the BelFuse quirk to support SGMII
  amd-xgbe: Improve SFP 100Mbps auto-negotiation


 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |  137 +++---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  214 
 drivers/net/ethernet/amd/xgbe/xgbe-main.c|   20 -
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|  167 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |   36 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  349 +++---
 drivers/net/ethernet/amd/xgbe/xgbe.h |   31 ++
 7 files changed, 696 insertions(+), 258 deletions(-)

-- 
Tom Lendacky


[PATCH net-next 01/12] amd-xgbe: Fix debug output of max channel counts

2018-05-21 Thread Tom Lendacky
A debug output print statement uses the wrong variable to output the
maximum Rx channel count (cut and paste error, basically).  Fix the
statement to use the proper variable.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 82d1f41..7b63521 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -344,7 +344,7 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
if (netif_msg_probe(pdata)) {
dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
pdata->tx_max_channel_count,
-   pdata->tx_max_channel_count);
+   pdata->rx_max_channel_count);
dev_dbg(dev, "max tx/rx hw queue count = %u/%u\n",
pdata->tx_max_q_count, pdata->rx_max_q_count);
}



Re: [net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues

2018-05-21 Thread Tom Herbert
On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
<willemdebruijn.ker...@gmail.com> wrote:
> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
> <willemdebruijn.ker...@gmail.com> wrote:
>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <t...@herbertland.com> wrote:
>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>> <amritha.namb...@intel.com> wrote:
>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>> configuration set by the admin through the sysfs attribute
>>>> for each Tx queue. If the user configuration for receive
>>>> queue map does not apply, then the Tx queue selection falls back
>>>> to CPU map based selection and finally to hashing.
>>>>
>>>> Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
>>>> ---
>
>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>> +{
>>>> +#ifdef CONFIG_XPS
>>>> +   enum xps_map_type i = XPS_MAP_RXQS;
>>>> +   struct xps_dev_maps *dev_maps;
>>>> +   struct sock *sk = skb->sk;
>>>> +   int queue_index = -1;
>>>> +   unsigned int tci = 0;
>>>> +
>>>> +   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>> +   dev->ifindex == sk->sk_rx_ifindex)
>>>> +   tci = sk->sk_rx_queue_mapping;
>>>> +
>>>> +   rcu_read_lock();
>>>> +   while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>> +   if (i == XPS_MAP_CPUS)
>>>
>>> This while loop typifies exactly why I don't think the XPS maps should
>>> be an array.
>>
>> +1
>
> as a matter of fact, as enabling both cpu and rxqueue map at the same
> time makes no sense, only one map is needed at any one time. The
> only difference is in how it is indexed. It should probably not be possible
> to configure both at the same time. Keeping a single map probably also
> significantly simplifies patch 1/4.

Willem,

I think it might makes sense to have them both. Maybe one application
is spin polling that needs this, where others might be happy with
normal CPU mappings as default.

Tom


Re: [PATCH net-next] sctp: add support for SCTP_REUSE_PORT sockopt

2018-05-20 Thread Tom Herbert
On Sun, May 20, 2018 at 6:54 PM, Marcelo Ricardo Leitner
<marcelo.leit...@gmail.com> wrote:
> On Sun, May 20, 2018 at 08:50:59PM -0400, Neil Horman wrote:
>> On Sat, May 19, 2018 at 03:44:40PM +0800, Xin Long wrote:
>> > This feature is actually already supported by sk->sk_reuse which can be
>> > set by SO_REUSEADDR. But it's not working exactly as RFC6458 demands in
>> > section 8.1.27, like:
>> >
>> >   - This option only supports one-to-one style SCTP sockets
>> >   - This socket option must not be used after calling bind()
>> > or sctp_bindx().
>> >
>> > Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
>> > Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
>> > work in linux.
>> >
>> > This patch reuses sk->sk_reuse and works pretty much as SO_REUSEADDR,
>> > just with some extra setup limitations that are neeeded when it is being
>> > enabled.
>> >
>> > "It should be noted that the behavior of the socket-level socket option
>> > to reuse ports and/or addresses for SCTP sockets is unspecified", so it
>> > leaves SO_REUSEADDR as is for the compatibility.
>> >
>> > Signed-off-by: Xin Long <lucien@gmail.com>
>> > ---
>> >  include/uapi/linux/sctp.h |  1 +
>> >  net/sctp/socket.c | 48 
>> > +++
>> >  2 files changed, 49 insertions(+)
>> >
>> A few things:
>>
>> 1) I agree with Tom, this feature is a complete duplication of the 
>> SK_REUSEPORT
>> socket option.  I understand that this is an implementation of the option in 
>> the
>> RFC, but its definately a duplication of a feature, which makes several 
>> things
>> really messy.
>>
>> 2) The overloading of the sk_reuse opeion is a bad idea, for several reasons.
>> Chief among them is the behavioral interference between this patch and the
>> SO_REUSEADDR socket level option, that also sets this feature.  If you set
>> sk_reuse via SO_REUSEADDR, you will set the SCTP port reuse feature 
>> regardless
>> of the bind or 1:1/1:m state of the socket.  Vice versa, if you set this 
>> socket
>> option via the SCTP_PORT_REUSE option you will inadvertently turn on address
>> reuse for the socket.  We can't do that.
>
> Given your comments, going a bit further here, one other big
> implication is that a port would never be able to be considered to
> fully meet SCTP standards regarding reuse because a rogue application
> may always abuse of the socket level opt to gain access to the port.
>
There are mitigations in SO_REUSEPORT to prevent port hijacking. Don't
see why they can't be applied to SCTP.

Tom

> IOW, the patch allows the application to use such restrictions against
> itself and nothing else, which undermines the patch idea.
>
> I lack the knowledge on why the SCTP option was proposed in the RFC. I
> guess they had a good reason to add the restriction on 1:1/1:m style.
> Does the usage of the current imply in any risk to SCTP sockets? If
> yes, that would give some grounds for going forward with the SCTP
> option.
>
>>
>> Its a bit frustrating, since SO_REUSEPORT is widely available on multiple
>> operating systems, but isn't standard (AFAIK).  I would say however, given 
>> the
>> prevalence of the socket level option, we should likely advocate for the 
>> removal
>> of the sctp specific option, or at the least implement and document it as 
>> being
>
> Is it possible, to remove/deprecate an option once it is published on a RFC?
>
>> an alias for SO_REUSEPORT
>>
>>
>> As this stands however, its a NACK from me.
>>
>> Neil
>>


Re: [PATCH net-next] sctp: add support for SCTP_REUSE_PORT sockopt

2018-05-20 Thread Tom Herbert
On Sat, May 19, 2018 at 12:44 AM, Xin Long  wrote:
> This feature is actually already supported by sk->sk_reuse which can be
> set by SO_REUSEADDR. But it's not working exactly as RFC6458 demands in
> section 8.1.27, like:
>
>   - This option only supports one-to-one style SCTP sockets
>   - This socket option must not be used after calling bind()
> or sctp_bindx().
>
> Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
> Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
> work in linux.
>
How is this different than SO_REUSEPORT?

> This patch reuses sk->sk_reuse and works pretty much as SO_REUSEADDR,
> just with some extra setup limitations that are neeeded when it is being
> enabled.
>
> "It should be noted that the behavior of the socket-level socket option
> to reuse ports and/or addresses for SCTP sockets is unspecified", so it
> leaves SO_REUSEADDR as is for the compatibility.
>
> Signed-off-by: Xin Long 
> ---
>  include/uapi/linux/sctp.h |  1 +
>  net/sctp/socket.c | 48 
> +++
>  2 files changed, 49 insertions(+)
>
> diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
> index b64d583..c02986a 100644
> --- a/include/uapi/linux/sctp.h
> +++ b/include/uapi/linux/sctp.h
> @@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t;
>  #define SCTP_RECVNXTINFO   33
>  #define SCTP_DEFAULT_SNDINFO   34
>  #define SCTP_AUTH_DEACTIVATE_KEY   35
> +#define SCTP_REUSE_PORT36
>
>  /* Internal Socket Options. Some of the sctp library functions are
>   * implemented using these socket options.
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 1b4593b..8dfcc79 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4170,6 +4170,28 @@ static int 
> sctp_setsockopt_interleaving_supported(struct sock *sk,
> return retval;
>  }
>
> +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
> + unsigned int optlen)
> +{
> +   int val;
> +
> +   if (!sctp_style(sk, TCP))
> +   return -EOPNOTSUPP;
> +
> +   if (sctp_sk(sk)->ep->base.bind_addr.port)
> +   return -EFAULT;
> +
> +   if (optlen < sizeof(int))
> +   return -EINVAL;
> +
> +   if (get_user(val, (int __user *)optval))
> +   return -EFAULT;
> +
> +   sk->sk_reuse = val ? SK_CAN_REUSE : SK_NO_REUSE;
> +
> +   return 0;
> +}
> +
>  /* API 6.2 setsockopt(), getsockopt()
>   *
>   * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
> int optname,
> retval = sctp_setsockopt_interleaving_supported(sk, optval,
> optlen);
> break;
> +   case SCTP_REUSE_PORT:
> +   retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
> +   break;
> default:
> retval = -ENOPROTOOPT;
> break;
> @@ -7175,6 +7200,26 @@ static int 
> sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
> return retval;
>  }
>
> +static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
> + char __user *optval,
> + int __user *optlen)
> +{
> +   int val = 0;
> +
> +   if (len < sizeof(int))
> +   return -EINVAL;
> +
> +   len = sizeof(int);
> +   if (sk->sk_reuse != SK_NO_REUSE)
> +   val = 1;
> +   if (put_user(len, optlen))
> +   return -EFAULT;
> +   if (copy_to_user(optval, &val, len))
> +   return -EFAULT;
> +
> +   return 0;
> +}
> +
>  static int sctp_getsockopt(struct sock *sk, int level, int optname,
>char __user *optval, int __user *optlen)
>  {
> @@ -7370,6 +7415,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
> int optname,
> retval = sctp_getsockopt_interleaving_supported(sk, len, 
> optval,
> optlen);
> break;
> +   case SCTP_REUSE_PORT:
> +   retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
> +   break;
> default:
> retval = -ENOPROTOOPT;
> break;
> --
> 2.1.0
>


Re: [net-next PATCH v2 0/4] Symmetric queue selection using XPS for Rx queues

2018-05-17 Thread Tom Herbert
On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
 wrote:
> This patch series implements support for Tx queue selection based on
> Rx queue(s) map. This is done by configuring Rx queue(s) map per Tx-queue
> using sysfs attribute. If the user configuration for Rx queues does
> not apply, then the Tx queue selection falls back to XPS using CPUs and
> finally to hashing.
>
> XPS is refactored to support Tx queue selection based on either the
> CPUs map or the Rx-queues map. The config option CONFIG_XPS needs to be
> enabled. By default no receive queues are configured for the Tx queue.
>
> - /sys/class/net//queues/tx-*/xps_rxqs
>
> This is to enable sending packets on the same Tx-Rx queue pair as this

If I'm reading the patch correctly, isn't this mapping rxq to a set of
txqs (in other words not strictly queue pair which has other
connotations in NIC HW). It is important to make it clear that this
feature is no HW dependent.

> is useful for busy polling multi-threaded workloads where it is not
> possible to pin the threads to a CPU. This is a rework of Sridhar's
> patch for symmetric queueing via socket option:
> https://www.spinics.net/lists/netdev/msg453106.html
>
Please add something about how this was tested and what the
performance gain is to justify the feature.

> v2:
> - Added documentation in networking/scaling.txt
> - Added a simple routine to replace multiple ifdef blocks.
>
> ---
>
> Amritha Nambiar (4):
>   net: Refactor XPS for CPUs and Rx queues
>   net: Enable Tx queue selection based on Rx queues
>   net-sysfs: Add interface for Rx queue map per Tx queue
>   Documentation: Add explanation for XPS using Rx-queue map
>
>
>  Documentation/networking/scaling.txt |   58 +++-
>  include/linux/cpumask.h  |   11 +-
>  include/linux/netdevice.h|   72 ++
>  include/net/sock.h   |   18 +++
>  net/core/dev.c   |  242 
> +++---
>  net/core/net-sysfs.c |   85 
>  net/core/sock.c  |5 +
>  net/ipv4/tcp_input.c |7 +
>  net/ipv4/tcp_ipv4.c  |1
>  net/ipv4/tcp_minisocks.c |1
>  10 files changed, 404 insertions(+), 96 deletions(-)
>
> --


Re: [net-next PATCH v2 1/4] net: Refactor XPS for CPUs and Rx queues

2018-05-17 Thread Tom Herbert
On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
 wrote:
> Refactor XPS code to support Tx queue selection based on
> CPU map or Rx queue map.
>
> Signed-off-by: Amritha Nambiar 
> ---
>  include/linux/cpumask.h   |   11 ++
>  include/linux/netdevice.h |   72 +++-
>  net/core/dev.c|  208 
> +
>  net/core/net-sysfs.c  |4 -
>  4 files changed, 215 insertions(+), 80 deletions(-)
>
> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> index bf53d89..57f20a0 100644
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
>  #define cpu_active(cpu)((cpu) == 0)
>  #endif
>
> -/* verify cpu argument to cpumask_* operators */
> -static inline unsigned int cpumask_check(unsigned int cpu)
> +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
>  {
>  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
> -   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
> +   WARN_ON_ONCE(cpu >= bits);
>  #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +}
> +
> +/* verify cpu argument to cpumask_* operators */
> +static inline unsigned int cpumask_check(unsigned int cpu)
> +{
> +   cpu_max_bits_warn(cpu, nr_cpumask_bits);
> return cpu;
>  }
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 03ed492..c2eeb36 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -730,10 +730,21 @@ struct xps_map {
>   */
>  struct xps_dev_maps {
> struct rcu_head rcu;
> -   struct xps_map __rcu *cpu_map[0];
> +   struct xps_map __rcu *attr_map[0];
>  };
> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
> +
> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
> (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
> +
> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
> +   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
> +
> +enum xps_map_type {
> +   XPS_MAP_RXQS,
> +   XPS_MAP_CPUS,
> +   __XPS_MAP_MAX
> +};
> +
>  #endif /* CONFIG_XPS */
>
>  #define TC_MAX_QUEUE   16
> @@ -1891,7 +1902,7 @@ struct net_device {
> int watchdog_timeo;
>
>  #ifdef CONFIG_XPS
> -   struct xps_dev_maps __rcu *xps_maps;
> +   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>  #endif
>  #ifdef CONFIG_NET_CLS_ACT
> struct mini_Qdisc __rcu *miniq_egress;
> @@ -3229,6 +3240,61 @@ static inline void netif_wake_subqueue(struct 
> net_device *dev, u16 queue_index)
>  #ifdef CONFIG_XPS
>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> u16 index);
> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
> + u16 index, enum xps_map_type type);
> +
> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
> + unsigned int nr_bits)
> +{
> +   cpu_max_bits_warn(j, nr_bits);
> +   return test_bit(j, mask);
> +}
> +
> +static inline bool attr_test_online(unsigned long j,
> +   const unsigned long *online_mask,
> +   unsigned int nr_bits)
> +{
> +   cpu_max_bits_warn(j, nr_bits);
> +
> +   if (online_mask)
> +   return test_bit(j, online_mask);
> +
> +   if (j >= 0 && j < nr_bits)
> +   return true;
> +
> +   return false;
> +}
> +
> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
> +unsigned int nr_bits)
> +{
> +   /* -1 is a legal arg here. */
> +   if (n != -1)
> +   cpu_max_bits_warn(n, nr_bits);
> +
> +   if (srcp)
> +   return find_next_bit(srcp, nr_bits, n + 1);
> +
> +   return n + 1;
> +}
> +
> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
> +   const unsigned long *src2p,
> +   unsigned int nr_bits)
> +{
> +   /* -1 is a legal arg here. */
> +   if (n != -1)
> +   cpu_max_bits_warn(n, nr_bits);
> +
> +   if (src1p && src2p)
> +   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
> +   else if (src1p)
> +   return find_next_bit(src1p, nr_bits, n + 1);
> +   else if (src2p)
> +   return find_next_bit(src2p, nr_bits, n + 1);
> +
> +   return n + 1;
> +}
>  #else
>  static inline int netif_set_xps_queue(struct net_device *dev,
>   const struct cpumask *mask,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9f43901..7e5dfdb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
> *dev_maps,
>   

Re: [net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues

2018-05-17 Thread Tom Herbert
On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
 wrote:
> This patch adds support to pick Tx queue based on the Rx queue map
> configuration set by the admin through the sysfs attribute
> for each Tx queue. If the user configuration for receive
> queue map does not apply, then the Tx queue selection falls back
> to CPU map based selection and finally to hashing.
>
> Signed-off-by: Amritha Nambiar 
> Signed-off-by: Sridhar Samudrala 
> ---
>  include/net/sock.h   |   18 ++
>  net/core/dev.c   |   36 +---
>  net/core/sock.c  |5 +
>  net/ipv4/tcp_input.c |7 +++
>  net/ipv4/tcp_ipv4.c  |1 +
>  net/ipv4/tcp_minisocks.c |1 +
>  6 files changed, 61 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4f7c584..0613f63 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
>   * @skc_node: main hash linkage for various protocol lookup tables
>   * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   * @skc_tx_queue_mapping: tx queue number for this connection
> + * @skc_rx_queue_mapping: rx queue number for this connection
> + * @skc_rx_ifindex: rx ifindex for this connection
>   * @skc_flags: place holder for sk_flags
>   * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
> @@ -215,6 +217,10 @@ struct sock_common {
> struct hlist_nulls_node skc_nulls_node;
> };
> int skc_tx_queue_mapping;
> +#ifdef CONFIG_XPS
> +   int skc_rx_queue_mapping;
> +   int skc_rx_ifindex;

Isn't this increasing size of sock_common for a narrow use case functionality?

> +#endif
> union {
> int skc_incoming_cpu;
> u32 skc_rcv_wnd;
> @@ -326,6 +332,10 @@ struct sock {
>  #define sk_nulls_node  __sk_common.skc_nulls_node
>  #define sk_refcnt  __sk_common.skc_refcnt
>  #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
> +#ifdef CONFIG_XPS
> +#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
> +#define sk_rx_ifindex  __sk_common.skc_rx_ifindex
> +#endif
>
>  #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
>  #define sk_dontcopy_end__sk_common.skc_dontcopy_end
> @@ -1696,6 +1706,14 @@ static inline int sk_tx_queue_get(const struct sock 
> *sk)
> return sk ? sk->sk_tx_queue_mapping : -1;
>  }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
> +{
> +#ifdef CONFIG_XPS
> +   sk->sk_rx_ifindex = skb->skb_iif;
> +   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +#endif
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
> sk_tx_queue_clear(sk);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 7e5dfdb..4030368 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3458,18 +3458,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, 
> struct net_device *dev)
>  }
>  #endif /* CONFIG_NET_EGRESS */
>
> -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
> -{
>  #ifdef CONFIG_XPS
> -   struct xps_dev_maps *dev_maps;
> +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
> +  struct xps_dev_maps *dev_maps, unsigned int 
> tci)
> +{
> struct xps_map *map;
> int queue_index = -1;
>
> -   rcu_read_lock();
> -   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
> if (dev_maps) {
> -   unsigned int tci = skb->sender_cpu - 1;
> -
> if (dev->num_tc) {
> tci *= dev->num_tc;
> tci += netdev_get_prio_tc_map(dev, skb->priority);
> @@ -3486,6 +3482,32 @@ static inline int get_xps_queue(struct net_device 
> *dev, struct sk_buff *skb)
> queue_index = -1;
> }
> }
> +   return queue_index;
> +}
> +#endif
> +
> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
> +{
> +#ifdef CONFIG_XPS
> +   enum xps_map_type i = XPS_MAP_RXQS;
> +   struct xps_dev_maps *dev_maps;
> +   struct sock *sk = skb->sk;
> +   int queue_index = -1;
> +   unsigned int tci = 0;
> +
> +   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
> +   dev->ifindex == sk->sk_rx_ifindex)
> +   tci = sk->sk_rx_queue_mapping;
> +
> +   rcu_read_lock();
> +   while (queue_index < 0 && i < __XPS_MAP_MAX) {
> +   if (i == XPS_MAP_CPUS)

This while loop typifies exactly why I don't think the XPS maps should
be 

Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues

2018-05-14 Thread Tom Herbert
On Wed, May 9, 2018 at 1:54 PM, Nambiar, Amritha
<amritha.namb...@intel.com> wrote:
> On 5/9/2018 1:31 PM, Tom Herbert wrote:
>> On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
>> <amritha.namb...@intel.com> wrote:
>>> Refactor XPS code to support Tx queue selection based on
>>> CPU map or Rx queue map.
>>>
>>> Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
>>> ---
>>>  include/linux/netdevice.h |   82 +-
>>>  net/core/dev.c|  206 
>>> +
>>>  net/core/net-sysfs.c  |4 -
>>>  3 files changed, 216 insertions(+), 76 deletions(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index 14e0777..40a9171 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -730,10 +730,21 @@ struct xps_map {
>>>   */
>>>  struct xps_dev_maps {
>>> struct rcu_head rcu;
>>> -   struct xps_map __rcu *cpu_map[0];
>>> +   struct xps_map __rcu *attr_map[0];
>>
>> This seems unnecessarily complicated to me. Why not just add another
>> map called something like "rxq2txq_map". Then when selecting TXQ just
>> check the new map first and then the normal cpu_map if there's not a
>> hit.
>>
>
> This is just a change in the name to something more generic ('attr')
> since the maps can either be cpu based or rxq based. I have added two
> map types, XPS_MAP_RXQS, XPS_MAP_CPUS and the TXQ selection (in patch

I think adding map types is overkill and we really don't want to turn
this into a generic but complex interface with a bunch of map types.
Just have two pointers to the two different maps.

> 2/3) works how you described,  first based on the RXQ map and if there
> is no hit, falls to the normal CPU map.
>
>>>  };
>>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
>>> +
>>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
>>> (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>>> +
>>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>>> +   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>>> +
>>> +enum xps_map_type {
>>> +   XPS_MAP_RXQS,
>>> +   XPS_MAP_CPUS,
>>> +   __XPS_MAP_MAX
>>> +};
>>> +
>>>  #endif /* CONFIG_XPS */
>>>
>>>  #define TC_MAX_QUEUE   16
>>> @@ -1867,7 +1878,7 @@ struct net_device {
>>> int watchdog_timeo;
>>>
>>>  #ifdef CONFIG_XPS
>>> -   struct xps_dev_maps __rcu *xps_maps;
>>> +   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>>>  #endif
>>>  #ifdef CONFIG_NET_CLS_ACT
>>> struct mini_Qdisc __rcu *miniq_egress;
>>> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct 
>>> net_device *dev, u16 queue_index)
>>>  #ifdef CONFIG_XPS
>>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>> u16 index);
>>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long 
>>> *mask,
>>> + u16 index, enum xps_map_type type);
>>> +
>>> +static inline bool attr_test_mask(unsigned long j, const unsigned long 
>>> *mask,
>>> + unsigned int nr_bits)
>>> +{
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +   WARN_ON_ONCE(j >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>
>> This #ifdef block appears 3 times in the patch. Seems like it should
>> be replace by simple macro.
>
> Sure, will do in the next version.
>
>>
>>> +   return test_bit(j, mask);
>>> +}
>>> +
>>> +static inline bool attr_test_online(unsigned long j,
>>> +   const unsigned long *online_mask,
>>> +   unsigned int nr_bits)
>>> +{
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +   WARN_ON_ONCE(j >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>> +
>>> +   if (online_mask)
>>> +   return test_bit(j, online_mask);
>>> +
>>> +   if (j >= 0 && j < nr_bits)
>>> +   return true;
>>> +
>>> +   return false;
>&

Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues

2018-05-09 Thread Tom Herbert
On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
 wrote:
> Refactor XPS code to support Tx queue selection based on
> CPU map or Rx queue map.
>
> Signed-off-by: Amritha Nambiar 
> ---
>  include/linux/netdevice.h |   82 +-
>  net/core/dev.c|  206 
> +
>  net/core/net-sysfs.c  |4 -
>  3 files changed, 216 insertions(+), 76 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 14e0777..40a9171 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -730,10 +730,21 @@ struct xps_map {
>   */
>  struct xps_dev_maps {
> struct rcu_head rcu;
> -   struct xps_map __rcu *cpu_map[0];
> +   struct xps_map __rcu *attr_map[0];

This seems unnecessarily complicated to me. Why not just add another
map called something like "rxq2txq_map". Then when selecting TXQ just
check the new map first and then the normal cpu_map if there's not a
hit.

>  };
> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
> +
> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
> (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
> +
> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
> +   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
> +
> +enum xps_map_type {
> +   XPS_MAP_RXQS,
> +   XPS_MAP_CPUS,
> +   __XPS_MAP_MAX
> +};
> +
>  #endif /* CONFIG_XPS */
>
>  #define TC_MAX_QUEUE   16
> @@ -1867,7 +1878,7 @@ struct net_device {
> int watchdog_timeo;
>
>  #ifdef CONFIG_XPS
> -   struct xps_dev_maps __rcu *xps_maps;
> +   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>  #endif
>  #ifdef CONFIG_NET_CLS_ACT
> struct mini_Qdisc __rcu *miniq_egress;
> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct 
> net_device *dev, u16 queue_index)
>  #ifdef CONFIG_XPS
>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> u16 index);
> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
> + u16 index, enum xps_map_type type);
> +
> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
> + unsigned int nr_bits)
> +{
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +   WARN_ON_ONCE(j >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */

This #ifdef block appears 3 times in the patch. Seems like it should
be replaced by a simple macro.

> +   return test_bit(j, mask);
> +}
> +
> +static inline bool attr_test_online(unsigned long j,
> +   const unsigned long *online_mask,
> +   unsigned int nr_bits)
> +{
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +   WARN_ON_ONCE(j >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +
> +   if (online_mask)
> +   return test_bit(j, online_mask);
> +
> +   if (j >= 0 && j < nr_bits)
> +   return true;
> +
> +   return false;
> +}
> +
> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
> +unsigned int nr_bits)
> +{
> +   /* -1 is a legal arg here. */
> +   if (n != -1) {
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +   WARN_ON_ONCE(n >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +   }
> +
> +   if (srcp)
> +   return find_next_bit(srcp, nr_bits, n + 1);
> +
> +   return n + 1;
> +}
> +
> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
> +   const unsigned long *src2p,
> +   unsigned int nr_bits)
> +{
> +   /* -1 is a legal arg here. */
> +   if (n != -1) {
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +   WARN_ON_ONCE(n >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +   }
> +
> +   if (src1p && src2p)
> +   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
> +   else if (src1p)
> +   return find_next_bit(src1p, nr_bits, n + 1);
> +   else if (src2p)
> +   return find_next_bit(src2p, nr_bits, n + 1);
> +
> +   return n + 1;
> +}
>  #else
>  static inline int netif_set_xps_queue(struct net_device *dev,
>   const struct cpumask *mask,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a490ef6..17c4883 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
> *dev_maps,
> int pos;
>
> if (dev_maps)
> -   map = xmap_dereference(dev_maps->cpu_map[tci]);
> +   map = xmap_dereference(dev_maps->attr_map[tci]);
> if (!map)
> return false;
>
> @@ -2105,7 +2105,7 @@ 

Re: [net-next PATCH 0/3] Symmetric queue selection using XPS for Rx queues

2018-05-08 Thread Tom Herbert
On Thu, Apr 19, 2018 at 7:41 PM, Eric Dumazet <eduma...@google.com> wrote:
> On Thu, Apr 19, 2018 at 6:07 PM Amritha Nambiar <amritha.namb...@intel.com>
> wrote:
>
>> This patch series implements support for Tx queue selection based on
>> Rx queue map. This is done by configuring Rx queue map per Tx-queue
>> using sysfs attribute. If the user configuration for Rx queues does
>> not apply, then the Tx queue selection falls back to XPS using CPUs and
>> finally to hashing.
>
>> XPS is refactored to support Tx queue selection based on either the
>> CPU map or the Rx-queue map. The config option CONFIG_XPS needs to be
>> enabled. By default no receive queues are configured for the Tx queue.
>
>> - /sys/class/net/eth0/queues/tx-*/xps_rxqs
>
>> This is to enable sending packets on the same Tx-Rx queue pair as this
>> is useful for busy polling multi-threaded workloads where it is not
>> possible to pin the threads to a CPU. This is a rework of Sridhar's
>> patch for symmetric queueing via socket option:
>> https://www.spinics.net/lists/netdev/msg453106.html
>
I suspect this is an artifact of flow director, which I believe
required queue pairs to be able to work (i.e. the receive queue chosen
by hardware is determined by the send queue). But that was only required
because of hardware design; I don't see the rationale for introducing queue
pairs in the software stack. There's no need to correlate the transmit
path with receive path, no need to enforce a 1-1 mapping between RX
and TX queues, and the OOO mitigations should be sufficient when TX
queue changes for a flow.

Tom

>> ---
>
>> Amritha Nambiar (3):
>>net: Refactor XPS for CPUs and Rx queues
>>net: Enable Tx queue selection based on Rx queues
>>net-sysfs: Add interface for Rx queue map per Tx queue
>
>
>>   include/linux/netdevice.h |   82 +++
>>   include/net/sock.h|   18 +++
>>   net/core/dev.c|  240
> +++--
>>   net/core/net-sysfs.c  |   85 
>>   net/core/sock.c   |5 +
>>   net/ipv4/tcp_input.c  |7 +
>>   net/ipv4/tcp_ipv4.c   |1
>>   net/ipv4/tcp_minisocks.c  |1
>>   8 files changed, 357 insertions(+), 82 deletions(-)
>
>
> Without a clear documentation (for example in
> Documentation/networking/scaling.txt)
> , I really do not understand what problem you want to solve, and why we
> need ~300 additional LOC in kernel.
>
> Referring to an old thread (
> https://www.spinics.net/lists/netdev/msg453106.html ) is not the way to go.
>
> Sorry :/


Re: [PATCH 7/8] rhashtable: add rhashtable_walk_prev()

2018-05-05 Thread Tom Herbert
On Sat, May 5, 2018 at 2:43 AM, Herbert Xu <herb...@gondor.apana.org.au> wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> rhashtable_walk_prev() returns the object returned by
>> the previous rhashtable_walk_next(), providing it is still in the
>> table (or was during this grace period).
>> This works even if rhashtable_walk_stop() and rhashtable_talk_start()
>> have been called since the last rhashtable_walk_next().
>>
>> If there have been no calls to rhashtable_walk_next(), or if the
>> object is gone from the table, then NULL is returned.
>>
>> This can usefully be used in a seq_file ->start() function.
>> If the pos is the same as was returned by the last ->next() call,
>> then rhashtable_walk_prev() can be used to re-establish the
>> current location in the table.  If it returns NULL, then
>> rhashtable_walk_next() should be used.
>>
>> Signed-off-by: NeilBrown <ne...@suse.com>
>
> I will ack this if Tom is OK with replacing peek with it.
>
I'm not following why this is any better than peek. The point of
having rhashtable_walk_peek is to allow the caller to get the
current element, not the next one. This is needed when the table is read
in multiple parts and we need to pick up with what was returned from the
last call to rhashtable_walk_next (which apparently is what this patch
is also trying to do).

There is one significant difference in that peek will return the
element in the table regardless of where the iterator is at (this is
why peek can move the iterator) and only returns NULL at end of table.
As mentioned above, rhashtable_walk_prev can return NULL, in which case
the caller then needs to fall back to rhashtable_walk_next to get the
previous element. Doing a peek is a lot cleaner and more straightforward
API in this regard.

Tom

> Cheers,
> --
> Email: Herbert Xu <herb...@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH net 3/3] amd-xgbe: Only use the SFP supported transceiver signals

2018-04-23 Thread Tom Lendacky
The SFP eeprom indicates the transceiver signals (Rx LOS, Tx Fault, etc.)
that it supports.  Update the driver to include checking the eeprom data
when deciding whether to use a transceiver signal.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   71 +--
 1 file changed, 54 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index b48efc0..aac8843 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -253,6 +253,10 @@ enum xgbe_sfp_speed {
 #define XGBE_SFP_BASE_VENDOR_SN4
 #define XGBE_SFP_BASE_VENDOR_SN_LEN16
 
+#define XGBE_SFP_EXTD_OPT1 1
+#define XGBE_SFP_EXTD_OPT1_RX_LOS  BIT(1)
+#define XGBE_SFP_EXTD_OPT1_TX_FAULTBIT(3)
+
 #define XGBE_SFP_EXTD_DIAG 28
 #define XGBE_SFP_EXTD_DIAG_ADDR_CHANGE BIT(2)
 
@@ -332,6 +336,7 @@ struct xgbe_phy_data {
 
unsigned int sfp_gpio_address;
unsigned int sfp_gpio_mask;
+   unsigned int sfp_gpio_inputs;
unsigned int sfp_gpio_rx_los;
unsigned int sfp_gpio_tx_fault;
unsigned int sfp_gpio_mod_absent;
@@ -986,6 +991,49 @@ static void xgbe_phy_sfp_external_phy(struct xgbe_prv_data 
*pdata)
phy_data->sfp_phy_avail = 1;
 }
 
+static bool xgbe_phy_check_sfp_rx_los(struct xgbe_phy_data *phy_data)
+{
+   u8 *sfp_extd = phy_data->sfp_eeprom.extd;
+
+   if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_RX_LOS))
+   return false;
+
+   if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS)
+   return false;
+
+   if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_rx_los))
+   return true;
+
+   return false;
+}
+
+static bool xgbe_phy_check_sfp_tx_fault(struct xgbe_phy_data *phy_data)
+{
+   u8 *sfp_extd = phy_data->sfp_eeprom.extd;
+
+   if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_TX_FAULT))
+   return false;
+
+   if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT)
+   return false;
+
+   if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_tx_fault))
+   return true;
+
+   return false;
+}
+
+static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
+{
+   if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT)
+   return false;
+
+   if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_mod_absent))
+   return true;
+
+   return false;
+}
+
 static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
 {
struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1031,6 +1079,10 @@ static void xgbe_phy_sfp_parse_eeprom(struct 
xgbe_prv_data *pdata)
if (sfp_base[XGBE_SFP_BASE_EXT_ID] != XGBE_SFP_EXT_ID_SFP)
return;
 
+   /* Update transceiver signals (eeprom extd/options) */
+   phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
+   phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
+
if (xgbe_phy_sfp_parse_quirks(pdata))
return;
 
@@ -1196,7 +1248,6 @@ static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data 
*pdata)
 static void xgbe_phy_sfp_signals(struct xgbe_prv_data *pdata)
 {
struct xgbe_phy_data *phy_data = pdata->phy_data;
-   unsigned int gpio_input;
u8 gpio_reg, gpio_ports[2];
int ret;
 
@@ -1211,23 +1262,9 @@ static void xgbe_phy_sfp_signals(struct xgbe_prv_data 
*pdata)
return;
}
 
-   gpio_input = (gpio_ports[1] << 8) | gpio_ports[0];
-
-   if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT) {
-   /* No GPIO, just assume the module is present for now */
-   phy_data->sfp_mod_absent = 0;
-   } else {
-   if (!(gpio_input & (1 << phy_data->sfp_gpio_mod_absent)))
-   phy_data->sfp_mod_absent = 0;
-   }
-
-   if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS) &&
-   (gpio_input & (1 << phy_data->sfp_gpio_rx_los)))
-   phy_data->sfp_rx_los = 1;
+   phy_data->sfp_gpio_inputs = (gpio_ports[1] << 8) | gpio_ports[0];
 
-   if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT) &&
-   (gpio_input & (1 << phy_data->sfp_gpio_tx_fault)))
-   phy_data->sfp_tx_fault = 1;
+   phy_data->sfp_mod_absent = xgbe_phy_check_sfp_mod_absent(phy_data);
 }
 
 static void xgbe_phy_sfp_mod_absent(struct xgbe_prv_data *pdata)



[PATCH net 2/3] amd-xgbe: Improve KR auto-negotiation and training

2018-04-23 Thread Tom Lendacky
Update xgbe-phy-v2.c to make use of the auto-negotiation (AN) phy hooks
to improve the ability to successfully complete Clause 73 AN when running
at 10gbps.  Hardware can sometimes have issues with CDR lock when the
AN DME page exchange is being performed.

The AN and KR training hooks are used as follows:
- The pre AN hook is used to disable CDR tracking in the PHY so that the
  DME page exchange can be successfully and consistently completed.
- The post KR training hook is used to re-enable the CDR tracking so that
  KR training can successfully complete.
- The post AN hook is used to check for an unsuccessful AN which will
  increase a CDR tracking enablement delay (up to a maximum value).

Add two debugfs entries to allow control over use of the CDR tracking
workaround.  The debugfs entries allow the CDR tracking workaround to
be disabled and determine whether to re-enable CDR tracking before or
after link training has been initiated.

Also, with these changes the receiver reset cycle that is performed during
the link status check can be performed less often.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h  |8 ++
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c |   16 +++
 drivers/net/ethernet/amd/xgbe/xgbe-main.c|1 
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|8 +-
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |2 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  125 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |4 +
 7 files changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 7ea72ef..d272dc6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1321,6 +1321,10 @@
 #define MDIO_VEND2_AN_STAT 0x8002
 #endif
 
+#ifndef MDIO_VEND2_PMA_CDR_CONTROL
+#define MDIO_VEND2_PMA_CDR_CONTROL 0x8056
+#endif
+
 #ifndef MDIO_CTRL1_SPEED1G
 #define MDIO_CTRL1_SPEED1G (MDIO_CTRL1_SPEED10G & ~BMCR_SPEED100)
 #endif
@@ -1369,6 +1373,10 @@
 #define XGBE_AN_CL37_TX_CONFIG_MASK0x08
 #define XGBE_AN_CL37_MII_CTRL_8BIT 0x0100
 
+#define XGBE_PMA_CDR_TRACK_EN_MASK 0x01
+#define XGBE_PMA_CDR_TRACK_EN_OFF  0x00
+#define XGBE_PMA_CDR_TRACK_EN_ON   0x01
+
 /* Bit setting and getting macros
  *  The get macro will extract the current bit field value from within
  *  the variable
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
index 7d128be..b911439 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
@@ -519,6 +519,22 @@ void xgbe_debugfs_init(struct xgbe_prv_data *pdata)
   "debugfs_create_file failed\n");
}
 
+   if (pdata->vdata->an_cdr_workaround) {
+   pfile = debugfs_create_bool("an_cdr_workaround", 0600,
+   pdata->xgbe_debugfs,
+   >debugfs_an_cdr_workaround);
+   if (!pfile)
+   netdev_err(pdata->netdev,
+  "debugfs_create_bool failed\n");
+
+   pfile = debugfs_create_bool("an_cdr_track_early", 0600,
+   pdata->xgbe_debugfs,
+   >debugfs_an_cdr_track_early);
+   if (!pfile)
+   netdev_err(pdata->netdev,
+  "debugfs_create_bool failed\n");
+   }
+
kfree(buf);
 }
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 795e556..441d0973 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -349,6 +349,7 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, UDP4TE, 1);
 
/* Call MDIO/PHY initialization routine */
+   pdata->debugfs_an_cdr_workaround = pdata->vdata->an_cdr_workaround;
ret = pdata->phy_if.phy_init(pdata);
if (ret)
return ret;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index e3d361e..1b45cd7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -432,6 +432,8 @@ static void xgbe_an73_disable(struct xgbe_prv_data *pdata)
xgbe_an73_set(pdata, false, false);
xgbe_an73_disable_interrupts(pdata);
 
+   pdata->an_start = 0;
+
netif_dbg(pdata, link, pdata->netdev, "CL73 AN disabled\n");
 }
 
@@ -511,11 +513,11 @@ static enum xgbe_an xgbe_an73_tx_training(struct 
xgbe_prv_data *

[PATCH net 1/3] amd-xgbe: Add pre/post auto-negotiation phy hooks

2018-04-23 Thread Tom Lendacky
Add hooks to the driver auto-negotiation (AN) flow to allow the different
phy implementations to perform any steps necessary to improve AN.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   16 ++--
 drivers/net/ethernet/amd/xgbe/xgbe.h  |5 +
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 072b9f6..e3d361e 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -437,6 +437,9 @@ static void xgbe_an73_disable(struct xgbe_prv_data *pdata)
 
 static void xgbe_an_restart(struct xgbe_prv_data *pdata)
 {
+   if (pdata->phy_if.phy_impl.an_pre)
+   pdata->phy_if.phy_impl.an_pre(pdata);
+
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
case XGBE_AN_MODE_CL73_REDRV:
@@ -453,6 +456,9 @@ static void xgbe_an_restart(struct xgbe_prv_data *pdata)
 
 static void xgbe_an_disable(struct xgbe_prv_data *pdata)
 {
+   if (pdata->phy_if.phy_impl.an_post)
+   pdata->phy_if.phy_impl.an_post(pdata);
+
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
case XGBE_AN_MODE_CL73_REDRV:
@@ -637,11 +643,11 @@ static enum xgbe_an xgbe_an73_incompat_link(struct 
xgbe_prv_data *pdata)
return XGBE_AN_NO_LINK;
}
 
-   xgbe_an73_disable(pdata);
+   xgbe_an_disable(pdata);
 
xgbe_switch_mode(pdata);
 
-   xgbe_an73_restart(pdata);
+   xgbe_an_restart(pdata);
 
return XGBE_AN_INCOMPAT_LINK;
 }
@@ -820,6 +826,9 @@ static void xgbe_an37_state_machine(struct xgbe_prv_data 
*pdata)
pdata->an_result = pdata->an_state;
pdata->an_state = XGBE_AN_READY;
 
+   if (pdata->phy_if.phy_impl.an_post)
+   pdata->phy_if.phy_impl.an_post(pdata);
+
netif_dbg(pdata, link, pdata->netdev, "CL37 AN result: %s\n",
  xgbe_state_as_string(pdata->an_result));
}
@@ -903,6 +912,9 @@ static void xgbe_an73_state_machine(struct xgbe_prv_data 
*pdata)
pdata->kx_state = XGBE_RX_BPA;
pdata->an_start = 0;
 
+   if (pdata->phy_if.phy_impl.an_post)
+   pdata->phy_if.phy_impl.an_post(pdata);
+
netif_dbg(pdata, link, pdata->netdev, "CL73 AN result: %s\n",
  xgbe_state_as_string(pdata->an_result));
}
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h 
b/drivers/net/ethernet/amd/xgbe/xgbe.h
index ad102c8..fa0b51e 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -833,6 +833,7 @@ struct xgbe_hw_if {
 /* This structure represents implementation specific routines for an
  * implementation of a PHY. All routines are required unless noted below.
  *   Optional routines:
+ * an_pre, an_post
  * kr_training_pre, kr_training_post
  */
 struct xgbe_phy_impl_if {
@@ -875,6 +876,10 @@ struct xgbe_phy_impl_if {
/* Process results of auto-negotiation */
enum xgbe_mode (*an_outcome)(struct xgbe_prv_data *);
 
+   /* Pre/Post auto-negotiation support */
+   void (*an_pre)(struct xgbe_prv_data *);
+   void (*an_post)(struct xgbe_prv_data *);
+
/* Pre/Post KR training enablement support */
void (*kr_training_pre)(struct xgbe_prv_data *);
void (*kr_training_post)(struct xgbe_prv_data *);



[PATCH net 0/3] amd-xgbe: AMD XGBE driver fixes 2018-04-23

2018-04-23 Thread Tom Lendacky
This patch series addresses some issues in the AMD XGBE driver.

The following fixes are included in this driver update series:

- Improve KR auto-negotiation and training (2 patches)
  - Add pre and post auto-negotiation hooks
  - Use the pre and post auto-negotiation hooks to disable CDR tracking
during auto-negotiation page exchange in KR mode
- Check for SFP transceiver signal support and only use the signal if the
  SFP indicates that it is supported

This patch series is based on net.

---

Please queue this patch series up to stable releases 4.14 and above.

Tom Lendacky (3):
  amd-xgbe: Add pre/post auto-negotiation phy hooks
  amd-xgbe: Improve KR auto-negotiation and training
  amd-xgbe: Only use the SFP supported transceiver signals


 drivers/net/ethernet/amd/xgbe/xgbe-common.h  |8 +
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c |   16 ++
 drivers/net/ethernet/amd/xgbe/xgbe-main.c|1 
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|   24 +++
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |2 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  196 --
 drivers/net/ethernet/amd/xgbe/xgbe.h |9 +
 7 files changed, 233 insertions(+), 23 deletions(-)

-- 
Tom Lendacky


Re: Creating FOU tunnels to the same destination IP but different port

2018-04-13 Thread Tom Herbert
On Fri, Apr 13, 2018 at 9:57 AM, Kostas Peletidis <kpeleti...@gmail.com> wrote:
> Hello,
>
> I am having trouble with a particular case of setting up a fou tunnel
> and I would really appreciate your help.
>
> I have a remote multihomed host behind a NAT box and I want to create
> a fou tunnel for each of its IP addresses, from my machine.
>
> A typical case would be something like that (output from the local machine):
>
> # ip tun
> ipudp09602: ip/ip remote 135.196.22.100 local 172.31.0.140 ttl 225
> ipudp00101: ip/ip remote 148.252.129.30 local 172.31.0.140 ttl 225
> ipudp09604: ip/ip remote 77.247.11.249 local 172.31.0.140 ttl 225
> tunl0: any/ip remote any local any ttl inherit nopmtudisc
> ipudp00102: ip/ip remote 213.205.194.18 local 172.31.0.140 ttl 225
>
> However, if the remote end has the same IP address with the remote end
> of an existing tunnel (but a different remote port)
> tunnel creation fails. In this example there is already a tunnel to
> 135.196.22.100:32270 and I wanted to create a new tunnel
> to 135.196.22.100:24822 as below:
>
> # ip link add name ipudp09603 mtu 1356 type ipip \
>   remote 135.196.22.100 \
>   local 172.31.0.140 \
>   ttl 225 \
>   encap fou \
>  encap-sport 4500 \
>  encap-dport 24822
>
> RTNETLINK answers: File exists
>
> The remote IP addresses in this case are identical because there is a
> NAT box in the way, but the port numbers are different. The source
> address and port are the same in all cases.
>
> I noticed that ip_tunnel_find() does not check port numbers - being IP
> and all - so I am thinking that a not-so-elegant way to do it is to
> get the port numbers from the netlink request and have
> ip_tunnel_find() compare them against encap.{sport, dport} of existing
> tunnels.
>
> Is there a better way to create a second fou tunnel to the same IP
> address but a different port? Use of keys as unique tunnel IDs maybe?
> Any feedback is appreciated. Thank you.
>
Hi Kostas,

This is an interesting problem, thanks for reporting it! FOU in this
case is being used as modified ipip tunnel so the check of uniqueness
is only based on local and remote addresses for an IP tunnel. As you
point out, the port information does provide more specific information
that could be used to distinguish between the tunnels (especially
on receive). Using the information is tricky since the FOU and ipip
layers are pretty much independent. The keys approach might be
possible. I'll try to take a closer look.

Tom

>
> Regards,
> Kostas


Re: [PATCH net] strparser: Fix sign of err codes

2018-03-26 Thread Tom Herbert
On Mon, Mar 26, 2018 at 12:31 PM, Dave Watson <davejwat...@fb.com> wrote:
> strp_parser_err is called with a negative code everywhere, which then
> calls abort_parser with a negative code.  strp_msg_timeout calls
> abort_parser directly with a positive code.  Negate ETIMEDOUT
> to match signed-ness of other calls.
>
> The default abort_parser callback, strp_abort_strp, sets
> sk->sk_err to err.  Also negate the error here so sk_err always
> holds a positive value, as the rest of the net code expects.  Currently
> a negative sk_err can result in endless loops, or user code that
> thinks it actually sent/received err bytes.
>
> Found while testing net/tls_sw recv path.
>
Nice catch!

It might be nice to have a comment at strp_parser_err and abort_parser
description in Documentation/networking/strparser.txt should also be
updated that err is a negative error value.

Tom


> Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
> Signed-off-by: Dave Watson <davejwat...@fb.com>
> ---
>  net/strparser/strparser.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
> index 1fdab5c..b9283ce 100644
> --- a/net/strparser/strparser.c
> +++ b/net/strparser/strparser.c
> @@ -60,7 +60,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
> struct sock *sk = strp->sk;
>
> /* Report an error on the lower socket */
> -   sk->sk_err = err;
> +   sk->sk_err = -err;
> sk->sk_error_report(sk);
> }
>  }
> @@ -458,7 +458,7 @@ static void strp_msg_timeout(struct work_struct *w)
> /* Message assembly timed out */
> STRP_STATS_INCR(strp->stats.msg_timeouts);
> strp->cb.lock(strp);
> -   strp->cb.abort_parser(strp, ETIMEDOUT);
> +   strp->cb.abort_parser(strp, -ETIMEDOUT);
> strp->cb.unlock(strp);
>  }
>
> --
> 2.9.5
>


[PATCH v2 net] kcm: lock lower socket in kcm_attach

2018-03-13 Thread Tom Herbert
Need to lock lower socket in order to provide mutual exclusion
with kcm_unattach.

v2: Add Reported-by for syzbot

Fixes: ab7ac4eb9832e32a09f4e804 ("kcm: Kernel Connection Multiplexor module")
Reported-by: 
syzbot+ea75c0ffcd353d32515f064aaebefc5279e61...@syzkaller.appspotmail.com
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index f297d53a11aa..34355fd19f27 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1381,24 +1381,32 @@ static int kcm_attach(struct socket *sock, struct 
socket *csock,
.parse_msg = kcm_parse_func_strparser,
.read_sock_done = kcm_read_sock_done,
};
-   int err;
+   int err = 0;
 
csk = csock->sk;
if (!csk)
return -EINVAL;
 
+   lock_sock(csk);
+
/* Only allow TCP sockets to be attached for now */
if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
-   csk->sk_protocol != IPPROTO_TCP)
-   return -EOPNOTSUPP;
+   csk->sk_protocol != IPPROTO_TCP) {
+   err = -EOPNOTSUPP;
+   goto out;
+   }
 
/* Don't allow listeners or closed sockets */
-   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE)
-   return -EOPNOTSUPP;
+   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
+   err = -EOPNOTSUPP;
+   goto out;
+   }
 
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
-   if (!psock)
-   return -ENOMEM;
+   if (!psock) {
+   err = -ENOMEM;
+   goto out;
+   }
 
psock->mux = mux;
psock->sk = csk;
@@ -1407,7 +1415,7 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
err = strp_init(>strp, csk, );
if (err) {
kmem_cache_free(kcm_psockp, psock);
-   return err;
+   goto out;
}
 
write_lock_bh(>sk_callback_lock);
@@ -1419,7 +1427,8 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
write_unlock_bh(>sk_callback_lock);
strp_done(>strp);
kmem_cache_free(kcm_psockp, psock);
-   return -EALREADY;
+   err = -EALREADY;
+   goto out;
}
 
psock->save_data_ready = csk->sk_data_ready;
@@ -1455,7 +1464,10 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
/* Schedule RX work in case there are already bytes queued */
strp_check_rcv(>strp);
 
-   return 0;
+out:
+   release_sock(csk);
+
+   return err;
 }
 
 static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
@@ -1507,6 +1519,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 
if (WARN_ON(psock->rx_kcm)) {
write_unlock_bh(>sk_callback_lock);
+   release_sock(csk);
return;
}
 
-- 
2.11.0



Re: [PATCH net] kcm: lock lower socket in kcm_attach

2018-03-12 Thread Tom Herbert
On Mon, Mar 12, 2018 at 2:09 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> On Mon, Mar 12, 2018 at 02:04:12PM -0700, Tom Herbert wrote:
>> Need to lock lower socket in order to provide mutual exclusion
>> with kcm_unattach.
>>
>> Fixes: ab7ac4eb9832e32a09f4e804 ("kcm: Kernel Connection Multiplexor module")
>> Signed-off-by: Tom Herbert <t...@quantonium.net>
>> ---
>
> Is this fixing the syzbot-reported bug "KASAN: use-after-free Read in
> get_work_pool"?  If so, please add:
>
> Reported-by: 
> syzbot+ea75c0ffcd353d32515f064aaebefc5279e61...@syzkaller.appspotmail.com

Yeah, I was looking for a "reported by". I didn't see it in the email
from syzbot. Where is this found?

Thanks,
Tom


[PATCH net] kcm: lock lower socket in kcm_attach

2018-03-12 Thread Tom Herbert
Need to lock lower socket in order to provide mutual exclusion
with kcm_unattach.

Fixes: ab7ac4eb9832e32a09f4e804 ("kcm: Kernel Connection Multiplexor module")
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index f297d53a11aa..34355fd19f27 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1381,24 +1381,32 @@ static int kcm_attach(struct socket *sock, struct 
socket *csock,
.parse_msg = kcm_parse_func_strparser,
.read_sock_done = kcm_read_sock_done,
};
-   int err;
+   int err = 0;
 
csk = csock->sk;
if (!csk)
return -EINVAL;
 
+   lock_sock(csk);
+
/* Only allow TCP sockets to be attached for now */
if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
-   csk->sk_protocol != IPPROTO_TCP)
-   return -EOPNOTSUPP;
+   csk->sk_protocol != IPPROTO_TCP) {
+   err = -EOPNOTSUPP;
+   goto out;
+   }
 
/* Don't allow listeners or closed sockets */
-   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE)
-   return -EOPNOTSUPP;
+   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
+   err = -EOPNOTSUPP;
+   goto out;
+   }
 
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
-   if (!psock)
-   return -ENOMEM;
+   if (!psock) {
+   err = -ENOMEM;
+   goto out;
+   }
 
psock->mux = mux;
psock->sk = csk;
@@ -1407,7 +1415,7 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
err = strp_init(>strp, csk, );
if (err) {
kmem_cache_free(kcm_psockp, psock);
-   return err;
+   goto out;
}
 
write_lock_bh(>sk_callback_lock);
@@ -1419,7 +1427,8 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
write_unlock_bh(>sk_callback_lock);
strp_done(>strp);
kmem_cache_free(kcm_psockp, psock);
-   return -EALREADY;
+   err = -EALREADY;
+   goto out;
}
 
psock->save_data_ready = csk->sk_data_ready;
@@ -1455,7 +1464,10 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
/* Schedule RX work in case there are already bytes queued */
strp_check_rcv(>strp);
 
-   return 0;
+out:
+   release_sock(csk);
+
+   return err;
 }
 
 static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
@@ -1507,6 +1519,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 
if (WARN_ON(psock->rx_kcm)) {
write_unlock_bh(>sk_callback_lock);
+   release_sock(csk);
return;
}
 
-- 
2.11.0



Re: KASAN: use-after-free Read in get_work_pool

2018-03-11 Thread Tom Herbert
On Sun, Mar 11, 2018 at 2:34 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> On Wed, Feb 14, 2018 at 02:45:05PM +0100, 'Dmitry Vyukov' via syzkaller-bugs 
> wrote:
>> On Wed, Dec 6, 2017 at 1:50 PM, Dmitry Vyukov <dvyu...@google.com> wrote:
>> > On Fri, Oct 27, 2017 at 11:18 PM, Cong Wang <xiyou.wangc...@gmail.com> 
>> > wrote:
>> >> On Thu, Oct 26, 2017 at 11:00 PM, Dmitry Vyukov <dvyu...@google.com> 
>> >> wrote:
>> >>> On Thu, Oct 26, 2017 at 7:58 PM, Tejun Heo <t...@kernel.org> wrote:
>> >>>> Hello,
>> >>>>
>> >>>> On Thu, Oct 26, 2017 at 09:35:44AM -0700, syzbot wrote:
>> >>>>> BUG: KASAN: use-after-free in __read_once_size
>> >>>>> include/linux/compiler.h:276 [inline]
>> >>>>> BUG: KASAN: use-after-free in atomic64_read
>> >>>>> arch/x86/include/asm/atomic64_64.h:21 [inline]
>> >>>>> BUG: KASAN: use-after-free in atomic_long_read
>> >>>>> include/asm-generic/atomic-long.h:44 [inline]
>> >>>>> BUG: KASAN: use-after-free in get_work_pool+0x1c2/0x1e0
>> >>>>> kernel/workqueue.c:709
>> >>>>> Read of size 8 at addr 8801cc58c378 by task syz-executor5/21326
>> >>>>>
>> >>>>> CPU: 1 PID: 21326 Comm: syz-executor5 Not tainted 4.13.0+ #43
>> >>>>> Hardware name: Google Google Compute Engine/Google Compute Engine,
>> >>>>> BIOS Google 01/01/2011
>> >>>>> Call Trace:
>> >>>>>  __dump_stack lib/dump_stack.c:16 [inline]
>> >>>>>  dump_stack+0x194/0x257 lib/dump_stack.c:52
>> >>>>>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>> >>>>>  kasan_report_error mm/kasan/report.c:351 [inline]
>> >>>>>  kasan_report+0x24e/0x340 mm/kasan/report.c:409
>> >>>>>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
>> >>>>>  __read_once_size include/linux/compiler.h:276 [inline]
>> >>>>>  atomic64_read arch/x86/include/asm/atomic64_64.h:21 [inline]
>> >>>>>  atomic_long_read include/asm-generic/atomic-long.h:44 [inline]
>> >>>>>  get_work_pool+0x1c2/0x1e0 kernel/workqueue.c:709
>> >>>>>  __queue_work+0x235/0x1150 kernel/workqueue.c:1401
>> >>>>>  queue_work_on+0x16a/0x1c0 kernel/workqueue.c:1486
>> >>>>>  queue_work include/linux/workqueue.h:489 [inline]
>> >>>>>  strp_check_rcv+0x25/0x30 net/strparser/strparser.c:553
>> >>>>>  kcm_attach net/kcm/kcmsock.c:1439 [inline]
>> >>>>>  kcm_attach_ioctl net/kcm/kcmsock.c:1460 [inline]
>> >>>>>  kcm_ioctl+0x826/0x1610 net/kcm/kcmsock.c:1695
>> >>>>>  sock_do_ioctl+0x65/0xb0 net/socket.c:961
>> >>>>>  sock_ioctl+0x2c2/0x440 net/socket.c:1058
>> >>>>>  vfs_ioctl fs/ioctl.c:45 [inline]
>> >>>>>  do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685
>> >>>>>  SYSC_ioctl fs/ioctl.c:700 [inline]
>> >>>>>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
>> >>>>>  entry_SYSCALL_64_fastpath+0x1f/0xbe
>> >>>>
>> >>>> Looks like kcm is trying to reuse a work item whose last workqueue has
>> >>>> been destroyed without re-initing it.  A work item needs to be
>> >>>> reinit'd.
>> >>>
>> >>> +kcm maintainers
>> >>
>> >> Can you try the fix below? There is no C reproducer so I can't verify it.
>> >
>> >
>> > Hi Cong,
>> >
>> > syzbot can now test proposed patches, see
>> > https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot
>> > for details. Please give it a try.
>> >
>> >> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
>> >> index af4e76ac88ff..7816f44c576a 100644
>> >> --- a/net/kcm/kcmsock.c
>> >> +++ b/net/kcm/kcmsock.c
>> >> @@ -1433,11 +1433,12 @@ static int kcm_attach(struct socket *sock,
>> >> struct socket *csock,
>> >> KCM_STATS_INCR(mux->stats.psock_attach);
>> >> mux->psocks_cnt++;
>> >> psock_now_avail(psock);
>> >> -   spin_unlock_bh(>lock);
>> >>
>> >> /* Schedule RX work in case there are already bytes queued */
>>

[PATCH net] amd-xgbe: Restore PCI interrupt enablement setting on resume

2018-02-20 Thread Tom Lendacky
After resuming from suspend, the PCI device support must re-enable the
interrupt setting so that interrupts are actually delivered.

Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
---

Please queue this patch up to stable releases 4.14 and above.

 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 3e5833c..eb23f9b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -426,6 +426,8 @@ static int xgbe_pci_resume(struct pci_dev *pdev)
struct net_device *netdev = pdata->netdev;
int ret = 0;
 
+   XP_IOWRITE(pdata, XP_INT_EN, 0x1f);
+
pdata->lpm_ctrl &= ~MDIO_CTRL1_LPOWER;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, pdata->lpm_ctrl);
 



Re: BUG: free active (active state 0) object type: work_struct hint: strp_work

2018-02-14 Thread Tom Herbert
On Tue, Feb 13, 2018 at 12:15 PM, Dmitry Vyukov <dvyu...@google.com> wrote:
>
> On Thu, Jan 4, 2018 at 8:36 PM, Tom Herbert <t...@quantonium.net> wrote:
> > On Thu, Jan 4, 2018 at 4:10 AM, syzbot
> > <syzbot+3c6c745b0d2f341bb...@syzkaller.appspotmail.com> wrote:
> >> Hello,
> >>
> >> syzkaller hit the following crash on
> >> 6bb8824732f69de0f233ae6b1a8158e149627b38
> >> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/master
> >> compiler: gcc (GCC) 7.1.1 20170620
> >> .config is attached
> >> Raw console output is attached.
> >> Unfortunately, I don't have any reproducer for this bug yet.
> >>
> >>
> >> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> >> Reported-by: syzbot+3c6c745b0d2f341bb...@syzkaller.appspotmail.com
> >> It will help syzbot understand when the bug is fixed. See footer for
> >> details.
> >> If you forward the report, please keep this part and the footer.
> >>
> >> Use struct sctp_assoc_value instead
> >> sctp: [Deprecated]: syz-executor4 (pid 12483) Use of int in maxseg socket
> >> option.
> >> Use struct sctp_assoc_value instead
> >> [ cut here ]
> >> ODEBUG: free active (active state 0) object type: work_struct hint:
> >> strp_work+0x0/0xf0 net/strparser/strparser.c:381
> >> WARNING: CPU: 1 PID: 3502 at lib/debugobjects.c:291
> >> debug_print_object+0x166/0x220 lib/debugobjects.c:288
> >> Kernel panic - not syncing: panic_on_warn set ...
> >>
> >> CPU: 1 PID: 3502 Comm: kworker/u4:4 Not tainted 4.15.0-rc5+ #170
> >> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> >> Google 01/01/2011
> >> Workqueue: kkcmd kcm_tx_work
> >> Call Trace:
> >>  __dump_stack lib/dump_stack.c:17 [inline]
> >>  dump_stack+0x194/0x257 lib/dump_stack.c:53
> >>  panic+0x1e4/0x41c kernel/panic.c:183
> >>  __warn+0x1dc/0x200 kernel/panic.c:547
> >>  report_bug+0x211/0x2d0 lib/bug.c:184
> >>  fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
> >>  fixup_bug arch/x86/kernel/traps.c:247 [inline]
> >>  do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
> >>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
> >>  invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1061
> >> RIP: 0010:debug_print_object+0x166/0x220 lib/debugobjects.c:288
> >> RSP: 0018:8801c0ee7068 EFLAGS: 00010086
> >> RAX: dc08 RBX: 0003 RCX: 8159bc3e
> >> RDX:  RSI: 1100381dcdc8 RDI: 8801db317dd0
> >> RBP: 8801c0ee70a8 R08:  R09: 1100381dcd9a
> >> R10: ed00381dce3c R11: 86137ad8 R12: 0001
> >> R13: 86113480 R14: 8560dc40 R15: 8146e5f0
> >>  __debug_check_no_obj_freed lib/debugobjects.c:745 [inline]
> >>  debug_check_no_obj_freed+0x662/0xf1f lib/debugobjects.c:774
> >>  kmem_cache_free+0x253/0x2a0 mm/slab.c:3745
> >
> > I believe we just need to defer kmem_cache_free to call_rcu.
>
>
> Hi Tom,
>
> Was this ever submitted? I don't any such change in net/kcm/kcmsock.c.


Hi Dmitry,

I am looking at it. Not yet convinced that call_rcu is the right fix.

Tom


[PATCH net-next] kcm: Call strp_stop before strp_done in kcm_attach

2018-02-14 Thread Tom Herbert
In kcm_attach strp_done is called when sk_user_data is already
set to fail the attach. strp_done needs the strp to be stopped and
warns if it isn't. Call strp_stop in this case to eliminate the
warning message.

Reported-by: syzbot+88dfb55e4c8b770d8...@syzkaller.appspotmail.com
Fixes: e5571240236c5652f ("kcm: Check if sk_user_data already set in kcm_attach")
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index f297d53a11aa..435594648dac 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1417,6 +1417,7 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
 */
if (csk->sk_user_data) {
write_unlock_bh(>sk_callback_lock);
+   strp_stop(>strp);
strp_done(>strp);
kmem_cache_free(kcm_psockp, psock);
return -EALREADY;
-- 
2.11.0



Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-13 Thread Tom Hromatka



On 02/13/2018 01:35 PM, Kees Cook wrote:

On Tue, Feb 13, 2018 at 12:33 PM, Tom Hromatka <tom.hroma...@oracle.com> wrote:

On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sar...@sargun.me> wrote:

This patchset enables seccomp filters to be written in eBPF. Although,
this patchset doesn't introduce much of the functionality enabled by
eBPF, it lays the ground work for it.

It also introduces the capability to dump eBPF filters via the PTRACE
API in order to make it so that CHECKPOINT_RESTORE will be satisfied.
In the attached samples, there's an example of this. One can then use
BPF_OBJ_GET_INFO_BY_FD in order to get the actual code of the program,
and use that at reload time.

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because it can be pure, and side effect free, and therefore
not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.



Coincidentally I also sent an RFC for adding eBPF hash maps to the seccomp
userspace mailing list just last week:
https://groups.google.com/forum/#!topic/libseccomp/pX6QkVF0F74

The kernel changes I proposed are in this email:
https://groups.google.com/d/msg/libseccomp/pX6QkVF0F74/ZUJlwI5qAwAJ

In that email thread, Kees requested that I try out a binary tree in cBPF
and evaluate its performance.  I just got a rough prototype working, and
while not as fast as an eBPF hash map, the cBPF binary tree was a
significant
improvement over the linear list of ifs that are currently generated.  Also,
it only required changing a single function within the libseccomp library
itself.

https://github.com/drakenclimber/libseccomp/commit/87b36369f17385f5a7a4d95101185577fbf6203b

Here are the results I am currently seeing using an in-house customer's
seccomp filter and a simplistic test program that runs getppid() thousands
of times.

Test Case  minimum TSC ticks to make syscall

seccomp disabled 620
getppid() at the front of 306-syscall seccomp filter 722
getppid() in middle of 306-syscall seccomp filter   1392
getppid() at the end of the 306-syscall filter  2452
seccomp using a 306-syscall-sized EBPF hash map  800
cBPF filter using a binary tree  922

I still think that's a crazy filter. :) It should be inverted to just
check the 26 syscalls and a final "greater than" test. I would expect
it to be faster still. :)

-Kees


I completely agree it's a crazy filter, but it seems to be a
common "mistake" our users are making.  It would be nice to
help them out if we can.

Tom



Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-13 Thread Tom Hromatka

On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sar...@sargun.me> wrote:

This patchset enables seccomp filters to be written in eBPF. Although,
this patchset doesn't introduce much of the functionality enabled by
eBPF, it lays the ground work for it.

It also introduces the capability to dump eBPF filters via the PTRACE
API in order to make it so that CHECKPOINT_RESTORE will be satisfied.
In the attached samples, there's an example of this. One can then use
BPF_OBJ_GET_INFO_BY_FD in order to get the actual code of the program,
and use that at reload time.

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because it can be pure, and side effect free, and therefore
not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.



Coincidentally I also sent an RFC for adding eBPF hash maps to the seccomp
userspace mailing list just last week:
https://groups.google.com/forum/#!topic/libseccomp/pX6QkVF0F74

The kernel changes I proposed are in this email:
https://groups.google.com/d/msg/libseccomp/pX6QkVF0F74/ZUJlwI5qAwAJ

In that email thread, Kees requested that I try out a binary tree in cBPF
and evaluate its performance.  I just got a rough prototype working, and
while not as fast as an eBPF hash map, the cBPF binary tree was a significant
improvement over the linear list of ifs that are currently generated.  Also,
it only required changing a single function within the libseccomp library
itself.

https://github.com/drakenclimber/libseccomp/commit/87b36369f17385f5a7a4d95101185577fbf6203b

Here are the results I am currently seeing using an in-house customer's
seccomp filter and a simplistic test program that runs getppid() thousands
of times.

Test Case  minimum TSC ticks to make syscall

seccomp disabled 620
getppid() at the front of 306-syscall seccomp filter 722
getppid() in middle of 306-syscall seccomp filter   1392
getppid() at the end of the 306-syscall filter  2452
seccomp using a 306-syscall-sized EBPF hash map  800
cBPF filter using a binary tree  922

Thanks.

Tom



Re: [RFC PATCH 00/24] Introducing AF_XDP support

2018-02-07 Thread Tom Herbert
that packets should be sent out on. In
> contrast to ndo_xdp_xmit, it is asynchronous and pulls packets to be
> sent from the xdp socket (associated with the dev and queue
> combination that was provided with the NDO call) using a callback
> (get_tx_packet), and when they have been transmitted it uses another
> callback (tx_completion) to signal completion of packets. These
> callbacks are set via ndo_bpf in the new XDP_REGISTER_XSK
> command. ndo_xdp_xmit_xsk is exclusively used by the XDP socket code
> and thus does not clash with the XDP_REDIRECT use of
> ndo_xdp_xmit. This is one of the reasons that the XDP_DRV mode
> (without ZC) is currently not supported by TX. Please have a look at
> the challenges section for further discussions.
>
> The AF_XDP bind call acts on a queue pair (channel in ethtool speak),
> so the user needs to steer the traffic to the zero-copy enabled queue
> pair. Which queue to use, is up to the user.
>
> For an untrusted application, HW packet steering to a specific queue
> pair (the one associated with the application) is a requirement, as
> the application would otherwise be able to see other user space
> processes' packets. If the HW cannot support the required packet
> steering, XDP_DRV or XDP_SKB mode have to be used as they do not
> expose the NIC's packet buffer into user space as the packets are
> copied into user space from the NIC's packet buffer in the kernel.
>
> There is a xdpsock benchmarking/test application included. Say that
> you would like your UDP traffic from port 4242 to end up in queue 16,
> that we will enable AF_XDP on. Here, we use ethtool for this:
>
>   ethtool -N p3p2 rx-flow-hash udp4 fn
>   ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
>   action 16
>
> Running the l2fwd benchmark in XDP_DRV_ZC mode can then be done using:
>
>   samples/bpf/xdpsock -i p3p2 -q 16 -l -N
>
> For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
> can be displayed with "-h", as usual.
>
> We have run some benchmarks on a dual socket system with two Broadwell
> E5 2660 @ 2.0 GHz with hyperthreading turned off. Each socket has 14
> cores which gives a total of 28, but only two cores are used in these
> experiments. One for TR/RX and one for the user space application. The
> memory is DDR4 @ 2133 MT/s (1067 MHz) and the size of each DIMM is
> 8192MB and with 8 of those DIMMs in the system we have 64 GB of total
> memory. The compiler used is gcc version 5.4.0 20160609. The NIC is an
> Intel I40E 40Gbit/s using the i40e driver.
>
> Below are the results in Mpps of the I40E NIC benchmark runs for 64
> byte packets, generated by commercial packet generator HW that is
> generating packets at full 40 Gbit/s line rate.
>
> XDP baseline numbers without this RFC:
> xdp_rxq_info --action XDP_DROP 31.3 Mpps
> xdp_rxq_info --action XDP_TX   16.7 Mpps
>
> XDP performance with this RFC i.e. with the buffer allocator:
> XDP_DROP 21.0 Mpps
> XDP_TX   11.9 Mpps
>
> AF_PACKET V4 performance from previous RFC on 4.14-rc7:
> Benchmark   V2 V3 V4 V4+ZC
> rxdrop  0.67   0.73   0.74   33.7
> txpush  0.98   0.98   0.91   19.6
> l2fwd   0.66   0.71   0.67   15.5
>
> AF_XDP performance:
> Benchmark   XDP_SKB   XDP_DRVXDP_DRV_ZC (all in Mpps)
> rxdrop  3.311.6 16.9
> txpush  2.2 NA* 21.8
> l2fwd   1.7 NA* 10.4
>
Hi Bjorn,

This is very impressive work, thank you for contributing it!

For these benchmarks how are the AF_PACKET and AF_XDP numbers to be
compared. For instance is rxdpop comparable to XDP_DROP and
"xdp_rxq_info --action XDP_DROP"? Given your explanation below, I
believe they are, but it might be better to make that clear up front.

Tom


> * NA since there is no XDP_DRV mode (without ZC) for TX in this RFC,
>   see challenges below.
>
> If we start by comparing XDP_SKB performance with copy mode in
> AF_PACKET V4, we can see that AF_XDP delivers 3-5 times the
> throughput, which is positive. We are also happy with the XDP_DRV
> performance that provides 11.6 Mpps for rxdrop, and should work on any
> driver implementing full XDP support. Now to the problematic part:
> XDP_DRV_ZC. The txpush (TX only) benchmark shows decent results at
> 21.8 Mpps and is better than it was for V4, even though we have spent
> no time optimizing the code in AF_XDP. (We did that in AF_PACKET V4.)
> But the RX performance is sliced by half, which is not good. The
> reason for this is, for the major part, the new buffer allocator which
> is used for RX ZC only (at this point, see todo section). If you take
> a look at the XDP baseline numbers, introducing the buffer pool
> allocator dr

Re: [PATCH 4/4] net: amd-xgbe: fix comparison to bitshift when dealing with a mask

2018-02-05 Thread Tom Lendacky
On 2/5/2018 2:10 PM, Wolfram Sang wrote:
> Due to a typo, the mask was destroyed by a comparison instead of a bit
> shift.
> 
> Signed-off-by: Wolfram Sang <wsa+rene...@sang-engineering.com>

Excellent find.

Acked-by: Tom Lendacky <thomas.lenda...@amd.com>

David, this should also be applied to the 4.14 and 4.15 stable releases.

Thanks,
Tom

> ---
> Only build tested. To be applied individually per subsystem.
> 
>  drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
> b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
> index 7a3ebfd236f5eb..100adee778dfd6 100644
> --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
> +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
> @@ -595,7 +595,7 @@ static void xgbe_isr_task(unsigned long data)
>  
>   reissue_mask = 1 << 0;
>   if (!pdata->per_channel_irq)
> - reissue_mask |= 0x < 4;
> + reissue_mask |= 0x << 4;
>  
>   XP_IOWRITE(pdata, XP_INT_REISSUE_EN, reissue_mask);
>   }
> 


[PATCH v2 net-next 0/2] kcm: fix two syzcaller issues

2018-01-24 Thread Tom Herbert
In this patch set:

- Don't allow attaching non-TCP or listener sockets to a KCM mux.
- In kcm_attach, check if sk_user_data is already set. This is
  done under lock to avoid race conditions. More work is needed to make
  all of the users of sk_user_data use the same locking.

- v2
  Remove unnecessary check for not PF_KCM in kcm_attach (suggested by
  Guillaume Nault)

Tom Herbert (2):
  kcm: Only allow TCP sockets to be attached to a KCM mux
  kcm: Check if sk_user_data already set in kcm_attach

 net/kcm/kcmsock.c | 25 +
 1 file changed, 21 insertions(+), 4 deletions(-)

-- 
2.11.0



[PATCH v2 net-next 1/2] kcm: Only allow TCP sockets to be attached to a KCM mux

2018-01-24 Thread Tom Herbert
TCP sockets for IPv4 and IPv6 that are not listeners or in closed
stated are allowed to be attached to a KCM mux.

Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Reported-by: syzbot+8865eaff7f9acd593...@syzkaller.appspotmail.com
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d4e98f20fc2a..7632797fb68e 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1387,8 +1387,13 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
if (!csk)
return -EINVAL;
 
-   /* We must prevent loops or risk deadlock ! */
-   if (csk->sk_family == PF_KCM)
+   /* Only allow TCP sockets to be attached for now */
+   if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
+   csk->sk_protocol != IPPROTO_TCP)
+   return -EOPNOTSUPP;
+
+   /* Don't allow listeners or closed sockets */
+   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE)
return -EOPNOTSUPP;
 
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
-- 
2.11.0



[PATCH v2 net-next 2/2] kcm: Check if sk_user_data already set in kcm_attach

2018-01-24 Thread Tom Herbert
This is needed to prevent sk_user_data being overwritten.
The check is done under the callback lock. This should prevent
a socket from being attached twice to a KCM mux. It also prevents
a socket from being attached for other use cases of sk_user_data
as long as the other cases set sk_user_data under the lock.
Followup work is needed to unify all the use cases of sk_user_data
to use the same locking.

Reported-by: syzbot+114b15f2be420a888...@syzkaller.appspotmail.com
Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7632797fb68e..4a8d407f8902 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1410,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
return err;
}
 
-   sock_hold(csk);
-
write_lock_bh(>sk_callback_lock);
+
+   /* Check if sk_user_data is already used by KCM or someone else.
+* Must be done under lock to prevent race conditions.
+*/
+   if (csk->sk_user_data) {
+   write_unlock_bh(>sk_callback_lock);
+   strp_done(>strp);
+   kmem_cache_free(kcm_psockp, psock);
+   return -EALREADY;
+   }
+
psock->save_data_ready = csk->sk_data_ready;
psock->save_write_space = csk->sk_write_space;
psock->save_state_change = csk->sk_state_change;
@@ -1420,8 +1429,11 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
csk->sk_data_ready = psock_data_ready;
csk->sk_write_space = psock_write_space;
csk->sk_state_change = psock_state_change;
+
write_unlock_bh(>sk_callback_lock);
 
+   sock_hold(csk);
+
/* Finished initialization, now add the psock to the MUX. */
spin_lock_bh(>lock);
head = >psocks;
-- 
2.11.0



[PATCH net-next 2/2] kcm: Check if sk_user_data already set in kcm_attach

2018-01-23 Thread Tom Herbert
This is needed to prevent sk_user_data being overwritten.
The check is done under the callback lock. This should prevent
a socket from being attached twice to a KCM mux. It also prevents
a socket from being attached for other use cases of sk_user_data
as long as the other cases set sk_user_data under the lock.
Followup work is needed to unify all the use cases of sk_user_data
to use the same locking.

Reported-by: syzbot+114b15f2be420a888...@syzkaller.appspotmail.com
Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 474ea2200592..5ae02a5b0838 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1414,9 +1414,18 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
return err;
}
 
-   sock_hold(csk);
-
write_lock_bh(>sk_callback_lock);
+
+   /* Check if sk_user_data is already used by KCM or someone else.
+* Must be done under lock to prevent race conditions.
+*/
+   if (csk->sk_user_data) {
+   write_unlock_bh(>sk_callback_lock);
+   strp_done(>strp);
+   kmem_cache_free(kcm_psockp, psock);
+   return -EALREADY;
+   }
+
psock->save_data_ready = csk->sk_data_ready;
psock->save_write_space = csk->sk_write_space;
psock->save_state_change = csk->sk_state_change;
@@ -1424,8 +1433,11 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
csk->sk_data_ready = psock_data_ready;
csk->sk_write_space = psock_write_space;
csk->sk_state_change = psock_state_change;
+
write_unlock_bh(>sk_callback_lock);
 
+   sock_hold(csk);
+
/* Finished initialization, now add the psock to the MUX. */
spin_lock_bh(>lock);
head = >psocks;
-- 
2.11.0



[PATCH net-next 1/2] kcm: Only allow TCP sockets to be attached to a KCM mux

2018-01-23 Thread Tom Herbert
TCP sockets for IPv4 and IPv6 that are not listeners or in a closed
state are allowed to be attached to a KCM mux.

Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Reported-by: syzbot+8865eaff7f9acd593...@syzkaller.appspotmail.com
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/kcm/kcmsock.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d4e98f20fc2a..474ea2200592 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1391,6 +1391,15 @@ static int kcm_attach(struct socket *sock, struct socket 
*csock,
if (csk->sk_family == PF_KCM)
return -EOPNOTSUPP;
 
+   /* Only allow TCP sockets to be attached for now */
+   if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
+   csk->sk_protocol != IPPROTO_TCP)
+   return -EOPNOTSUPP;
+
+   /* Don't allow listeners or closed sockets */
+   if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE)
+   return -EOPNOTSUPP;
+
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
if (!psock)
return -ENOMEM;
-- 
2.11.0



[PATCH net-next 0/2] kcm: fix two syzcaller issues

2018-01-23 Thread Tom Herbert
In this patch set:

- Don't allow attaching non-TCP or listener sockets to a KCM mux.
- In kcm_attach Check if sk_user_data is already set. This is
  under lock to avoid race conditions. More work is needed to make
  all of the users of sk_user_data use the same locking.

Tom Herbert (2):
  kcm: Only allow TCP sockets to be attached to a KCM mux
  kcm: Check if sk_user_data already set in kcm_attach

 net/kcm/kcmsock.c | 25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

-- 
2.11.0



Re: [PATCH net-next] kcm: do not attach sockets if sk_user_data is already used

2018-01-18 Thread Tom Herbert
On Thu, Jan 18, 2018 at 10:08 AM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Thu, 2018-01-18 at 09:46 -0800, Tom Herbert wrote:
>>
>> Then that's increasing the udp_sock structure size for a narrow use
>> case which will get push back. I think it's going to be better to
>> stick with one sock pointer. We could maybe redefine sk_user_data as a
>> pointer to an allocated structure or array so it can hold multiple
>> user_data pointers (in lieu of chaining).
>>
>
> We do not have a lot of UDP sockets per host, I do not believe it
> should be a problem adding stuff in them.
>
Eric,

Is QUIC using unconnected sockets then?

Tom


Re: [PATCH net-next] kcm: do not attach sockets if sk_user_data is already used

2018-01-18 Thread Tom Herbert
On Thu, Jan 18, 2018 at 7:40 AM, James Chapman <jchap...@katalix.com> wrote:
> On 18 January 2018 at 15:18, Guillaume Nault <g.na...@alphalink.fr> wrote:
>> On Wed, Jan 17, 2018 at 02:25:38PM -0500, David Miller wrote:
>>> From: James Chapman <jchap...@katalix.com>
>>> Date: Wed, 17 Jan 2018 11:13:33 +
>>>
>>> > On 16 January 2018 at 19:00, David Miller <da...@davemloft.net> wrote:
>>> >> From: Tom Herbert <t...@herbertland.com>
>>> >> Date: Tue, 16 Jan 2018 09:36:41 -0800
>>> >>
>>> >>> sk_user_data is set with the sk_callback lock held in code below.
>>> >>> Should be able to take the lock earlier can do this check under the
>>> >>> lock.
>>> >>
>>> >> csock, and this csk, is obtained from an arbitrary one of the
>>> >> process's FDs.  It can be any socket type or family, and that socket's
>>> >> family might set sk_user_data without the callback lock.
>>> >>
>>> >> The only socket type check is making sure it is not another PF_KCM
>>> >> socket.  So that doesn't help with this problem.
>>> >
>>> > Is it the intention to update all socket code over time to write
>>> > sk_user_data within the sk_callback lock? If so, I'm happy to address
>>> > that in the l2tp code (and update the kcm patch to check sk_user_data
>>> > within the sk_callback lock). Or is the preferred solution to restrict
>>> > KCM to specific socket families, as suggested by Guillaume earlier in
>>> > the thread?
>>>
>>> I think we have a more fundamental issue here.
>>>
>>> sk->sk_user_data is a place where RPC layer specific data is hung off
>>> of.  By this definition SunRPC, RXRPC, RDS, TIPC, and KCM are all
>>> using it correctly.
>>>
>>> Phonet has a similar issue to the one seen here, it tests and changes
>>> sk_user_data under lock_sock().  The only requirement it makes is
>>> that the socket type is not SOCK_STREAM.  However, this one might be OK
>>> since only pep_sock sockets can be passed down into gprs_attach().
>>>
>> But, if I read it correctly, that doesn't prevent it from being passed
>> to kcm_attach() later on, which will overwrite sk_user_data (unless we
>> update the locking scheme and refuse to overwrite sk_user_data in a
>> race-free way).
>>
>> BTW couldn't the gprs_dev pointer be embedded in struct pep_sock?
>> This way pep_sk(sk)->gp could be used instead of sk->sk_user_data.
>> That'd probably be a violation of the phonet's layering, as that'd
>> tie gprs_dev to pep sockets. OTOH, only pep sockets can currently be
>> attached to gprs_dev, so in practice that might be a reasonable
>> compromise.
>>
>>> Most of these cases like SunRPC, RXRPC, etc. are fine because they
>>> only graft on top of TCP and UDP sockets.
>>>
>>> The weird situation here is that L2TP does tunneling and stores it's
>>> private state in sk->sk_user_data like an RPC layer would.  And KCM
>>> allows basically any socket type to be attached.
>>>
>>> The RPC layers create their sockets internally, so I cannot see a way
>>> that those can be sent to a KCM attach operations.  And I think that
>>> is why this RPC invariant is important for sk_user_data usage.
>>>
>> SunRPC seems to possibly set sk_user_data on user sockets: svc_addsock()
>> gets a socket using sockfd_lookup() then passes it to svc_setup_socket()
>> which in turn sets sk_user_data. I don't know anything about SunRPC, so
>> I might very well have missed important details, but I believe such a
>> socket could be passed to KCM which could lead to the same kind of
>> issues as for L2TP. Other RPCs look safe to me.
>>
>>> If all else was equal, even though it doesn't make much sense to KCM
>>> attach L2TP sockets to KCM, I would suggest to change L2TP to store
>>> it's private stuff elsewhere.
>>>
>>> But that is not the case.  Anything using the generic UDP
>>> encapsulation layer is going to make use of sk->sk_user_data like this
>>> (see setup_udp_tunnel_sock).
>>>
>> Most UDP encapsulations only use kernel sockets though. It seems that
>> only L2TP and GTP use setup_udp_tunnel_sock() with userpsace sockets.
>> So it might be feasible to restrict usage of sk_user_data to kernel
>> sockets only.
>>
>> For L2TP, we probably can adapt l2tp_sock_to_tunnel() so that it does
>> a lo

Re: [PATCH net-next] kcm: do not attach sockets if sk_user_data is already used

2018-01-18 Thread Tom Herbert
On Wed, Jan 17, 2018 at 11:25 AM, David Miller <da...@davemloft.net> wrote:
> From: James Chapman <jchap...@katalix.com>
> Date: Wed, 17 Jan 2018 11:13:33 +
>
>> On 16 January 2018 at 19:00, David Miller <da...@davemloft.net> wrote:
>>> From: Tom Herbert <t...@herbertland.com>
>>> Date: Tue, 16 Jan 2018 09:36:41 -0800
>>>
>>>> sk_user_data is set with the sk_callback lock held in code below.
>>>> Should be able to take the lock earlier can do this check under the
>>>> lock.
>>>
>>> csock, and this csk, is obtained from an arbitrary one of the
>>> process's FDs.  It can be any socket type or family, and that socket's
>>> family might set sk_user_data without the callback lock.
>>>
>>> The only socket type check is making sure it is not another PF_KCM
>>> socket.  So that doesn't help with this problem.
>>
>> Is it the intention to update all socket code over time to write
>> sk_user_data within the sk_callback lock? If so, I'm happy to address
>> that in the l2tp code (and update the kcm patch to check sk_user_data
>> within the sk_callback lock). Or is the preferred solution to restrict
>> KCM to specific socket families, as suggested by Guillaume earlier in
>> the thread?
>
> I think we have a more fundamental issue here.
>
> sk->sk_user_data is a place where RPC layer specific data is hung off
> of.  By this definition SunRPC, RXRPC, RDS, TIPC, and KCM are all
> using it correctly.
>
> Phonet has a similar issue to the one seen here, it tests and changes
> sk_user_data under lock_sock().  The only requirement it makes is
> that the socket type is not SOCK_STREAM.  However, this one might be OK
> since only pep_sock sockets can be passed down into gprs_attach().
>
> Most of these cases like SunRPC, RXRPC, etc. are fine because they
> only graft on top of TCP and UDP sockets.
>
> The weird situation here is that L2TP does tunneling and stores it's
> private state in sk->sk_user_data like an RPC layer would.  And KCM
> allows basically any socket type to be attached.
>
> The RPC layers create their sockets internally, so I cannot see a way
> that those can be sent to a KCM attach operations.  And I think that
> is why this RPC invariant is important for sk_user_data usage.
>
> If all else was equal, even though it doesn't make much sense to KCM
> attach L2TP sockets to KCM, I would suggest to change L2TP to store
> it's private stuff elsewhere.
>
> But that is not the case.  Anything using the generic UDP
> encapsulation layer is going to make use of sk->sk_user_data like this
> (see setup_udp_tunnel_sock).
>
> It looks like over time we've accumulated this new class of uses
> of sk->sk_user_data, ho hum...
>
> And it's not like we can add a test to KCM to avoid these socket
> types, because they will look like normal UDP datagram sockets.
>
> What a mess...
>
> Furthermore, even if you add a test to KCM, you will now need to
> add the same test to L2TP and anything else which uses sk_user_data
> for tunneling and for which userspace has access to the socket fd.
>
> And it will be racy, indeed, until all such users align to the same
> precise locking scheme for tests and updates to sk_user_data.
>
> Again, what a mess...
>
It's not so surprising that sk_user_data is being used for so many
purposes, it's quite a powerful and useful notion. So, to a large
extent I think it's a victim of its own success.

Aligning to one locking scheme is the first task to clean this. The
second would be how to deal with multiple simulataneous use on a
socket (or maybe not allow). I've thought about having a chain of
sk_user_data, but that's only useful is the write/read callback are
also chained. All this starts to look like STREAMS at some point ;-)

Tom


Re: KASAN: use-after-free Read in psock_write_space

2018-01-16 Thread Tom Herbert
On Tue, Jan 16, 2018 at 12:40 PM, syzbot
 wrote:
> syzkaller has found reproducer for the following crash on
> a8750ddca918032d6349adbf9a4b6555e7db20da
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
>
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+8865eaff7f9acd593...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed.
>
> ==
> BUG: KASAN: use-after-free in psock_write_space+0x143/0x160
> net/kcm/kcmsock.c:418
> Read of size 8 at addr 8801bb731d20 by task syzkaller858097/3665
>
> CPU: 0 PID: 3665 Comm: syzkaller858097 Not tainted 4.15.0-rc8+ #263
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>  kasan_report_error mm/kasan/report.c:351 [inline]
>  kasan_report+0x25b/0x340 mm/kasan/report.c:409
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
>  psock_write_space+0x143/0x160 net/kcm/kcmsock.c:418
>  sock_wfree+0x10b/0x140 net/core/sock.c:1805
>  skb_orphan include/linux/skbuff.h:2521 [inline]
>  loopback_xmit+0x12e/0x6b0 drivers/net/loopback.c:78
>  __netdev_start_xmit include/linux/netdevice.h:4042 [inline]
>  netdev_start_xmit include/linux/netdevice.h:4051 [inline]
>  xmit_one net/core/dev.c:3003 [inline]
>  dev_hard_start_xmit+0x24e/0xac0 net/core/dev.c:3019
>  __dev_queue_xmit+0x206d/0x2920 net/core/dev.c:3500
>  dev_queue_xmit+0x17/0x20 net/core/dev.c:3533
>  neigh_hh_output include/net/neighbour.h:472 [inline]
>  neigh_output include/net/neighbour.h:480 [inline]
>  ip6_finish_output2+0x1498/0x23a0 net/ipv6/ip6_output.c:120
>  ip6_finish_output+0x302/0x930 net/ipv6/ip6_output.c:146
>  NF_HOOK_COND include/linux/netfilter.h:239 [inline]
>  ip6_output+0x1eb/0x840 net/ipv6/ip6_output.c:163
>  dst_output include/net/dst.h:460 [inline]
>  NF_HOOK include/linux/netfilter.h:250 [inline]
>  ip6_xmit+0xd84/0x2090 net/ipv6/ip6_output.c:269
>  inet6_csk_xmit+0x2fc/0x580 net/ipv6/inet6_connection_sock.c:139
>  dccp_transmit_skb+0x9ac/0x10f0 net/dccp/output.c:142
>  dccp_send_reset+0x21c/0x2a0 net/dccp/output.c:530
>  dccp_disconnect+0x90e/0xbb0 net/dccp/proto.c:276
>  inet_child_forget+0x6b/0x320 net/ipv4/inet_connection_sock.c:899
>  inet_csk_listen_stop+0x128/0x920 net/ipv4/inet_connection_sock.c:987
>  dccp_close+0x780/0xc20 net/dccp/proto.c:1007
>  inet_release+0xed/0x1c0 net/ipv4/af_inet.c:426
>  inet6_release+0x50/0x70 net/ipv6/af_inet6.c:432
>  sock_release+0x8d/0x1e0 net/socket.c:602
>  sock_close+0x16/0x20 net/socket.c:1131
>  __fput+0x327/0x7e0 fs/file_table.c:210
>  fput+0x15/0x20 fs/file_table.c:244
>  task_work_run+0x199/0x270 kernel/task_work.c:113
>  exit_task_work include/linux/task_work.h:22 [inline]
>  do_exit+0x9bb/0x1ad0 kernel/exit.c:865
>  do_group_exit+0x149/0x400 kernel/exit.c:968
>  get_signal+0x73f/0x16c0 kernel/signal.c:2335
>  do_signal+0x90/0x1eb0 arch/x86/kernel/signal.c:809
>  exit_to_usermode_loop+0x214/0x310 arch/x86/entry/common.c:158
>  prepare_exit_to_usermode arch/x86/entry/common.c:195 [inline]
>  syscall_return_slowpath+0x490/0x550 arch/x86/entry/common.c:264
>  entry_SYSCALL_64_fastpath+0x9e/0xa0
> RIP: 0033:0x445819
> RSP: 002b:7fad17780da8 EFLAGS: 0246 ORIG_RAX: 00ca
> RAX: fe00 RBX: 006dac3c RCX: 00445819
> RDX:  RSI:  RDI: 006dac3c
> RBP:  R08:  R09: 
> R10:  R11: 0246 R12: 006dac38
> R13: 0100 R14: 656c6c616b7a7973 R15: 000b
>
> Allocated by task 3664:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
>  kmem_cache_alloc+0x12e/0x760 mm/slab.c:3544
>  kmem_cache_zalloc include/linux/slab.h:678 [inline]
>  kcm_attach net/kcm/kcmsock.c:1394 [inline]
>  kcm_attach_ioctl net/kcm/kcmsock.c:1460 [inline]
>  kcm_ioctl+0x2d2/0x1690 net/kcm/kcmsock.c:1665
>  sock_do_ioctl+0x65/0xb0 net/socket.c:966
>  sock_ioctl+0x2c2/0x440 net/socket.c:1063
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
>  SYSC_ioctl fs/ioctl.c:701 [inline]
>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>  entry_SYSCALL_64_fastpath+0x29/0xa0
>
> Freed by task 3665:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_slab_free+0x71/0xc0 

Re: [PATCH net-next] kcm: do not attach sockets if sk_user_data is already used

2018-01-16 Thread Tom Herbert
On Sun, Jan 14, 2018 at 3:32 AM, James Chapman <jchap...@katalix.com> wrote:
> SIOCKCMATTACH writes a connected socket's sk_user_data for its own
> use. Prevent it doing so if the socket's sk_user_data is already set
> since some sockets (e.g. encapsulated sockets) use sk_user_data
> internally.
>
> This issue was found by syzbot.
>
> kernel BUG at net/l2tp/l2tp_ppp.c:176!
> invalid opcode:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 1 PID: 3503 Comm: syzkaller938388 Not tainted 4.15.0-rc7+ #181
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> RIP: 0010:pppol2tp_sock_to_session net/l2tp/l2tp_ppp.c:176 [inline]
> RIP: 0010:pppol2tp_sendmsg+0x512/0x670 net/l2tp/l2tp_ppp.c:304
> RSP: 0018:8801d4887438 EFLAGS: 00010293
> RAX: 8801bfef2180 RBX: 8801bff88440 RCX: 84ffbca2
> RDX:  RSI: 8801d4887598 RDI: 8801bff88820
> RBP: 8801d48874a8 R08:  R09: 11003a910e17
> R10: 0003 R11: 0001 R12: 8801bfff9bc0
> R13:  R14: 8000 R15: 
> FS:  01194880() GS:8801db30() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 20ea CR3: 0001bfecf001 CR4: 001606e0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  sock_sendmsg_nosec net/socket.c:628 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:638
>  kernel_sendmsg+0x47/0x60 net/socket.c:646
>  sock_no_sendpage+0x1cc/0x280 net/core/sock.c:2581
>  kernel_sendpage+0xbf/0xe0 net/socket.c:3349
>  kcm_write_msgs+0x404/0x1b80 net/kcm/kcmsock.c:646
>  kcm_sendmsg+0x148d/0x22d0 net/kcm/kcmsock.c:1035
>  sock_sendmsg_nosec net/socket.c:628 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:638
>  ___sys_sendmsg+0x767/0x8b0 net/socket.c:2018
>  __sys_sendmsg+0xe5/0x210 net/socket.c:2052
>  SYSC_sendmsg net/socket.c:2063 [inline]
>  SyS_sendmsg+0x2d/0x50 net/socket.c:2059
>  entry_SYSCALL_64_fastpath+0x23/0x9a
> RIP: 0033:0x440159
> RSP: 002b:7ffe74df8288 EFLAGS: 0217 ORIG_RAX: 002e
> RAX: ffda RBX:  RCX: 00440159
> RDX:  RSI: 201fcfc8 RDI: 0005
> RBP: 006ca018 R08:  R09: 
> R10:  R11: 0217 R12: 00401ac0
> R13: 00401b50 R14:  R15: 
> Code: c5 61 70 fc 48 8b 7d d0 e8 7c c2 5b fd 84 c0 74 0d e8 b3 61 70 fc 48 89 
> df e8 3b 49 2f ff 41 bd f7 ff ff ff eb 86 e8 9e 61 70 fc <0f> 0b 41 bd 95 ff 
> ff ff e9 74 ff ff ff e8 ec 32 a8 fc e9 77 fb
> RIP: pppol2tp_sock_to_session net/l2tp/l2tp_ppp.c:176 [inline] RSP: 
> 8801d4887438
> RIP: pppol2tp_sendmsg+0x512/0x670 net/l2tp/l2tp_ppp.c:304 RSP: 
> 8801d4887438
>
> Reported-by: syzbot+114b15f2be420a888...@syzkaller.appspotmail.com
> Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
> Signed-off-by: James Chapman <jchap...@katalix.com>
> ---
>  net/kcm/kcmsock.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index d4e98f20fc2a..65392ed58f4a 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -1391,6 +1391,10 @@ static int kcm_attach(struct socket *sock, struct 
> socket *csock,
> if (csk->sk_family == PF_KCM)
> return -EOPNOTSUPP;
>
> +   /* Cannot proceed if connected socket already uses sk_user_data */
> +   if (csk->sk_user_data)
> +   return -EOPNOTSUPP;
> +

sk_user_data is set with the sk_callback lock held in code below.
Should be able to take the lock earlier and do this check under the
lock.

Tom

> psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
> if (!psock)
> return -ENOMEM;
> --
> 1.9.1
>


Re: BUG: free active (active state 0) object type: work_struct hint: strp_work

2018-01-04 Thread Tom Herbert
On Thu, Jan 4, 2018 at 4:10 AM, syzbot
<syzbot+3c6c745b0d2f341bb...@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzkaller hit the following crash on
> 6bb8824732f69de0f233ae6b1a8158e149627b38
> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> Unfortunately, I don't have any reproducer for this bug yet.
>
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+3c6c745b0d2f341bb...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> Use struct sctp_assoc_value instead
> sctp: [Deprecated]: syz-executor4 (pid 12483) Use of int in maxseg socket
> option.
> Use struct sctp_assoc_value instead
> [ cut here ]
> ODEBUG: free active (active state 0) object type: work_struct hint:
> strp_work+0x0/0xf0 net/strparser/strparser.c:381
> WARNING: CPU: 1 PID: 3502 at lib/debugobjects.c:291
> debug_print_object+0x166/0x220 lib/debugobjects.c:288
> Kernel panic - not syncing: panic_on_warn set ...
>
> CPU: 1 PID: 3502 Comm: kworker/u4:4 Not tainted 4.15.0-rc5+ #170
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Workqueue: kkcmd kcm_tx_work
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  panic+0x1e4/0x41c kernel/panic.c:183
>  __warn+0x1dc/0x200 kernel/panic.c:547
>  report_bug+0x211/0x2d0 lib/bug.c:184
>  fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
>  fixup_bug arch/x86/kernel/traps.c:247 [inline]
>  do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>  invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1061
> RIP: 0010:debug_print_object+0x166/0x220 lib/debugobjects.c:288
> RSP: 0018:8801c0ee7068 EFLAGS: 00010086
> RAX: dc08 RBX: 0003 RCX: 8159bc3e
> RDX:  RSI: 1100381dcdc8 RDI: 8801db317dd0
> RBP: 8801c0ee70a8 R08:  R09: 1100381dcd9a
> R10: ed00381dce3c R11: 86137ad8 R12: 0001
> R13: 86113480 R14: 8560dc40 R15: 8146e5f0
>  __debug_check_no_obj_freed lib/debugobjects.c:745 [inline]
>  debug_check_no_obj_freed+0x662/0xf1f lib/debugobjects.c:774
>  kmem_cache_free+0x253/0x2a0 mm/slab.c:3745

I believe we just need to defer kmem_cache_free to call_rcu.

Tom

>  unreserve_psock+0x5a1/0x780 net/kcm/kcmsock.c:547
>  kcm_write_msgs+0xbae/0x1b80 net/kcm/kcmsock.c:590
>  kcm_tx_work+0x2e/0x190 net/kcm/kcmsock.c:731
>  process_one_work+0xbbf/0x1b10 kernel/workqueue.c:2112
>  worker_thread+0x223/0x1990 kernel/workqueue.c:2246
>  kthread+0x33c/0x400 kernel/kthread.c:238
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:515
>
> ==
> WARNING: possible circular locking dependency detected
> 4.15.0-rc5+ #170 Not tainted
> --
> kworker/u4:4/3502 is trying to acquire lock:
>  ((console_sem).lock){-.-.}, at: [<91214b42>] down_trylock+0x13/0x70
> kernel/locking/semaphore.c:136
>
> but task is already holding lock:
>  (_hash[i].lock){-.-.}, at: [<da143489>]
> __debug_check_no_obj_freed lib/debugobjects.c:736 [inline]
>  (_hash[i].lock){-.-.}, at: [<da143489>]
> debug_check_no_obj_freed+0x1e9/0xf1f lib/debugobjects.c:774
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (_hash[i].lock){-.-.}:
>__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
>__debug_object_init+0x109/0x1040 lib/debugobjects.c:343
>debug_object_init+0x17/0x20 lib/debugobjects.c:391
>debug_hrtimer_init kernel/time/hrtimer.c:396 [inline]
>debug_init kernel/time/hrtimer.c:441 [inline]
>hrtimer_init+0x8c/0x410 kernel/time/hrtimer.c:1122
>init_dl_task_timer+0x1b/0x50 kernel/sched/deadline.c:1023
>__sched_fork+0x2c4/0xb70 kernel/sched/core.c:2188
>init_idle+0x75/0x820 kernel/sched/core.c:5279
>sched_init+0xb19/0xc43 kernel/sched/core.c:5976
>start_kernel+0x452/0x819 init/main.c:582
>x86_64_start_reservations+0x2a/0x2c arch/x86/kernel/head64.c:378
>x86_64_start_kernel+0x77/0x7a arch/x86/kernel/head64.c:359
>secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S

[PATCH net 2/2] strparser: Call sock_owned_by_user_nocheck

2017-12-28 Thread Tom Herbert
strparser wants to check socket ownership without producing any
warnings. As indicated by the comment in the code, it is permissible
for owned_by_user to return true.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkal...@googlegroups.com>
Reported-and-tested-by: 
<syzbot+c91c53af67f9ebe599a337d2e70950366153b...@syzkaller.appspotmail.com>
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/strparser/strparser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index c5fda15ba319..1fdab5c4eda8 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp)
 * allows a thread in BH context to safely check if the process
 * lock is held. In this case, if the lock is held, queue work.
 */
-   if (sock_owned_by_user(strp->sk)) {
+   if (sock_owned_by_user_nocheck(strp->sk)) {
queue_work(strp_wq, >work);
return;
}
-- 
2.11.0



[PATCH net 0/2] strparser: Fix lockdep issue

2017-12-28 Thread Tom Herbert
sock_owned_by_user can return true in strparser. The fix is to add and
call sock_owned_by_user_nocheck, since the check for owned by user is
not an error condition in this case.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkal...@googlegroups.com>
Reported-and-tested-by: 
<syzbot+c91c53af67f9ebe599a337d2e70950366153b...@syzkaller.appspotmail.com>

Tom Herbert (2):
  sock: Add sock_owned_by_user_nocheck
  strparser: Call sock_owned_by_user_nocheck

 include/net/sock.h| 5 +
 net/strparser/strparser.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

-- 
2.11.0



[PATCH net 1/2] sock: Add sock_owned_by_user_nocheck

2017-12-28 Thread Tom Herbert
This allows checking socket lock ownership without producing lockdep
warnings.

Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 include/net/sock.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 9155da422692..7a7b14e9628a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1514,6 +1514,11 @@ static inline bool sock_owned_by_user(const struct sock 
*sk)
return sk->sk_lock.owned;
 }
 
+static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
+{
+   return sk->sk_lock.owned;
+}
+
 /* no reclassification while locks are held */
 static inline bool sock_allow_reclassification(const struct sock *csk)
 {
-- 
2.11.0



Re: WARNING in strp_data_ready

2017-12-28 Thread Tom Herbert
On Thu, Dec 28, 2017 at 12:59 AM, Ozgur <oz...@goosey.org> wrote:
>
>
> 28.12.2017, 04:19, "Tom Herbert" <t...@herbertland.com>:
>> On Wed, Dec 27, 2017 at 12:20 PM, Ozgur <oz...@goosey.org> wrote:
>>>  27.12.2017, 23:14, "Dmitry Vyukov" <dvyu...@google.com>:
>>>>  On Wed, Dec 27, 2017 at 9:08 PM, Ozgur <oz...@goosey.org> wrote:
>>>>>   27.12.2017, 22:21, "Dmitry Vyukov" <dvyu...@google.com>:
>>>>>>   On Wed, Dec 27, 2017 at 8:09 PM, Tom Herbert <t...@herbertland.com> 
>>>>>> wrote:
>>>>>>>Did you try the patch I posted?
>>>>>>
>>>>>>   Hi Tom,
>>>>>
>>>>>   Hello Dmitry,
>>>>>
>>>>>>   No. And I didn't know I need to. Why?
>>>>>>   If you think the patch needs additional testing, you can ask syzbot to
>>>>>>   test it. See 
>>>>>> https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot
>>>>>>   Otherwise proceed with committing it. Or what are we waiting for?
>>>>>>
>>>>>>   Thanks
>>>>>
>>>>>   I think we need to fixed patch for crash, in fact check to patch code 
>>>>> and test solve the bug.
>>>>>   How do test it because there is no patch in the following bug?
>>>>
>>>>  Hi Ozgur,
>>>>
>>>>  I am not sure I completely understand what you mean. But the
>>>>  reproducer for this bug (which one can use for testing) is here:
>>>>  https://groups.google.com/forum/#!topic/syzkaller-bugs/Kxs05ziCpgY
>>>>  Tom also mentions there is some patch for this, but I don't know where
>>>>  it is, it doesn't seem to be referenced from this thread.
>>>
>>>  Hello Dmitry,
>>>
>>>  Ah, I'm sorry I don't seen Tom mail and I don't have a patch not tested :)
>>>  I think Tom send patch to only you and are you tested?
>>>
>>>  kcmsock.c will change and strp_data_ready I think locked.
>>>
>>>  Tom, please send a patch for me? I can test and inform you.
>>
>> Hi Ozgur,
>>
>> I reposted the patches as RFC "kcm: Fix lockdep issue". Please test if you 
>> can!
>>
>> Thanks,
>> Tom
>
> Hello Tom,
>
> Which are you use the repos? I pulled but I don't seen this patches.
>
They are not in any public repo yet. I posted the patches to netdev
list so they can be reviewed and tested by third parties. Posting
patches to the list a normal path to get patches into the kernel
(http://nickdesaulniers.github.io/blog/2017/05/16/submitting-your-first-patch-to-the-linux-kernel-and-responding-to-feedback/).

These patches were applied to net-next but are simple enough that they
should apply to other branches. I will repost and target to net per
Dave's directive once they are verified to fix the issue.

Tom


Re: WARNING in strp_data_ready

2017-12-27 Thread Tom Herbert
On Wed, Dec 27, 2017 at 12:20 PM, Ozgur <oz...@goosey.org> wrote:
>
>
> 27.12.2017, 23:14, "Dmitry Vyukov" <dvyu...@google.com>:
>> On Wed, Dec 27, 2017 at 9:08 PM, Ozgur <oz...@goosey.org> wrote:
>>>  27.12.2017, 22:21, "Dmitry Vyukov" <dvyu...@google.com>:
>>>>  On Wed, Dec 27, 2017 at 8:09 PM, Tom Herbert <t...@herbertland.com> wrote:
>>>>>   Did you try the patch I posted?
>>>>
>>>>  Hi Tom,
>>>
>>>  Hello Dmitry,
>>>
>>>>  No. And I didn't know I need to. Why?
>>>>  If you think the patch needs additional testing, you can ask syzbot to
>>>>  test it. See 
>>>> https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot
>>>>  Otherwise proceed with committing it. Or what are we waiting for?
>>>>
>>>>  Thanks
>>>
>>>  I think we need to fixed patch for crash, in fact check to patch code and 
>>> test solve the bug.
>>>  How do test it because there is no patch in the following bug?
>>
>> Hi Ozgur,
>>
>> I am not sure I completely understand what you mean. But the
>> reproducer for this bug (which one can use for testing) is here:
>> https://groups.google.com/forum/#!topic/syzkaller-bugs/Kxs05ziCpgY
>> Tom also mentions there is some patch for this, but I don't know where
>> it is, it doesn't seem to be referenced from this thread.
>
> Hello Dmitry,
>
> Ah, I'm sorry I don't seen Tom mail and I don't have a patch not tested :)
> I think Tom send patch to only you and are you tested?
>
> kcmsock.c will change and strp_data_ready I think locked.
>
> Tom, please send a patch for me? I can test and inform you.
>
Hi Ozgur,

I reposted the patches as RFC "kcm: Fix lockdep issue". Please test if you can!

Thanks,
Tom

> Regards
>
> Ozgur
>
>>>  The fix patch should be for this net/kcm/kcmsock.c file and lock functions 
>>> must be added calling sk_data_ready ().
>>>  Regards
>>>
>>>  Ozgur
>>>
>>>>>   On Wed, Dec 27, 2017 at 10:25 AM, Dmitry Vyukov <dvyu...@google.com> 
>>>>> wrote:
>>>>>>   On Wed, Dec 6, 2017 at 4:44 PM, Dmitry Vyukov <dvyu...@google.com> 
>>>>>> wrote:
>>>>>>>>   <john.fastab...@gmail.com> wrote:
>>>>>>>>>   On 10/24/2017 08:20 AM, syzbot wrote:
>>>>>>>>>>   Hello,
>>>>>>>>>>
>>>>>>>>>>   syzkaller hit the following crash on 
>>>>>>>>>> 73d3393ada4f70fa3df5639c8d438f2f034c0ecb
>>>>>>>>>>   
>>>>>>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
>>>>>>>>>>   compiler: gcc (GCC) 7.1.1 20170620
>>>>>>>>>>   .config is attached
>>>>>>>>>>   Raw console output is attached.
>>>>>>>>>>   C reproducer is attached
>>>>>>>>>>   syzkaller reproducer is attached. See https://goo.gl/kgGztJ
>>>>>>>>>>   for information about syzkaller reproducers
>>>>>>>>>>
>>>>>>>>>>   WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 
>>>>>>>>>> sock_owned_by_me include/net/sock.h:1505 [inline]
>>>>>>>>>>   WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 
>>>>>>>>>> sock_owned_by_user include/net/sock.h:1511 [inline]
>>>>>>>>>>   WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 
>>>>>>>>>> strp_data_ready+0x2b7/0x390 net/strparser/strparser.c:404
>>>>>>>>>>   Kernel panic - not syncing: panic_on_warn set ...
>>>>>>>>>>
>>>>>>>>>>   CPU: 0 PID: 2996 Comm: syzkaller142210 Not tainted 4.14.0-rc5+ #138
>>>>>>>>>>   Hardware name: Google Google Compute Engine/Google Compute Engine, 
>>>>>>>>>> BIOS Google 01/01/2011
>>>>>>>>>>   Call Trace:
>>>>>>>>>>
>>>>>>>>>>__dump_stack lib/dump_stack.c:16 [inline]
>>>>>>>>>>dump_stack+0x194/0x257 lib/dump_stack.c:52
>>>>>>>>>>panic+0x1e4/0x417 kernel/panic.c:181
>>>>>>

[PATCH RFC 0/2] kcm: Fix lockdep issue

2017-12-27 Thread Tom Herbert
Fix lockdep warnings that occur when sock_owned_by_user returns true in
strparser. The fix is to add and call sock_owned_by_user_nocheck, since the
check for owned-by-user is not an error condition in this case.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkal...@googlegroups.com>

Tom Herbert (2):
  sock: Add sock_owned_by_user_nocheck
  strparser: Call sock_owned_by_user_nocheck

 include/net/sock.h| 5 +
 net/strparser/strparser.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

-- 
2.11.0



[PATCH RFC 1/2] sock: Add sock_owned_by_user_nocheck

2017-12-27 Thread Tom Herbert
This allows checking socket lock ownership without producing lockdep
warnings.

Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 include/net/sock.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 6c1db823f8b9..66fd3951e6f3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1515,6 +1515,11 @@ static inline bool sock_owned_by_user(const struct sock 
*sk)
return sk->sk_lock.owned;
 }
 
+static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
+{
+   return sk->sk_lock.owned;
+}
+
 /* no reclassification while locks are held */
 static inline bool sock_allow_reclassification(const struct sock *csk)
 {
-- 
2.11.0



[PATCH RFC 2/2] strparser: Call sock_owned_by_user_nocheck

2017-12-27 Thread Tom Herbert
strparser wants to check socket ownership without producing any
warnings. As indicated by the comment in the code, it is permissible
for owned_by_user to return true.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkal...@googlegroups.com>
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/strparser/strparser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index c5fda15ba319..1fdab5c4eda8 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp)
 * allows a thread in BH context to safely check if the process
 * lock is held. In this case, if the lock is held, queue work.
 */
-   if (sock_owned_by_user(strp->sk)) {
+   if (sock_owned_by_user_nocheck(strp->sk)) {
queue_work(strp_wq, &strp->work);
return;
}
-- 
2.11.0



Re: WARNING in strp_data_ready

2017-12-27 Thread Tom Herbert
Did you try the patch I posted?


On Wed, Dec 27, 2017 at 10:25 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> On Wed, Dec 6, 2017 at 4:44 PM, Dmitry Vyukov <dvyu...@google.com> wrote:
>>> <john.fastab...@gmail.com> wrote:
>>>> On 10/24/2017 08:20 AM, syzbot wrote:
>>>>> Hello,
>>>>>
>>>>> syzkaller hit the following crash on 
>>>>> 73d3393ada4f70fa3df5639c8d438f2f034c0ecb
>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
>>>>> compiler: gcc (GCC) 7.1.1 20170620
>>>>> .config is attached
>>>>> Raw console output is attached.
>>>>> C reproducer is attached
>>>>> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
>>>>> for information about syzkaller reproducers
>>>>>
>>>>>
>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 sock_owned_by_me 
>>>>> include/net/sock.h:1505 [inline]
>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 sock_owned_by_user 
>>>>> include/net/sock.h:1511 [inline]
>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 
>>>>> strp_data_ready+0x2b7/0x390 net/strparser/strparser.c:404
>>>>> Kernel panic - not syncing: panic_on_warn set ...
>>>>>
>>>>> CPU: 0 PID: 2996 Comm: syzkaller142210 Not tainted 4.14.0-rc5+ #138
>>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
>>>>> Google 01/01/2011
>>>>> Call Trace:
>>>>>  
>>>>>  __dump_stack lib/dump_stack.c:16 [inline]
>>>>>  dump_stack+0x194/0x257 lib/dump_stack.c:52
>>>>>  panic+0x1e4/0x417 kernel/panic.c:181
>>>>>  __warn+0x1c4/0x1d9 kernel/panic.c:542
>>>>>  report_bug+0x211/0x2d0 lib/bug.c:183
>>>>>  fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
>>>>>  do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
>>>>>  do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
>>>>>  do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
>>>>>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
>>>>>  invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
>>>>> RIP: 0010:sock_owned_by_me include/net/sock.h:1505 [inline]
>>>>> RIP: 0010:sock_owned_by_user include/net/sock.h:1511 [inline]
>>>>> RIP: 0010:strp_data_ready+0x2b7/0x390 net/strparser/strparser.c:404
>>>>> RSP: 0018:8801db206b18 EFLAGS: 00010206
>>>>> RAX: 8801d1e02080 RBX: 8801dad74c48 RCX: 
>>>>> RDX: 0100 RSI: 8801d29fa0a0 RDI: 85cbede0
>>>>> RBP: ffff8801db206b38 R08: 0005 R09: 10ce0bcd
>>>>> R10: 8801db206a00 R11: dc00 R12: 8801d29fa000
>>>>> R13: 8801dad74c50 R14: 8801d4350a92 R15: 0001
>>>>>  psock_data_ready+0x56/0x70 net/kcm/kcmsock.c:353
>>>>
>>>> Looks like KCM is calling sk_data_ready() without first taking the
>>>> sock lock.
>>>>
>>>> /* Called with lower sock held */
>>>> static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
>>>> {
>>>>  [...]
>>>> if (kcm_queue_rcv_skb(>sk, skb)) {
>>>>
>>>> In this case kcm->sk is not the same lock the comment is referring to.
>>>> And kcm_queue_rcv_skb() will eventually call sk_data_ready().
>>>>
>>>> @Tom, how about wrapping the sk_data_ready call in {lock|release}_sock?
>>>> I don't have anything better in mind immediately.
>>>>
>>> The sock locks are taken in reverse order in the send path, so
>>> grabbing the kcm sock lock with the lower lock held to call
>>> sk_data_ready may lead to deadlock, I think.
>>>
>>> It might be possible to change the order in the send path to do this.
>>> Something like:
>>>
>>> trylock on lower socket lock
>>> -if trylock fails
>>>   - release kcm sock lock
>>>   - lock lower sock
>>>   - lock kcm sock
>>> - call sendpage locked function
>>>
>>> I admit that dealing with two levels of socket locks in the data path
>>> is quite a pain :-)
>>
>> up
>>
>> still happening and we've lost 50K+ test VMs on this
>
> up
>
> Still happens and number of crashes crossed 60K, can we do something
> with this please?


Re: [PATCH v5 net-next 0/7] net: ILA notification mechanism and fixes

2017-12-26 Thread Tom Herbert
On Tue, Dec 26, 2017 at 2:29 PM, David Miller <da...@davemloft.net> wrote:
> From: Tom Herbert <t...@quantonium.net>
> Date: Thu, 21 Dec 2017 11:33:25 -0800
>
>> This patch set adds support to get netlink notifications for ILA
>> routes when a route is used.
>>
>> This patch set contains:
>>
>> - General infrastructure for route notifications
>> - The ILA route notification mechanism
>> - Add net to ila build_state
>> - Add flush command to ila_xlat
>> - Fix use of rhashtable for latest fixes
>>
>> Route notifications will be used in conjunction with populating
>> ILA forwarding caches.
>
> Tom, this is just a wolf in sheep's clothing.
>
Dave,

> It's still a cache controllable by external entities.
>
Yep, that's the nature of the problem. In networks of even modest
scale we anticipate that we'll see the number of virtual addresses
(identifiers) far exceed the number of physical hosts. The mapping of
virtual to physical address is not aggregable, so at full we expect
10s of billions of these discrete mappings in a single network. No
single device will be able hold all these mappings, so they'll be
sharded amongst some number of routers. This works fine for
connectivity except that it would be nice to eliminate the triangular
routing by having the source perform encapsulation for destination
itself. So this is the motivation for a working set cache. It is an
optimization, but in networks like 3GPP, it's a big win to eliminate
anchor points in mobility.

> It still therefore has the DoS'ability aspects.
>
True, if implemented without consideration of DOS this is a very bad
thing as proven already by others. However if we know this going in
then DOS'ability can be mitigated or eliminated depending on the rest
of the implementation and architecture, similar to how SYN attacks can
be dealt with.

For example, suppose a device has 10G input link, we want a cache
entry to be usable for at least 30 seconds, and we have no control
over the users on the other side of the link (a typical eNodeB
scenario). That gives a worse case of 19M pps, 585M packets over 30
seconds. Assuming 64 bytes per cache entry that gets us to 37G of
memory needed in the host. That amount of memory is reasonable for a
networking device. Cost of memory should drop over next few years so
10X scaling within ten years seems feasible.

> You can keep reframing this thing you want out there, either by
> explicitly filling the cache in the kernel or doing it via userspace
> responding the netlink events, but it's still the same exact thing
> with the same set of problems.
>
I would point out that the attack surface area using a redirect
mechanism is _way_ less than request/response that was used by LISP or
OVS.

> I'm sorry, but I can't apply this series.  Nor any series that adds a
> DoS'able facility of forwarding/switching/route objects to the
> kernel.
>
Technically, this patch set was just adding route notificates that
facilitate but aren't a requirement for cache management. However, I
do sympathsize with your concerns. Scaling and DOS are precisely the
big problem to overcome in network virtualization and
identifier/locator split.

Happy Holidays!
Tom


[PATCH v2 net-next 2/2] strparser: Call sock_owned_by_user_nocheck

2017-12-23 Thread Tom Herbert
strparser wants to check socket ownership without producing any
warnings. As indicated by the comment in the code, it is permissible
for owned_by_user to return true.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkal...@googlegroups.com>
Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 net/strparser/strparser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index c5fda15ba319..1fdab5c4eda8 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp)
 * allows a thread in BH context to safely check if the process
 * lock is held. In this case, if the lock is held, queue work.
 */
-   if (sock_owned_by_user(strp->sk)) {
+   if (sock_owned_by_user_nocheck(strp->sk)) {
queue_work(strp_wq, &strp->work);
return;
}
-- 
2.11.0



[PATCH v2 net-next 0/2] kcm: Fix two locking issues

2017-12-23 Thread Tom Herbert
One issue is lockdep warnings when sock_owned_by_user returns true
in strparser. Fix is to add and call sock_owned_by_user_nocheck since
the check for owned by user is not an error condition in this case.

The other issue is a potential deadlock between TX and RX paths

KCM socket lock and the psock socket lock are acquired in both
the RX and TX path, however they take the locks in opposite order
which can lead to deadlock. The fix is to add try_sock_lock to see
if psock socket lock can get acquired in the TX path with KCM lock
held. If not, then KCM socket is released and the psock socket lock
and KCM socket lock are acquired in the same order as the RX path.

Tested:

Ran KCM traffic without incident.

v2: Remove patches to address potential deadlock. I couldn't convince
myself this is an issue after looking at the code some more.


Tom Herbert (2):
  sock: Add sock_owned_by_user_nocheck
  strparser: Call sock_owned_by_user_nocheck

 include/net/sock.h| 5 +
 net/strparser/strparser.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

-- 
2.11.0



[PATCH v2 net-next 1/2] sock: Add sock_owned_by_user_nocheck

2017-12-23 Thread Tom Herbert
This allows checking socket lock ownership without producing lockdep
warnings.

Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 include/net/sock.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 6c1db823f8b9..66fd3951e6f3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1515,6 +1515,11 @@ static inline bool sock_owned_by_user(const struct sock 
*sk)
return sk->sk_lock.owned;
 }
 
+static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
+{
+   return sk->sk_lock.owned;
+}
+
 /* no reclassification while locks are held */
 static inline bool sock_allow_reclassification(const struct sock *csk)
 {
-- 
2.11.0



[PATCH net-next 0/4] kcm: Fix two locking issues

2017-12-22 Thread Tom Herbert
One issue is lockdep warnings when sock_owned_by_user returns true
in strparser. Fix is to add and call sock_owned_by_user_nocheck since
the check for owned by user is not an error condition in this case.

The other issue is a potential deadlock between TX and RX paths

KCM socket lock and the psock socket lock are acquired in both
the RX and TX path, however they take the locks in opposite order
which can lead to deadlock. The fix is to add try_sock_lock to see
if psock socket lock can get acquired in the TX path with KCM lock
held. If not, then KCM socket is released and the psock socket lock
and KCM socket lock are acquired in the same order as the RX path.

Tested:

Ran KCM traffic without incident.

Tom Herbert (4):
  sock: Add sock_owned_by_user_nocheck
  strparser: Call sock_owned_by_user_nocheck
  sock_lock: Add try_sock_lock
  kcm: Address deadlock between TX and RX paths

 include/net/kcm.h |  1 +
 include/net/sock.h| 12 +
 net/core/sock.c   | 20 +++
 net/kcm/kcmsock.c | 64 ++-
 net/strparser/strparser.c |  2 +-
 5 files changed, 81 insertions(+), 18 deletions(-)

-- 
2.11.0



[PATCH net-next 4/4] kcm: Address deadlock between TX and RX paths

2017-12-22 Thread Tom Herbert
Both the transmit and receive paths of KCM need to take both the
KCM socket lock and the psock socket lock, however they take the
locks in opposite order which can lead to deadlock.

This patch changes the transmit path (kcm_write_msgs to be specific)
so the locks are taken in the proper order. try_sock_lock is first used
to get the lower socket lock, if that is successful then sending data
can proceed with dropping KCM lock. If try_sock_lock fails then the KCM
lock is released and lock_sock is done on the lower socket followed by
the lock_sock on the KCM sock.

A doing_write_msgs flag has been added to kcm structure to prevent
multiple threads doing write_msgs when the KCM lock is dropped.
kernel_sendpage_locked is now called to do the send data with lock
already held.

Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 include/net/kcm.h |  1 +
 net/kcm/kcmsock.c | 64 ---
 2 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 2a8965819db0..22bd7dd3eedb 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -78,6 +78,7 @@ struct kcm_sock {
/* Don't use bit fields here, these are set under different locks */
bool tx_wait;
bool tx_wait_more;
+   bool doing_write_msgs;
 
/* Receive */
struct kcm_psock *rx_psock;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d4e98f20fc2a..3eb3179b96b3 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -574,13 +574,19 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm)
 static int kcm_write_msgs(struct kcm_sock *kcm)
 {
 	struct sock *sk = &kcm->sk;
-   struct kcm_psock *psock;
-   struct sk_buff *skb, *head;
-   struct kcm_tx_msg *txm;
+   struct sk_buff *head = skb_peek(&sk->sk_write_queue);
unsigned short fragidx, frag_offset;
unsigned int sent, total_sent = 0;
+   struct kcm_psock *psock;
+   struct kcm_tx_msg *txm;
+   struct sk_buff *skb;
int ret = 0;
 
+   if (unlikely(kcm->doing_write_msgs))
+   return 0;
+
+   kcm->doing_write_msgs = true;
+
kcm->tx_wait_more = false;
psock = kcm->tx_psock;
if (unlikely(psock && psock->tx_stopped)) {
@@ -598,15 +604,36 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
return 0;
}
 
+try_again:
+   psock = reserve_psock(kcm);
+   if (!psock)
+   goto out_no_release_psock;
+
+   /* Get lock for lower sock */
+   if (!try_lock_sock(psock->sk)) {
+   /* Someone  else is holding the lower sock lock. We need to
+* release the KCM lock and get the psock lock first. This is
+* needed since the receive path obtains the locks in reverse
+* order and we want to avoid deadlock. Note that
+* write_msgs can't be reentered when we drop the KCM lock
+* since doing_write_msgs is set.
+*/
+   release_sock(&kcm->sk);
+
+   /* Take locks in order that receive path does */
+   lock_sock(psock->sk);
+   lock_sock(&kcm->sk);
+   }
+
+   /* At this point we have a reserved psock and its lower socket is
+* locked.
+*/
+
 	head = skb_peek(&sk->sk_write_queue);
txm = kcm_tx_msg(head);
 
if (txm->sent) {
/* Send of first skbuff in queue already in progress */
-   if (WARN_ON(!psock)) {
-   ret = -EINVAL;
-   goto out;
-   }
sent = txm->sent;
frag_offset = txm->frag_offset;
fragidx = txm->fragidx;
@@ -615,11 +642,6 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
goto do_frag;
}
 
-try_again:
-   psock = reserve_psock(kcm);
-   if (!psock)
-   goto out;
-
do {
skb = head;
txm = kcm_tx_msg(head);
@@ -643,11 +665,12 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
goto out;
}
 
-   ret = kernel_sendpage(psock->sk->sk_socket,
- frag->page.p,
- frag->page_offset + frag_offset,
- frag->size - frag_offset,
- MSG_DONTWAIT);
+   ret = kernel_sendpage_locked(psock->sk, frag->page.p,
+frag->page_offset +
+   frag_offset,
+frag->size -
+   frag_offset,
+  

[PATCH net-next 3/4] sock_lock: Add try_sock_lock

2017-12-22 Thread Tom Herbert
try_sock_lock is an opportunistic attempt to acquire a socket lock
without blocking or sleeping. If the socket lock is acquired then
true is returned, else false is returned.

Signed-off-by: Tom Herbert <t...@quantonium.net>
---
 include/net/sock.h |  7 +++
 net/core/sock.c| 20 
 2 files changed, 27 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 3b4ca2046f8c..69fdd1a89591 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1462,6 +1462,13 @@ static inline void lock_sock(struct sock *sk)
lock_sock_nested(sk, 0);
 }
 
+bool try_lock_sock_nested(struct sock *sk, int subclass);
+
+static inline bool try_lock_sock(struct sock *sk)
+{
+   return try_lock_sock_nested(sk, 0);
+}
+
 void release_sock(struct sock *sk);
 
 /* BH context may only use the following locking interface. */
diff --git a/net/core/sock.c b/net/core/sock.c
index 72d14b221784..40fb772e2d52 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2782,6 +2782,26 @@ void lock_sock_nested(struct sock *sk, int subclass)
 }
 EXPORT_SYMBOL(lock_sock_nested);
 
+bool try_lock_sock_nested(struct sock *sk, int subclass)
+{
+   spin_lock_bh(&sk->sk_lock.slock);
+   if (sk->sk_lock.owned) {
+   spin_unlock_bh(&sk->sk_lock.slock);
+   return false;
+   }
+
+   sk->sk_lock.owned = 1;
+   spin_unlock(&sk->sk_lock.slock);
+
+   /* The sk_lock has mutex_lock() semantics here: */
+
+   mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+   local_bh_enable();
+
+   return true;
+}
+EXPORT_SYMBOL(try_lock_sock_nested);
+
 void release_sock(struct sock *sk)
 {
 	spin_lock_bh(&sk->sk_lock.slock);
-- 
2.11.0



  1   2   3   4   5   6   7   8   9   10   >