date:20171205

[PATCH] xen-netback: Fix logging message with spurious period after newline

2017-12-05 Thread Joe Perches

Using a period after a newline causes bad output.

Signed-off-by: Joe Perches 
---
 drivers/net/xen-netback/interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/interface.c 
b/drivers/net/xen-netback/interface.c
index d6dff347f896..78ebe494fef0 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -186,7 +186,7 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct 
net_device *dev)
/* Obtain the queue to be used to transmit this packet */
index = skb_get_queue_mapping(skb);
if (index >= num_queues) {
-   pr_warn_ratelimited("Invalid queue %hu for packet on interface 
%s\n.",
+   pr_warn_ratelimited("Invalid queue %hu for packet on interface 
%s\n",
index, vif->dev->name);
index %= num_queues;
}
-- 
2.15.0

[PATCH net-next v2 2/2] bpf/tracing: add a bpf test for new ioctl query interface

2017-12-05 Thread Yonghong Song

Added a subtest in test_progs. The tracepoint is
sched/sched_switch. Multiple bpf programs are attached to
this tracepoint and the query interface is exercised.

Signed-off-by: Yonghong Song 
Acked-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/perf_event.h |   6 +
 tools/testing/selftests/bpf/Makefile  |   2 +-
 tools/testing/selftests/bpf/test_progs.c  | 155 ++
 tools/testing/selftests/bpf/test_tracepoint.c |  26 +
 4 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_tracepoint.c

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index 362493a..8523db0 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -418,6 +418,11 @@ struct perf_event_attr {
__u16   __reserved_2;   /* align to __u64 */
 };
 
+struct perf_event_query_bpf {
+   __u64   prog_ids;
+   __u32   prog_cnt;
+};
+
 #define perf_flags(attr)   (*(&(attr)->read_format + 1))
 
 /*
@@ -433,6 +438,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_ID  _IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
 #define PERF_EVENT_IOC_PAUSE_OUTPUT_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF   _IOWR('$', 10, struct 
perf_event_query_bpf *)
 
 enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 2c9d8c6..255fb1f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -17,7 +17,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
test_lru_map test_lpm_map test
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o 
test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
-   sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o
+   sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
test_offload.py
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 6942753..dde23ed 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -21,8 +21,10 @@ typedef __u16 __sum16;
 #include 
 #include 
 #include 
+#include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -617,6 +619,158 @@ static void test_obj_name(void)
}
 }
 
+static void test_tp_attach_query(void)
+{
+   const int num_progs = 3;
+   __u32 duration = 0, info_len, prog_ids[num_progs], 
saved_prog_ids[num_progs];
+   int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs];
+   const char *file = "./test_tracepoint.o";
+   struct perf_event_query_bpf query = {};
+   struct perf_event_attr attr = {};
+   struct bpf_object *obj[num_progs];
+   struct bpf_prog_info prog_info;
+   char buf[256];
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/sched/sched_switch/id");
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   return;
+   bytes = read(efd, buf, sizeof(buf));
+   close(efd);
+   if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+ "read", "bytes %d errno %d\n", bytes, errno))
+   return;
+
+   attr.config = strtol(buf, NULL, 0);
+   attr.type = PERF_TYPE_TRACEPOINT;
+   attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+   attr.sample_period = 1;
+   attr.wakeup_events = 1;
+
+   for (i = 0; i < num_progs; i++) {
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i],
+   &prog_fd[i]);
+   if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+   goto cleanup1;
+
+   bzero(&prog_info, sizeof(prog_info));
+   prog_info.jited_prog_len = 0;
+   prog_info.xlated_prog_len = 0;
+   prog_info.nr_map_ids = 0;
+   info_len = sizeof(prog_info);
+   err = bpf_obj_get_info_by_fd(prog_fd[i], &prog_info, &info_len);
+   if (CHECK(err, "bpf_obj_get_info_by_fd", "err %d errno %d\n",
+ err, errno))
+   goto cleanup1;
+   saved_prog_ids[i] = prog_info.id;
+
+   pmu_fd[i] = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+   0 /* cpu 0 */, -1 /* group id */,
+   0 /* flags */);
+   if (CHECK(pmu_fd[i] < 0, "perf_event_open", "err %d errno %d\n",
+ pmu_fd[i], errno))
+   goto cleanup2;
+   err = ioctl(pmu_fd[i], PERF_EV

[PATCH net-next v2 0/2] bpf/tracing: allow user space to query prog array on the same tp

2017-12-05 Thread Yonghong Song

Commit e87c6bc3852b ("bpf: permit multiple bpf attachments
for a single perf event") added support to attach multiple
bpf programs to a single perf event. Given a perf event
(kprobe, uprobe, or kernel tracepoint), the perf ioctl interface
is used to query bpf programs attached to the same trace event.

There already exists a BPF_PROG_QUERY command for introspection
currently used by cgroup+bpf. We did have an implementation for
querying tracepoint+bpf through the same interface. However, it
looks cleaner to use ioctl() style of api here, since attaching
bpf prog to tracepoint/kuprobe is also done via ioctl.

Patch #1 had the core implementation and patch #2 added
a test case in tools bpf selftests suite.

Changelogs:
v1-> v2:
  - Rebase on top of net-next.
  - Use existing bpf_prog_array_length function instead of
implementing the same functionality in function
bpf_prog_array_copy_info.

Yonghong Song (2):
  bpf/tracing: allow user space to query prog array on the same tp
  bpf/tracing: add a bpf test for new ioctl query interface

 include/linux/bpf.h   |   4 +
 include/uapi/linux/perf_event.h   |   6 +
 kernel/bpf/core.c |  21 
 kernel/events/core.c  |   3 +
 kernel/trace/bpf_trace.c  |  23 
 tools/include/uapi/linux/perf_event.h |   6 +
 tools/testing/selftests/bpf/Makefile  |   2 +-
 tools/testing/selftests/bpf/test_progs.c  | 155 ++
 tools/testing/selftests/bpf/test_tracepoint.c |  26 +
 9 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_tracepoint.c

-- 
2.9.5

[PATCH net-next v2 1/2] bpf/tracing: allow user space to query prog array on the same tp

2017-12-05 Thread Yonghong Song

Commit e87c6bc3852b ("bpf: permit multiple bpf attachments
for a single perf event") added support to attach multiple
bpf programs to a single perf event.
Commit 2541517c32be ("tracing, perf: Implement BPF programs
attached to kprobes") utilized the existing perf ioctl
interface and added the command PERF_EVENT_IOC_SET_BPF
to attach a bpf program to a tracepoint.

This patch adds a new ioctl
command, given a perf event fd, to query the bpf program array
attached to the same perf tracepoint event.

The new uapi ioctl command:
  PERF_EVENT_IOC_QUERY_BPF

The new uapi/linux/perf_event.h structure:
  struct perf_event_query_bpf {
   __u64prog_ids;
   __u32prog_cnt;
  };

The usage:
  struct perf_event_query_bpf query;
  query.prog_ids = (__u64)usr_prog_ids_buf;
  query.prog_cnt = usr_prog_ids_buf_len;
  err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, &query);

Signed-off-by: Yonghong Song 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf.h |  4 
 include/uapi/linux/perf_event.h |  6 ++
 kernel/bpf/core.c   | 21 +
 kernel/events/core.c|  3 +++
 kernel/trace/bpf_trace.c| 23 +++
 5 files changed, 57 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e425..f812ac5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -254,6 +254,7 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const 
void *src,
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
+int bpf_event_query_prog_array(struct perf_event *event, void __user *info);
 
 int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
  union bpf_attr __user *uattr);
@@ -285,6 +286,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu 
*progs,
 
 void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
struct bpf_prog *old_prog);
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+__u32 __user *prog_ids, u32 request_cnt,
+__u32 __user *prog_cnt);
 int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b9a4953..fee0b43 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -418,6 +418,11 @@ struct perf_event_attr {
__u16   __reserved_2;   /* align to __u64 */
 };
 
+struct perf_event_query_bpf {
+   __u64   prog_ids;
+   __u32   prog_cnt;
+};
+
 #define perf_flags(attr)   (*(&(attr)->read_format + 1))
 
 /*
@@ -433,6 +438,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_ID  _IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
 #define PERF_EVENT_IOC_PAUSE_OUTPUT_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF   _IOWR('$', 10, struct 
perf_event_query_bpf *)
 
 enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa..35b427aa 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1462,6 +1462,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array 
__rcu *progs,
rcu_read_lock();
prog = rcu_dereference(progs)->progs;
for (; *prog; prog++) {
+   if (*prog == &dummy_bpf_prog.prog)
+   continue;
id = (*prog)->aux->id;
if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
rcu_read_unlock();
@@ -1545,6 +1547,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu 
*old_array,
return 0;
 }
 
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+__u32 __user *prog_ids, u32 request_cnt,
+__u32 __user *prog_cnt)
+{
+   u32 cnt = 0;
+
+   if (array)
+   cnt = bpf_prog_array_length(array);
+
+   if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
+   return -EFAULT;
+
+   /* return early if user requested only program count or nothing to copy 
*/
+   if (!request_cnt || !prog_ids || !cnt)
+   return 0;
+
+   return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
struct bpf_prog_aux *aux;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4..f10609e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4723,6 +4723,9 @@ static long _perf_ioctl(struct perf_event *event, 
unsigned int cmd, unsigned lon
rcu_read_unlock();
return 0;
}
+
+   case PERF_EVENT_IOC_QUERY_BPF

Re: [PATCH v2 3/3] net: macb: change GFP_ATOMIC to GFP_KERNEL

2017-12-05 Thread Julia Lawall



On Tue, 5 Dec 2017, Julia Cartwright wrote:

> Now that the rx_fs_lock is no longer held across allocation, it's safe
> to use GFP_KERNEL for allocating new entries.
>
> This reverts commit 81da3bf6e3f88 ("net: macb: change GFP_KERNEL to
> GFP_ATOMIC").
>
> Cc: Julia Lawall 
> Signed-off-by: Julia Cartwright 

Acked-by: Julia Lawall 


> ---
>  drivers/net/ethernet/cadence/macb_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb_main.c 
> b/drivers/net/ethernet/cadence/macb_main.c
> index 758e8b3042b2..234667eaaa92 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -2800,7 +2800,7 @@ static int gem_add_flow_filter(struct net_device 
> *netdev,
>   int ret = -EINVAL;
>   bool added = false;
>
> - newfs = kmalloc(sizeof(*newfs), GFP_ATOMIC);
> + newfs = kmalloc(sizeof(*newfs), GFP_KERNEL);
>   if (newfs == NULL)
>   return -ENOMEM;
>   memcpy(&newfs->fs, fs, sizeof(newfs->fs));
> --
> 2.14.2
>
>

Re: [PATCH iproute2] iproute2: Fix undeclared __kernel_long_t type build error in RHEL 6.8

2017-12-05 Thread Leon Romanovsky

On Tue, Dec 05, 2017 at 05:33:25PM -0800, Stephen Hemminger wrote:
> On Fri,  1 Dec 2017 13:04:51 +0200
> Leon Romanovsky  wrote:
>
> > From: Leon Romanovsky 
> >
> > Add the asm/posix_types.h header file to the list of needed includes,
> > because the header files in RHEL 6.8 are too old and don't
> > have a declaration of __kernel_long_t.
> >
> > In file included from ../include/uapi/linux/kernel.h:5,
> >  from ../include/uapi/linux/netfilter/x_tables.h:4,
> >  from ../include/xtables.h:20,
> >  from em_ipset.c:26:
> > ../include/uapi/linux/sysinfo.h:9: error: expected specifier-qualifier-list 
> > before ‘__kernel_long_t’
> >
> > Cc: Riad Abo Raed 
> > Cc: Guy Ergas 
> > Signed-off-by: Leon Romanovsky 
> > ---
> > Stephen,
> > I don't know how to properly solve this type of error and would like to
> > hear your guidance on it.
> >
> > Should I simply add kernel file? Or maybe I need to add HAVE_xxx checks
> > to configure script to check __kernel_long_t existence and declare only
> > this type?
> >
> > I also have another build error on RHEL 6.8 system and looking for a
> > solution.
> >
> > In file included from em_ipset.c:26:
> > ../include/xtables.h:35:29: error: xtables-version.h: No such file or 
> > directory
> > make[1]: *** [em_ipset.o] Error 1
> >
> > The iptables-devel package is iptables-devel-1.4.7-16.el6.x86_64, so check_xt()
> > succeeds, but the RH headers don't have xtables-version.h and the relevant defines
> > are embedded in the main xtables.h header file.
> >
> > Thanks
> > ---
> >  include/uapi/asm/posix_types.h | 97 
> > +++
> >  1 file changed, 97 insertions(+)
> >  create mode 100644 include/uapi/asm/posix_types.h
> >
> > diff --git a/include/uapi/asm/posix_types.h b/include/uapi/asm/posix_types.h
> > new file mode 100644
> > index ..5e6ea22b
> > --- /dev/null
> > +++ b/include/uapi/asm/posix_types.h
> > @@ -0,0 +1,97 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +#ifndef __ASM_GENERIC_POSIX_TYPES_H
> > +#define __ASM_GENERIC_POSIX_TYPES_H
> > +
> > +#include 
> > +/*
> > + * This file is generally used by user-level software, so you need to
> > + * be a little careful about namespace pollution etc.
> > + *
> > + * First the types that are often defined in different ways across
> > + * architectures, so that you can override them.
> > + */
> > +
> > +#ifndef __kernel_long_t
> > +typedef long   __kernel_long_t;
> > +typedef unsigned long  __kernel_ulong_t;
> > +#endif
> > +
> > +#ifndef __kernel_ino_t
> > +typedef __kernel_ulong_t __kernel_ino_t;
> > +#endif
> > +
> > +#ifndef __kernel_mode_t
> > +typedef unsigned int   __kernel_mode_t;
> > +#endif
> > +
> > +#ifndef __kernel_pid_t
> > +typedef int__kernel_pid_t;
> > +#endif
> > +
> > +#ifndef __kernel_ipc_pid_t
> > +typedef int__kernel_ipc_pid_t;
> > +#endif
> > +
> > +#ifndef __kernel_uid_t
> > +typedef unsigned int   __kernel_uid_t;
> > +typedef unsigned int   __kernel_gid_t;
> > +#endif
> > +
> > +#ifndef __kernel_suseconds_t
> > +typedef __kernel_long_t__kernel_suseconds_t;
> > +#endif
> > +
> > +#ifndef __kernel_daddr_t
> > +typedef int__kernel_daddr_t;
> > +#endif
> > +
> > +#ifndef __kernel_uid32_t
> > +typedef unsigned int   __kernel_uid32_t;
> > +typedef unsigned int   __kernel_gid32_t;
> > +#endif
> > +
> > +#ifndef __kernel_old_uid_t
> > +typedef __kernel_uid_t __kernel_old_uid_t;
> > +typedef __kernel_gid_t __kernel_old_gid_t;
> > +#endif
> > +
> > +#ifndef __kernel_old_dev_t
> > +typedef unsigned int   __kernel_old_dev_t;
> > +#endif
> > +
> > +/*
> > + * Most 32 bit architectures use "unsigned int" size_t,
> > + * and all 64 bit architectures use "unsigned long" size_t.
> > + */
> > +#ifndef __kernel_size_t
> > +#if __BITS_PER_LONG != 64
> > +typedef unsigned int   __kernel_size_t;
> > +typedef int__kernel_ssize_t;
> > +typedef int__kernel_ptrdiff_t;
> > +#else
> > +typedef __kernel_ulong_t __kernel_size_t;
> > +typedef __kernel_long_t__kernel_ssize_t;
> > +typedef __kernel_long_t__kernel_ptrdiff_t;
> > +#endif
> > +#endif
> > +
> > +#ifndef __kernel_fsid_t
> > +typedef struct {
> > +   int val[2];
> > +} __kernel_fsid_t;
> > +#endif
> > +
> > +/*
> > + * anything below here should be completely generic
> > + */
> > +typedef __kernel_long_t__kernel_off_t;
> > +typedef long long  __kernel_loff_t;
> > +typedef __kernel_long_t__kernel_time_t;
> > +typedef __kernel_long_t__kernel_clock_t;
> > +typedef int__kernel_timer_t;
> > +typedef int__kernel_clockid_t;
> > +typedef char * __kernel_caddr_t;
> > +typedef unsigned short __kernel_uid16_t;
> > +typedef unsigned short __kernel_gid16_t;
> > +
> > +#endif /* __ASM_GENERIC_POSIX_TYPES_H */
> > --
> > 2.15.1
> >
>
> This isn't going to be supportabl

Re: Sending 802.1Q packets using AF_PACKET socket on filtered bridge forwards with wrong MAC addresses

2017-12-05 Thread Toshiaki Makita

Hi,
(CC: Vlad)

On 2017/11/30 7:01, Brandon Carpenter wrote:
> I narrowed the search to a memmove() called from
> skb_reorder_vlan_header() in net/core/skbuff.c.
> 
>> memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
>>2 * ETH_ALEN);
> 
> Calling skb_reset_mac_len() after skb_reset_mac_header() before
> calling br_allowed_ingress() in net/bridge/br_device.c fixes the
> problem.
> 
> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
> index af5b8c87f590..e10131e2f68f 100644
> --- a/net/bridge/br_device.c
> +++ b/net/bridge/br_device.c
> @@ -58,6 +58,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct
> net_device *dev)
> BR_INPUT_SKB_CB(skb)->brdev = dev;
> 
> skb_reset_mac_header(skb);
> +   skb_reset_mac_len(skb);
> eth = eth_hdr(skb);
> skb_pull(skb, ETH_HLEN);

Thanks for debugging this problem.
It seems this has been broken since a6e18ff11170 ("vlan: Fix untag
operations of stacked vlans with REORDER_HEADER off").

Unfortunately this does not always work correctly, since in tx path
drivers assume network header to be set to L3 protocol header offset.
Packet socket (packet_snd()) determines network header by
dev_hard_header which is ETH_HLEN in bridge devices, so this works for
packet socket, but with vlan devices on top of bridge device with
tx-vlan hwaccel disabled we get ETH_HLEN + VLAN_HLEN or longer by mac_len.

Since mac_len can be arbitrarily long if we stack vlan devices on bridge
devices, and since we want to untag the outermost tag, using mac_len to
untag in tx path is probably no longer correct.

I'll think deeper about how to fix it.

> I'll put together an official patch  and submit it. Should I use
> another email account? Are my emails being ignored because of that
> stupid disclaimer my employer attaches to my messages (outside my
> control)?
> 
> Brandon
> 

-- 
Toshiaki Makita

Re: Transport mode xfrm_gro

2017-12-05 Thread Herbert Xu

On Wed, Dec 06, 2017 at 02:37:17PM +1100, Herbert Xu wrote:
>
> So why is xfrm_input in the xfrm_gro case trying to reinject the
> skb into the network stack?

Nevermind, I see now that transport_finish has code to skip xfrm_gro
packets.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH net-next] tun: avoid unnecessary READ_ONCE in tun_net_xmit

2017-12-05 Thread Jason Wang




On 2017年12月06日 11:11, Willem de Bruijn wrote:

From: Willem de Bruijn 

The statement no longer serves a purpose.

Commit fa35864e0bb7 ("tuntap: Fix for a race in accessing numqueues")
added the ACCESS_ONCE to avoid a race condition with skb_queue_len.

Commit 436accebb530 ("tuntap: remove unnecessary sk_receive_queue
length check during xmit") removed the affected skb_queue_len check.

Commit 96f84061620c ("tun: add eBPF based queue selection method")
split the function, reading the field a second time in the callee.
The temp variable is now only read once, so just remove it.

Signed-off-by: Willem de Bruijn 
---
  drivers/net/tun.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 787cc35ef89b..c2ad8f3858d1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -990,14 +990,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, 
struct net_device *dev)
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
-   u32 numqueues = 0;
  
  	rcu_read_lock();

tfile = rcu_dereference(tun->tfiles[txq]);
-   numqueues = READ_ONCE(tun->numqueues);
  
  	/* Drop packet if interface is not attached */

-   if (txq >= numqueues)
+   if (txq >= tun->numqueues)
goto drop;
  
  	if (!rcu_dereference(tun->steering_prog))


Acked-by: Jason Wang 

Thanks

Transport mode xfrm_gro

2017-12-05 Thread Herbert Xu

Hi Steffen:

I'm looking at the function xfrm_input near the end where it deals
with transport mode packets:

err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || 
async);
if (xfrm_gro) {
if (skb->sp)
skb->sp->olen = 0;
skb_dst_drop(skb);
gro_cells_receive(&gro_cells, skb);
return err;
}

This looks wrong because in transport mode, transport_finish is
well within its rights to consume and free the skb.  For example,
IPv4 transport_finish eventually calls xfrm4_rcv_encap_finish which
does:

if (!skb_dst(skb)) {
const struct iphdr *iph = ip_hdr(skb);

if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
 iph->tos, skb->dev))
goto drop;
}
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;

Whichever path it takes the skb is either gone or belongs to someone
else.

So why is xfrm_input in the xfrm_gro case trying to reinject the
skb into the network stack?

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH] ptr_ring: add barriers

2017-12-05 Thread Jason Wang




On 2017年12月06日 10:53, Michael S. Tsirkin wrote:

On Wed, Dec 06, 2017 at 10:31:39AM +0800, Jason Wang wrote:


On 2017年12月06日 03:29, Michael S. Tsirkin wrote:

Users of ptr_ring expect that it's safe to give the
data structure a pointer and have it be available
to consumers, but that actually requires an smp_wmb
or a stronger barrier.

In the absence of such barriers and on architectures that reorder writes,
a consumer might read an uninitialized value from an skb pointer stored
in the skb array.  This was observed causing crashes.

To fix, add memory barriers.  The barrier we use is a wmb, the
assumption being that producers do not need to read the value so we do
not need to order these reads.

Reported-by: George Cherian 
Suggested-by: Jason Wang 
Signed-off-by: Michael S. Tsirkin 
---

George, could you pls report whether this patch fixes
the issue for you?

This seems to be needed in stable as well.




   include/linux/ptr_ring.h | 9 +
   1 file changed, 9 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 37b4bb2..6866df4 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
   /* Note: callers invoking this in a loop must use a compiler barrier,
* for example cpu_relax(). Callers must hold producer_lock.
+ * Callers are responsible for making sure pointer that is being queued
+ * points to a valid data.
*/
   static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
   {
if (unlikely(!r->size) || r->queue[r->producer])
return -ENOSPC;
+   /* Make sure the pointer we are storing points to a valid data. */
+   /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
+   smp_wmb();
+
r->queue[r->producer++] = ptr;
if (unlikely(r->producer >= r->size))
r->producer = 0;
@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
if (ptr)
__ptr_ring_discard_one(r);
+   /* Make sure anyone accessing data through the pointer is up to date. */
+   /* Pairs with smp_wmb in __ptr_ring_produce. */
+   smp_read_barrier_depends();
return ptr;
   }

I was thinking whether or not it's better to move those to the callers. Then
we can save lots of barriers in e.g batch consuming.

Thanks

Batch consumers only do smp_read_barrier_depends which is free on
non-alpha. I suggest we do the simple thing for stable and reserve
optimizations for later.



Right.

Acked-by: Jason Wang

Re: [PATCH net-next 00/12] sctp: Implement Stream Interleave: The I-DATA Chunk Supporting User Message Interleaving

2017-12-05 Thread Xin Long

On Wed, Dec 6, 2017 at 1:30 AM, Marcelo Ricardo Leitner
 wrote:
> On Tue, Dec 05, 2017 at 11:15:57PM +0800, Xin Long wrote:
>> Stream Interleave would be Implemented in two Parts:
>>1. The I-DATA Chunk Supporting User Message Interleaving
>>2. Interaction with Other SCTP Extensions
>>
>
> I have reviewed this patchset a couple of times already before the
> posting and other than the missing blank line (heh), it looks good to
> me. Would ack it now but we'll need a respin for the newline.
OK, thanks !

>
> Xin, please wait a bit before respining it. Maybe Neil and others have
> more comments on it.
Sure,

I added part 2 (Interaction with Other SCTP Extensions) only
as an attachment here (patchset_2.tar.gz), so that the big
picture is clearer when reviewing this series.

I also attached the test cases I've written, based on sctp-tests
(conformance.tar.gz). It includes:
idata with ulp layer process
idata with stream reconfig
idata with stream scheduler
idata with sctp prsctp
idata with auth

(idata with sctp-tests others old tests)

Note that another file (debug.tar.gz) contains some patches for sctp
that make these tests easier to run.

conformance.tar.gz
Description: GNU Zip compressed data

debug.tar.gz
Description: GNU Zip compressed data

patchset_2.tar.gz
Description: GNU Zip compressed data

Re: [PATCH] netlink: Add netns check on taps

2017-12-05 Thread Kevin Cernekee

On Tue, Dec 5, 2017 at 6:19 PM, David Ahern  wrote:
>> + if (!net_eq(dev_net(dev), sock_net(sk)) &&
>> + !net_eq(dev_net(dev), &init_net)) {
>
> Why is init_net special? Seems like snooping should be limited to the
> namespace you are in.

Depends how important it is to preserve the current "typical use case"
behavior, where the root user in the init netns can see all netlink
traffic on the system.

[PATCH net-next] tun: avoid unnecessary READ_ONCE in tun_net_xmit

2017-12-05 Thread Willem de Bruijn

From: Willem de Bruijn 

The statement no longer serves a purpose.

Commit fa35864e0bb7 ("tuntap: Fix for a race in accessing numqueues")
added the ACCESS_ONCE to avoid a race condition with skb_queue_len.

Commit 436accebb530 ("tuntap: remove unnecessary sk_receive_queue
length check during xmit") removed the affected skb_queue_len check.

Commit 96f84061620c ("tun: add eBPF based queue selection method")
split the function, reading the field a second time in the callee.
The temp variable is now only read once, so just remove it.

Signed-off-by: Willem de Bruijn 
---
 drivers/net/tun.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 787cc35ef89b..c2ad8f3858d1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -990,14 +990,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, 
struct net_device *dev)
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
-   u32 numqueues = 0;
 
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
-   numqueues = READ_ONCE(tun->numqueues);
 
/* Drop packet if interface is not attached */
-   if (txq >= numqueues)
+   if (txq >= tun->numqueues)
goto drop;
 
if (!rcu_dereference(tun->steering_prog))
-- 
2.15.1.424.g9478a66081-goog

Re: Linux 4.14 - regression: broken tun/tap / bridge network with virtio - bisected

2017-12-05 Thread Jason Wang




On 2017年12月06日 00:23, Andreas Hartmann wrote:

On 12/05/2017 at 04:50 AM Jason Wang wrote:


On 2017年12月05日 00:28, Andreas Hartmann wrote:

On 12/03/2017 at 12:35 PM Andreas Hartmann wrote:

On 12/01/2017 at 11:11 AM Andreas Hartmann wrote:

Hello!

I hopefully could get rid of both of my problems (hanging network w/
virtio) and endless hanging qemu-process on VM shutdown by upgrading
qemu from 2.6.2 to 2.10.1. I hope it will persist.

It didn't persist. 10h later - same problems happened again. It's just
much harder to trigger the problems.

I'm now trying it with

CONFIG_RCU_NOCB_CPU=y and
rcu_nocbs=0-15

Since then, I didn't see any problem any more. But this doesn't mean
anything until now ... .

Didn't work either. Disabling vhost_net's zcopy didn't have any effect, either.

=> It's just finally broken since

2ddf71e23cc246e95af72a6deed67b4a50a7b81c
net: add notifier hooks for devmap bpf map

Hi:

Did you use XDP devmap in host? If not, please double check it was the
first bad commit since the patch should only work when XDP/devmap is
used on host.

How do I know if XDP/devmap is enabled / used? Could you please give
some hint?


Thanks,
Andreas


Something like:

./ip link | grep xdp
10: tap0:  mtu 1500 xdp qdisc mq master 
kvmbr0 state UNKNOWN mode DEFAULT group default qlen 1000

    prog/xdp id 4 tag 0381911915bc8d7f

But you should have some recent version of ip.

Thanks

Re: [PATCH] ptr_ring: add barriers

2017-12-05 Thread Michael S. Tsirkin

On Wed, Dec 06, 2017 at 10:31:39AM +0800, Jason Wang wrote:
> 
> 
> On 2017年12月06日 03:29, Michael S. Tsirkin wrote:
> > Users of ptr_ring expect that it's safe to give the
> > data structure a pointer and have it be available
> > to consumers, but that actually requires an smp_wmb
> > or a stronger barrier.
> > 
> > In the absence of such barriers and on architectures that reorder writes,
> > a consumer might read an uninitialized value from an skb pointer stored
> > in the skb array.  This was observed causing crashes.
> > 
> > To fix, add memory barriers.  The barrier we use is a wmb, the
> > assumption being that producers do not need to read the value so we do
> > not need to order these reads.
> > 
> > Reported-by: George Cherian 
> > Suggested-by: Jason Wang 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> > 
> > George, could you pls report whether this patch fixes
> > the issue for you?
> > 
> > This seems to be needed in stable as well.
> > 
> > 
> > 
> > 
> >   include/linux/ptr_ring.h | 9 +
> >   1 file changed, 9 insertions(+)
> > 
> > diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
> > index 37b4bb2..6866df4 100644
> > --- a/include/linux/ptr_ring.h
> > +++ b/include/linux/ptr_ring.h
> > @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring 
> > *r)
> >   /* Note: callers invoking this in a loop must use a compiler barrier,
> >* for example cpu_relax(). Callers must hold producer_lock.
> > + * Callers are responsible for making sure pointer that is being queued
> > + * points to a valid data.
> >*/
> >   static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
> >   {
> > if (unlikely(!r->size) || r->queue[r->producer])
> > return -ENOSPC;
> > +   /* Make sure the pointer we are storing points to a valid data. */
> > +   /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
> > +   smp_wmb();
> > +
> > r->queue[r->producer++] = ptr;
> > if (unlikely(r->producer >= r->size))
> > r->producer = 0;
> > @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring 
> > *r)
> > if (ptr)
> > __ptr_ring_discard_one(r);
> > +   /* Make sure anyone accessing data through the pointer is up to date. */
> > +   /* Pairs with smp_wmb in __ptr_ring_produce. */
> > +   smp_read_barrier_depends();
> > return ptr;
> >   }
> 
> I was thinking whether or not it's better to move those to the callers. Then
> we can save lots of barriers in e.g batch consuming.
> 
> Thanks

Batch consumers only do smp_read_barrier_depends which is free on
non-alpha. I suggest we do the simple thing for stable and reserve
optimizations for later.

-- 
MST

dsa: dsa_slave_port_obj_del calls multiple times with SWITCHDEV_OBJ_ID_HOST_MDB obj id

2017-12-05 Thread Tristram.Ha

I found that the latest net-next kernel calls dsa_slave_port_obj_del() multiple
times, once for each port, with the host port as the parameter.

As the base driver cannot find an entry with that host port, it returns an error
and so users will see a lot of failures from the DSA switch.

Is this new behavior that the driver needs to handle?  I do not think I saw
this in previous versions.

A typical operation is that a PC connected to a port in a switch wants to send
multicast packets.  It broadcasts an IGMP membership join message.  Function
dsa_slave_port_obj_add is called to set up an entry in the lookup table.  When
an IGMP membership leave message is received, dsa_slave_port_obj_del will be
called after a delay.  But then it is called for each port with the host port
as the parameter.

Another issue is that the host port can set up an entry for IPv6 neighbor
discovery like 33:33:FF:??:??:??.  When it leaves, a failure message will be
displayed for lan2, lan3, and so on.  It seems the first deletion is coming
from lan1.

Re: [PATCH] ptr_ring: add barriers

2017-12-05 Thread Jason Wang




On 2017年12月06日 03:29, Michael S. Tsirkin wrote:

Users of ptr_ring expect that it's safe to give the
data structure a pointer and have it be available
to consumers, but that actually requires an smp_wmb
or a stronger barrier.

In the absence of such barriers and on architectures that reorder writes,
a consumer might read an uninitialized value from an skb pointer stored
in the skb array.  This was observed causing crashes.

To fix, add memory barriers.  The barrier we use is a wmb, the
assumption being that producers do not need to read the value so we do
not need to order these reads.

Reported-by: George Cherian 
Suggested-by: Jason Wang 
Signed-off-by: Michael S. Tsirkin 
---

George, could you pls report whether this patch fixes
the issue for you?

This seems to be needed in stable as well.




  include/linux/ptr_ring.h | 9 +
  1 file changed, 9 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 37b4bb2..6866df4 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
  
  /* Note: callers invoking this in a loop must use a compiler barrier,

   * for example cpu_relax(). Callers must hold producer_lock.
+ * Callers are responsible for making sure pointer that is being queued
+ * points to a valid data.
   */
  static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
  {
if (unlikely(!r->size) || r->queue[r->producer])
return -ENOSPC;
  
+	/* Make sure the pointer we are storing points to a valid data. */

+   /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
+   smp_wmb();
+
r->queue[r->producer++] = ptr;
if (unlikely(r->producer >= r->size))
r->producer = 0;
@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
if (ptr)
__ptr_ring_discard_one(r);
  
+	/* Make sure anyone accessing data through the pointer is up to date. */

+   /* Pairs with smp_wmb in __ptr_ring_produce. */
+   smp_read_barrier_depends();
return ptr;
  }
  


I was thinking whether or not it's better to move those to the callers. 
Then we can save lots of barriers in e.g batch consuming.


Thanks

Re: [PATCH net-next V3] tun: add eBPF based queue selection method

2017-12-05 Thread Jason Wang

On 2017年12月06日 01:02, David Miller wrote:

From: Jason Wang 
Date: Mon,  4 Dec 2017 17:31:23 +0800

This patch introduces an eBPF based queue selection method. With this,
the policy could be offloaded to userspace completely through a new
ioctl TUNSETSTEERINGEBPF.

Signed-off-by: Jason Wang 
---
Changes from V2:
- call rtnl during netdev free
- switch to use call_rcu() to prevent DOS from userspace
- drop the policies setting/getting ioctls and allow detach through
   passing -1 as fd

Applied, thanks Jason.

I really wish this driver had newlink/changelink support rather than
us adding all of these ioctls...

Yes, will add this in my todo list.

Thanks

Re: [PATCH] netlink: Add netns check on taps

2017-12-05 Thread David Ahern

On 12/5/17 3:46 PM, Kevin Cernekee wrote:
> Currently, a nlmon link inside a child namespace can observe systemwide
> netlink activity.  Filter the traffic so that in a non-init netns,
> nlmon can only sniff netlink messages from its own netns.
> 
> Test case:
> 
> vpnns -- bash -c "ip link add nlmon0 type nlmon; \
>   ip link set nlmon0 up; \
>   tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" &
> sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \
> spi 0x1 mode transport \
> auth sha1 0x616263313233 \
> enc aes 0x
> grep abc123 /tmp/nlmon.pcap
> 
> Signed-off-by: Kevin Cernekee 
> ---
>  net/netlink/af_netlink.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index b9e0ee4..88381a2 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -253,6 +253,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
>   struct sock *sk = skb->sk;
>   int ret = -ENOMEM;
>  
> + if (!net_eq(dev_net(dev), sock_net(sk)) &&
> + !net_eq(dev_net(dev), &init_net)) {

Why is init_net special? Seems like snooping should be limited to the
namespace you are in.

[PATCH net] enic: add wq clean up budget

2017-12-05 Thread Govindarajulu Varadarajan

In case of tx clean up, we set '-1' as budget. This means clean up until
wq is empty or till (1 << 32) pkts are cleaned. Under heavy load this
will run for a long time and cause
"watchdog: BUG: soft lockup - CPU#25 stuck for 21s!" warning.

This patch sets wq clean up budget to 256.

Signed-off-by: Govindarajulu Varadarajan 
---
 drivers/net/ethernet/cisco/enic/enic.h  | 2 ++
 drivers/net/ethernet/cisco/enic/enic_main.c | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h 
b/drivers/net/ethernet/cisco/enic/enic.h
index 6a9527004cb1..9b218f0e5a4c 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -43,6 +43,8 @@
 #define ENIC_CQ_MAX(ENIC_WQ_MAX + ENIC_RQ_MAX)
 #define ENIC_INTR_MAX  (ENIC_CQ_MAX + 2)
 
+#define ENIC_WQ_NAPI_BUDGET256
+
 #define ENIC_AIC_LARGE_PKT_DIFF3
 
 struct enic_msix_entry {
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c 
b/drivers/net/ethernet/cisco/enic/enic_main.c
index d98676e43e03..f202ba72a811 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1500,7 +1500,7 @@ static int enic_poll(struct napi_struct *napi, int budget)
unsigned int cq_wq = enic_cq_wq(enic, 0);
unsigned int intr = enic_legacy_io_intr();
unsigned int rq_work_to_do = budget;
-   unsigned int wq_work_to_do = -1; /* no limit */
+   unsigned int wq_work_to_do = ENIC_WQ_NAPI_BUDGET;
unsigned int  work_done, rq_work_done = 0, wq_work_done;
int err;
 
@@ -1598,7 +1598,7 @@ static int enic_poll_msix_wq(struct napi_struct *napi, 
int budget)
struct vnic_wq *wq = &enic->wq[wq_index];
unsigned int cq;
unsigned int intr;
-   unsigned int wq_work_to_do = -1; /* clean all desc possible */
+   unsigned int wq_work_to_do = ENIC_WQ_NAPI_BUDGET;
unsigned int wq_work_done;
unsigned int wq_irq;
 
-- 
2.15.1

Re: [PATCH V11 4/5] vsprintf: add printk specifier %px

2017-12-05 Thread Sergey Senozhatsky

On (12/05/17 17:59), Linus Torvalds wrote:
[..]
> On Tue, Dec 5, 2017 at 5:36 PM, Sergey Senozhatsky
>  wrote:
> > I see some %p-s being used in _supposedly_ important output,
> > like arch/x86/mm/fault.c
> >
> > show_fault_oops(struct pt_regs *regs, unsigned long error_code,
> > unsigned long address)
> > ...
> > printk(KERN_CONT " at %p\n", (void *) address);
> > printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
> 
> So %pS isn't %p, and shows the symbolic name.

sure, agreed. by "some %p-s being used" I meant the grep result,
not just x86 show_fault_oops().


> But yes, that "at %p" should definitely be %px.

more %p grepping [filtering out all `%ps %pf %pb' variants] gives
a huge number of print outs that potentially can be broken now

arch/x86/kernel/kprobes/core.c: printk(KERN_WARNING "Unrecoverable 
kprobe detected at %p.\n",
arch/x86/kernel/kprobes/core.c:"current sp %p does not 
match saved sp %p\n",
arch/x86/kernel/kprobes/core.c: printk(KERN_ERR "Saved 
registers for jprobe %p\n", jp);

arch/x86/kernel/head_32.S:  .asciz "Unknown interrupt or fault at: %p %p 
%p\n"
arch/x86/kernel/irq_32.c:   printk(KERN_DEBUG "CPU %u irqstacks, hard=%p 
soft=%p\n",

arch/x86/kernel/smpboot.c:  pr_debug("Stack at about %p\n", &cpuid);
arch/x86/kernel/traps.c:printk(KERN_EMERG "BUG: stack guard page was 
hit at %p (stack is %p..%p)\n",


so I'm not in position to suggest the removal of those print outs or to
decide if those are important at all, just saying that that "I'm confused
by pointer values and can't debug" might be more likely than we thought.


> So my gut feel is that those printouts should probably just be
> removed. They have some very old historical reasons: we've printed out
> the page directory pointers (and followed the page tables) since at
> least back in the 1.1.x days. This is from the 1.1.7 patch, back when
> mm/memory.c was all about x86:

I see, thanks.

-ss

Re: [PATCH V11 4/5] vsprintf: add printk specifier %px

2017-12-05 Thread Linus Torvalds

On Tue, Dec 5, 2017 at 5:36 PM, Sergey Senozhatsky
 wrote:
> I see some %p-s being used in _supposedly_ important output,
> like arch/x86/mm/fault.c
>
> show_fault_oops(struct pt_regs *regs, unsigned long error_code,
> unsigned long address)
> ...
> printk(KERN_CONT " at %p\n", (void *) address);
> printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);

So %pS isn't %p, and shows the symbolic name.

But yes, that "at %p" should definitely be %px.

In fact, it used to be a "%08lx" - and the value we print out is
"unsigned long" - but then when we unified the 32- and 64-bit
architectures, using "%p" and a cast was a convenient way to unify the
32-bit %08lx and the 64-bit %016lx formats.

Will fix.

> a quick %p grep gives me the following list:
...
> or is it OK to show hashes instead of pgd or pmd pointers?

So my gut feel is that those printouts should probably just be
removed. They have some very old historical reasons: we've printed out
the page directory pointers (and followed the page tables) since at
least back in the 1.1.x days. This is from the 1.1.7 patch, back when
mm/memory.c was all about x86:

+   printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n",
+   current->tss.cr3, user_esp);
+   user_esp = ((unsigned long *) user_esp)[address >> 22];
+   printk(KERN_ALERT "*pde = %08lx\n", user_esp);

so it's more historical than sensible, I think.

   Linus

Re: [PATCH 2/2] veth: allow configuring GSO maximums

2017-12-05 Thread Solio Sarabia

On Tue, Dec 05, 2017 at 05:25:10PM -0800, Stephen Hemminger wrote:
> On Tue,  5 Dec 2017 17:14:26 -0800
> Solio Sarabia  wrote:
> 
> > From: Stephen Hemminger 
> > 
> > Veth's can be used in environments (like Azure) where the underlying
> > network device is impacted by large GSO packets. This patch allows
> > gso maximum values to be passed in when creating the device via
> > netlink.
> > 
> > In theory, other pseudo devices could also use netlink attributes
> > to set GSO maximums but for now veth is what has been observed
> > to be an issue.
> > 
> > Signed-off-by: Stephen Hemminger 
> > Signed-off-by: Solio Sarabia 
> 
> I am testing new version with changelink support

Ack, I second whatever version works best.
Will help to try newer patches.

[PATCH net-next] rds: debug: fix null check on static array

2017-12-05 Thread Prashant Bhole

t_name cannot be NULL since it is an array field of a struct.
Replace the null check on the static array with a string length check
using strnlen().

Signed-off-by: Prashant Bhole 
---
 net/rds/connection.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 9efc82c665b5..6492c0b608a4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
 
rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
  conn, &laddr, &faddr,
- trans->t_name ? trans->t_name : "[unknown]",
- is_outgoing ? "(outgoing)" : "");
+ strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
+ "[unknown]", is_outgoing ? "(outgoing)" : "");
 
/*
 * Since we ran without holding the conn lock, someone could
-- 
2.13.6

RE: [PATCH net-next v3 4/4] net: fec: add phy_reset_after_clk_enable() support

2017-12-05 Thread Andy Duan

From: Richard Leitner  Sent: Tuesday, December 05, 2017 9:26 PM
>Some PHYs (for example the SMSC LAN8710/LAN8720) don't allow turning
>the refclk on and off again during operation (according to their datasheet).
>Nonetheless exactly this behaviour was introduced for power saving reasons
>by commit e8fcfcd5684a ("net: fec: optimize the clock management to save
>power").
>Therefore add support for the phy_reset_after_clk_enable function from
>phylib to mitigate this issue.
>
>Generally speaking this issue is only relevant if the ref clk for the PHY is
>generated by the SoC and therefore the PHY is configured to "REF_CLK In
>Mode". In our specific case (PCB) this problem does occur at about every 10th
>to 50th POR of an LAN8710 connected to an i.MX6SOLO SoC. The typical
>symptom of this problem is a "swinging" ethernet link.
>Similar issues were reported by users of the NXP forum:
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Fcommunity.nxp.com%2Fthread%2F389902&data=02%7C01%7Cfugang.du
>an%40nxp.com%7C7f9fee272fc44662c2a108d53be3d1ee%7C686ea1d3bc2b4c6
>fa92cd99c5c301635%7C0%7C0%7C636480772022331090&sdata=7RdUsoWVWu
>o1nM5zKwLt7%2F6U3dxgDJtBDGlQCUWC6IM%3D&reserved=0
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Fcommunity.nxp.com%2Fmessage%2F309354&data=02%7C01%7Cfugang.d
>uan%40nxp.com%7C7f9fee272fc44662c2a108d53be3d1ee%7C686ea1d3bc2b4
>c6fa92cd99c5c301635%7C0%7C0%7C636480772022331090&sdata=D56KilGWD3
>kLABxc0yOI%2B44Y%2FhLfrGtdAvupCEyvI%2BI%3D&reserved=0
>With this patch applied the issue didn't occur for at least a few hundred PORs
>of our board.
>
>Fixes: e8fcfcd5684a ("net: fec: optimize the clock management to save
>power")
>Signed-off-by: Richard Leitner 
>---
> drivers/net/ethernet/freescale/fec_main.c | 7 +++
> 1 file changed, 7 insertions(+)
>
>diff --git a/drivers/net/ethernet/freescale/fec_main.c
>b/drivers/net/ethernet/freescale/fec_main.c
>index 610573855213..8c3d0fb7db20 100644
>--- a/drivers/net/ethernet/freescale/fec_main.c
>+++ b/drivers/net/ethernet/freescale/fec_main.c
>@@ -1862,6 +1862,8 @@ static int fec_enet_clk_enable(struct net_device
>*ndev, bool enable)
>   ret = clk_prepare_enable(fep->clk_ref);
>   if (ret)
>   goto failed_clk_ref;
>+
>+  phy_reset_after_clk_enable(ndev->phydev);
>   } else {
>   clk_disable_unprepare(fep->clk_ahb);
>   clk_disable_unprepare(fep->clk_enet_out);
>@@ -2860,6 +2862,11 @@ fec_enet_open(struct net_device *ndev)
>   if (ret)
>   goto err_enet_mii_probe;
>
>+  /* reset phy if needed here, due to the fact this is the first time we
>+   * have the net_device to phy_driver link
>+   */
>+  phy_reset_after_clk_enable(ndev->phydev);
>+

The patch series look better.
But why does it need to reset the phy here, since the phy is already hard reset
after clock enable?


>   if (fep->quirks & FEC_QUIRK_ERR006687)
>   imx6q_cpuidle_fec_irqs_used();
>
>--
>2.11.0

[PATCH v1 net-next 0/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

This patch requires the previous patch for Microchip KSZ8795 DSA driver.

v1
- For latest KSZ8795 v3 patch

Tristram Ha (1):
  Add Microchip KSZ8895 DSA driver.

 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8895.c | 1274 +++
 drivers/net/dsa/microchip/ksz8895_reg.h |  824 
 drivers/net/dsa/microchip/ksz8895_spi.c |  157 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 6 files changed, 2275 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8895.c
 create mode 100644 drivers/net/dsa/microchip/ksz8895_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8895_spi.c

-- 
1.9.1

[PATCH v3 net-next] net: dsa: Modify tag_ksz.c so that tail tag code can be used by other KSZ switch drivers

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Modify tag_ksz.c so that tail tag code can be used by other KSZ switch
drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
v3
- For latest KSZ9477 patch

v2
- No new feature is introduced

v1
- Switch driver code is not accessed from tag_ksz.c

 drivers/net/dsa/microchip/Kconfig   |  2 +-
 drivers/net/dsa/microchip/ksz9477.c |  2 +-
 include/net/dsa.h   |  2 +-
 net/dsa/Kconfig |  4 ++
 net/dsa/dsa.c   |  4 +-
 net/dsa/dsa_priv.h  |  2 +-
 net/dsa/tag_ksz.c   | 90 -
 7 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index 5a8660d..ab8f9f6 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -1,7 +1,7 @@
 menuconfig MICROCHIP_KSZ9477
tristate "Microchip KSZ9477 series switch support"
depends on NET_DSA
-   select NET_DSA_TAG_KSZ
+   select NET_DSA_TAG_KSZ9477
help
  This driver adds support for Microchip KSZ9477 switch chips.
 
diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 8bae36b1..ca1ccea 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -344,7 +344,7 @@ static void ksz9477_port_init_cnt(struct ksz_device *dev, 
int port)
 static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
  int port)
 {
-   return DSA_TAG_PROTO_KSZ;
+   return DSA_TAG_PROTO_KSZ9477;
 }
 
 static int ksz9477_phy_read16(struct dsa_switch *ds, int addr, int reg)
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8198efc..2b1a222 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -32,7 +32,7 @@ enum dsa_tag_protocol {
DSA_TAG_PROTO_BRCM_PREPEND,
DSA_TAG_PROTO_DSA,
DSA_TAG_PROTO_EDSA,
-   DSA_TAG_PROTO_KSZ,
+   DSA_TAG_PROTO_KSZ9477,
DSA_TAG_PROTO_LAN9303,
DSA_TAG_PROTO_MTK,
DSA_TAG_PROTO_QCA,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 03c3bdf..809b0e2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -32,6 +32,10 @@ config NET_DSA_TAG_EDSA
 config NET_DSA_TAG_KSZ
bool
 
+config NET_DSA_TAG_KSZ9477
+   bool
+   select NET_DSA_TAG_KSZ
+
 config NET_DSA_TAG_LAN9303
bool
 
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6a9d0f5..92056a7 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -53,8 +53,8 @@ static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff 
*skb,
 #ifdef CONFIG_NET_DSA_TAG_EDSA
[DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
 #endif
-#ifdef CONFIG_NET_DSA_TAG_KSZ
-   [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
+#ifdef CONFIG_NET_DSA_TAG_KSZ9477
+   [DSA_TAG_PROTO_KSZ9477] = &ksz9477_netdev_ops,
 #endif
 #ifdef CONFIG_NET_DSA_TAG_LAN9303
[DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops,
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7d03669..a2955a8 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -200,7 +200,7 @@ static inline struct dsa_port *dsa_slave_to_port(const 
struct net_device *dev)
 extern const struct dsa_device_ops edsa_netdev_ops;
 
 /* tag_ksz.c */
-extern const struct dsa_device_ops ksz_netdev_ops;
+extern const struct dsa_device_ops ksz9477_netdev_ops;
 
 /* tag_lan9303.c */
 extern const struct dsa_device_ops lan9303_netdev_ops;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 0f62eff..7343270 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -14,34 +14,21 @@
 #include 
 #include "dsa_priv.h"
 
-/* For Ingress (Host -> KSZ), 2 bytes are added before FCS.
- * ---
- * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
- * ---
- * tag0 : Prioritization (not used now)
- * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
- *
- * For Egress (KSZ -> Host), 1 byte is added before FCS.
- * ---
- * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|FCS(4bytes)
- * ---
- * tag0 : zero-based value represents port
- *   (eg, 0x00=port1, 0x02=port3, 0x06=port7)
- */
-
-#defineKSZ_INGRESS_TAG_LEN 2
-#defineKSZ_EGRESS_TAG_LEN  1
+/* Usually only one byte is used for tail tag. */
+#define KSZ_INGRESS_TAG_LEN1
+#define KSZ_EGRESS_TAG_LEN 1
 
-static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev,
+   int len,
+   void (*set_tag)(void *ptr, u8 *addr

[PATCH v3 net-next 0/1] net: dsa: microchip: Add Microchip KSZ8795 DSA driver

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

This patch requires the previous patches for Microchip KSZ9477 DSA driver.

v3
- For latest KSZ9477 v2 patch

v2
- No new feature is introduced in tag_ksz.c

v1
- Return error codes instead of numbers
- Add more comments to clarify operation
- Use ksz8795 prefix to indicate KSZ8795 specific code
- Simplify MIB counter reading code
- Switch driver code is not accessed from tag_ksz.c

Tristram Ha (1):
  Add Microchip KSZ8795 DSA driver.

 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8795.c | 1363 +++
 drivers/net/dsa/microchip/ksz8795_reg.h | 1016 +++
 drivers/net/dsa/microchip/ksz8795_spi.c |  166 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 include/net/dsa.h   |1 +
 net/dsa/Kconfig |4 +
 net/dsa/dsa.c   |3 +
 net/dsa/dsa_priv.h  |1 +
 net/dsa/tag_ksz.c   |   32 +
 11 files changed, 2606 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

-- 
1.9.1

[PATCH v1 net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Add Microchip KSZ8895 DSA driver.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8895.c | 1274 +++
 drivers/net/dsa/microchip/ksz8895_reg.h |  824 
 drivers/net/dsa/microchip/ksz8895_spi.c |  157 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 6 files changed, 2275 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8895.c
 create mode 100644 drivers/net/dsa/microchip/ksz8895_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8895_spi.c

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index cb95d3d..b854c4b 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -27,3 +27,20 @@ config MICROCHIP_KSZ8795_SPI_DRIVER
 
  It is required to use the KSZ8795 switch driver as the only access
  is through SPI.
+
+menuconfig MICROCHIP_KSZ8895
+   tristate "Microchip KSZ8895 series switch support"
+   depends on NET_DSA
+   select NET_DSA_TAG_KSZ8795
+   help
+ This driver adds support for Microchip KSZ8895 switch chips.
+
+config MICROCHIP_KSZ8895_SPI_DRIVER
+   tristate "KSZ8895 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ8895 && SPI
+   default y
+   help
+ This driver accesses KSZ8895 chip through SPI.
+
+ It is required to use the KSZ8895 switch driver as the only access
+ is through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 99a283e..8dd6312 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_MICROCHIP_KSZ9477) += ksz9477.o 
ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
 obj-$(CONFIG_MICROCHIP_KSZ8795)+= ksz8795.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ8795_SPI_DRIVER) += ksz8795_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ8895)+= ksz8895.o ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ8895_SPI_DRIVER) += ksz8895_spi.o
diff --git a/drivers/net/dsa/microchip/ksz8895.c 
b/drivers/net/dsa/microchip/ksz8895.c
new file mode 100644
index 000..d6d2ecc
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8895.c
@@ -0,0 +1,1274 @@
+/*
+ * Microchip KSZ8895 switch driver
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ * Tristram Ha 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz8895_reg.h"
+
+static const struct {
+   char string[ETH_GSTRING_LEN];
+} ksz8895_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { "rx" },
+   { "rx_hi" },
+   { "rx_undersize" },
+   { "rx_fragments" },
+   { "rx_oversize" },
+   { "rx_jabbers" },
+   { "rx_symbol_err" },
+   { "rx_crc_err" },
+   { "rx_align_err" },
+   { "rx_mac_ctrl" },
+   { "rx_pause" },
+   { "rx_bcast" },
+   { "rx_mcast" },
+   { "rx_ucast" },
+   { "rx_64_or_less" },
+   { "rx_65_127" },
+   { "rx_128_255" },
+   { "rx_256_511" },
+   { "rx_512_1023" },
+   { "rx_1024_1522" },
+   { "tx" },
+   { "tx_hi" },
+   { "tx_late_col" },
+   { "tx_pause" },
+   { "tx_bcast" },
+   { "tx_mcast" },
+   { "tx_ucast" },
+   { "tx_deferred" },
+   { "tx_total_col" },
+   { "tx_exc_col" },
+   { "tx_single_col" },
+   { "tx_mult_col" },
+   { "rx_discards" },
+   { "tx_discards" },
+};
+
+static int ksz8895_reset_switch(struct ksz_device *dev)
+{
+   /* reset switch */
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1,
+  SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+
+   return 0;
+}
+
+static void ksz8895_set_prio_queue(struct ksz_device *dev, int port, int queue)
+{
+   u8 hi;
+   u8 lo;
+
+   /* Number of queues can only be 1, 2, or 4. */
+   switch (queue) {
+   case 4:
+   queue = PORT_QUEUE_SPLIT_4;
+   break;
+   case 2:
+

[PATCH v3 net-next 1/1] net: dsa: microchip: Add Microchip KSZ8795 DSA driver

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Add Microchip KSZ8795 DSA driver.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8795.c | 1363 +++
 drivers/net/dsa/microchip/ksz8795_reg.h | 1016 +++
 drivers/net/dsa/microchip/ksz8795_spi.c |  166 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 include/net/dsa.h   |1 +
 net/dsa/Kconfig |4 +
 net/dsa/dsa.c   |3 +
 net/dsa/dsa_priv.h  |1 +
 net/dsa/tag_ksz.c   |   32 +
 11 files changed, 2606 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index ab8f9f6..cb95d3d 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -10,3 +10,20 @@ config MICROCHIP_KSZ9477_SPI_DRIVER
depends on MICROCHIP_KSZ9477 && SPI
help
  Select to enable support for registering switches configured through 
SPI.
+
+menuconfig MICROCHIP_KSZ8795
+   tristate "Microchip KSZ8795 series switch support"
+   depends on NET_DSA
+   select NET_DSA_TAG_KSZ8795
+   help
+ This driver adds support for Microchip KSZ8795 switch chips.
+
+config MICROCHIP_KSZ8795_SPI_DRIVER
+   tristate "KSZ8795 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ8795 && SPI
+   default y
+   help
+ This driver accesses KSZ8795 chip through SPI.
+
+ It is required to use the KSZ8795 switch driver as the only access
+ is through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 13dd8f0..99a283e 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz9477.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ8795)+= ksz8795.o ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ8795_SPI_DRIVER) += ksz8795_spi.o
diff --git a/drivers/net/dsa/microchip/ksz8795.c 
b/drivers/net/dsa/microchip/ksz8795.c
new file mode 100644
index 000..fc68034
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -0,0 +1,1363 @@
+/*
+ * Microchip KSZ8795 switch driver
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ * Tristram Ha 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz8795_reg.h"
+
+static const struct {
+   char string[ETH_GSTRING_LEN];
+} ksz8795_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { "rx_hi" },
+   { "rx_undersize" },
+   { "rx_fragments" },
+   { "rx_oversize" },
+   { "rx_jabbers" },
+   { "rx_symbol_err" },
+   { "rx_crc_err" },
+   { "rx_align_err" },
+   { "rx_mac_ctrl" },
+   { "rx_pause" },
+   { "rx_bcast" },
+   { "rx_mcast" },
+   { "rx_ucast" },
+   { "rx_64_or_less" },
+   { "rx_65_127" },
+   { "rx_128_255" },
+   { "rx_256_511" },
+   { "rx_512_1023" },
+   { "rx_1024_1522" },
+   { "rx_1523_2000" },
+   { "rx_2001" },
+   { "tx_hi" },
+   { "tx_late_col" },
+   { "tx_pause" },
+   { "tx_bcast" },
+   { "tx_mcast" },
+   { "tx_ucast" },
+   { "tx_deferred" },
+   { "tx_total_col" },
+   { "tx_exc_col" },
+   { "tx_single_col" },
+   { "tx_mult_col" },
+   { "rx_total" },
+   { "tx_total" },
+   { "rx_discards" },
+   { "tx_discards" },
+};
+
+static int ksz8795_reset_switch(struct ksz_device *dev)
+{
+   /* reset switch */
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1,
+  SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+
+   return 0;
+}
+
+static void ksz8795_set_prio_queue(struct ksz_device *dev, int port, int queue)
+{
+

[PATCH v2 net-next 3/8] net: dsa: microchip: Initialize mutex before use

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Initialize mutex before use.

Signed-off-by: Tristram Ha 
---
 drivers/net/dsa/microchip/ksz_common.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 435c463..e656615 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1102,7 +1102,6 @@ static int ksz_switch_init(struct ksz_device *dev)
 {
int i;
 
-   mutex_init(&dev->reg_mutex);
mutex_init(&dev->stats_mutex);
mutex_init(&dev->alu_mutex);
mutex_init(&dev->vlan_mutex);
@@ -1191,6 +1190,9 @@ int ksz_switch_register(struct ksz_device *dev)
if (dev->pdata)
dev->chip_id = dev->pdata->chip_id;
 
+   /* mutex is used in next function call. */
+   mutex_init(&dev->reg_mutex);
+
if (ksz_switch_detect(dev))
return -EINVAL;
 
-- 
1.9.1

[PATCH v2 net-next 1/8] net: dsa: microchip: Replace license with GPL

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Replace license with GPL.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz_9477_reg.h | 23 ---
 drivers/net/dsa/microchip/ksz_common.c   | 23 ---
 drivers/net/dsa/microchip/ksz_priv.h | 23 ---
 drivers/net/dsa/microchip/ksz_spi.c  | 23 ---
 4 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_9477_reg.h 
b/drivers/net/dsa/microchip/ksz_9477_reg.h
index 6aa6752..26a0e4b 100644
--- a/drivers/net/dsa/microchip/ksz_9477_reg.h
+++ b/drivers/net/dsa/microchip/ksz_9477_reg.h
@@ -1,19 +1,20 @@
 /*
  * Microchip KSZ9477 register definitions
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
  */
 
 #ifndef __KSZ9477_REGS_H
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 663b0d5..d662a9a 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1,19 +1,20 @@
 /*
  * Microchip switch driver main logic
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include 
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index 2a98dbd..d461468 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -1,19 +1,20 @@
 /*
  * Microchip KSZ series switch common definitions
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED

[PATCH v2 net-next 0/8] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

This series of patches is to modify the original KSZ9477 DSA driver so
that other KSZ switch drivers can be added and use the common code.

There are several steps to accomplish this.  The first is to
rename some functions with a prefix to indicate chip-specific
code.  The second is to move common code into a header that can be shared.
The last is to modify tag_ksz.c so that it can handle the many tail tag
formats used by different KSZ switch drivers.

ksz_common.c will contain the common code used by all KSZ switch drivers.
ksz9477.c will contain KSZ9477 code from the original ksz_common.c.
ksz9477_spi.c is renamed from ksz_spi.c.
ksz9477_reg.h is renamed from ksz_9477_reg.h.
ksz_common.h is added to provide common code access to KSZ switch
drivers.
ksz_spi.h is added to provide common SPI access functions to KSZ SPI
drivers.

v2
- Initialize reg_mutex before use
- The alu_mutex is only used inside chip specific functions

v1
- Each patch in the set is self-contained
- Use ksz9477 prefix to indicate KSZ9477 specific code

Tristram Ha (8):
  Replace license with GPL.
  Clean up code according to patch check suggestions.
  Initialize mutex before use.
  Rename some functions with ksz9477 prefix to separate chip specific
code from common code.
  Rename ksz_spi.c to ksz9477_spi.c and update Kconfig in preparation to
add more KSZ switch drivers.
  Break KSZ9477 DSA driver into two files in preparation to add more KSZ
switch drivers.  Add common functions in ksz_common.h so that other
KSZ switch drivers can access code in ksz_common.c.  Add ksz_spi.h
for common functions used by KSZ switch SPI drivers.
  Prepare PHY for proper advertisement and get link status for the port.
  Rename ksz_9477_reg.h to ksz9477_reg.h for consistency as the product
name is always KSZ.

 drivers/net/dsa/microchip/Kconfig  |   12 +-
 drivers/net/dsa/microchip/Makefile |4 +-
 drivers/net/dsa/microchip/ksz9477.c| 1331 
 .../microchip/{ksz_9477_reg.h => ksz9477_reg.h}|   23 +-
 drivers/net/dsa/microchip/ksz9477_spi.c|  188 +++
 drivers/net/dsa/microchip/ksz_common.c | 1176 +++--
 drivers/net/dsa/microchip/ksz_common.h |  229 
 drivers/net/dsa/microchip/ksz_priv.h   |  256 ++--
 drivers/net/dsa/microchip/ksz_spi.c|  216 
 drivers/net/dsa/microchip/ksz_spi.h|   82 ++
 10 files changed, 2122 insertions(+), 1395 deletions(-)
 create mode 100644 drivers/net/dsa/microchip/ksz9477.c
 rename drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} (98%)
 create mode 100644 drivers/net/dsa/microchip/ksz9477_spi.c
 create mode 100644 drivers/net/dsa/microchip/ksz_common.h
 delete mode 100644 drivers/net/dsa/microchip/ksz_spi.c
 create mode 100644 drivers/net/dsa/microchip/ksz_spi.h

-- 
1.9.1

[PATCH v3 net-next] net: dsa: microchip: Add MIB counter reading support

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Add MIB counter reading support.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
v3
- Use new timer_setup API

v2
- Only MIB counter related code in patch

v1
- Simplify MIB counter reading code

 drivers/net/dsa/microchip/ksz9477.c| 121 ++---
 drivers/net/dsa/microchip/ksz_common.c | 100 +++
 drivers/net/dsa/microchip/ksz_common.h |   2 +
 drivers/net/dsa/microchip/ksz_priv.h   |   7 +-
 4 files changed, 185 insertions(+), 45 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 801117a..8bae36b1 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -271,6 +271,76 @@ static int ksz9477_reset_switch(struct ksz_device *dev)
return 0;
 }
 
+static void ksz9477_r_mib_cnt(struct ksz_device *dev, int port, u16 addr,
+ u64 *cnt)
+{
+   u32 data;
+   int timeout;
+   struct ksz_port *p = &dev->ports[port];
+
+   /* retain the flush/freeze bit */
+   data = p->freeze ? MIB_COUNTER_FLUSH_FREEZE : 0;
+   data |= MIB_COUNTER_READ;
+   data |= (addr << MIB_COUNTER_INDEX_S);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, data);
+
+   timeout = 1000;
+   do {
+   ksz_pread32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
+   &data);
+   usleep_range(1, 10);
+   if (!(data & MIB_COUNTER_READ))
+   break;
+   } while (timeout-- > 0);
+
+   /* failed to read MIB. get out of loop */
+   if (!timeout) {
+   dev_dbg(dev->dev, "Failed to get MIB\n");
+   return;
+   }
+
+   /* count resets upon read */
+   ksz_pread32(dev, port, REG_PORT_MIB_DATA, &data);
+   *cnt += data;
+}
+
+static void ksz9477_r_mib_pkt(struct ksz_device *dev, int port, u16 addr,
+ u64 *dropped, u64 *cnt)
+{
+   addr = ksz9477_mib_names[addr].index;
+   ksz9477_r_mib_cnt(dev, port, addr, cnt);
+}
+
+static void ksz9477_freeze_mib(struct ksz_device *dev, int port, bool freeze)
+{
+   struct ksz_port *p = &dev->ports[port];
+   u32 val = freeze ? MIB_COUNTER_FLUSH_FREEZE : 0;
+
+   /* enable/disable the port for flush/freeze function */
+   mutex_lock(&p->mib.cnt_mutex);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, val);
+
+   /* used by MIB counter reading code to know freeze is enabled */
+   p->freeze = freeze;
+   mutex_unlock(&p->mib.cnt_mutex);
+}
+
+static void ksz9477_port_init_cnt(struct ksz_device *dev, int port)
+{
+   struct ksz_port_mib *mib = &dev->ports[port].mib;
+
+   /* flush all enabled port MIB counters */
+   mutex_lock(&mib->cnt_mutex);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
+MIB_COUNTER_FLUSH_FREEZE);
+   ksz_write8(dev, REG_SW_MAC_CTRL_6, SW_MIB_COUNTER_FLUSH);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, 0);
+   mutex_unlock(&mib->cnt_mutex);
+
+   mib->cnt_ptr = 0;
+   memset(mib->counters, 0, dev->mib_cnt * sizeof(u64));
+}
+
 static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
  int port)
 {
@@ -350,47 +420,6 @@ static void ksz9477_get_strings(struct dsa_switch *ds, int 
port, uint8_t *buf)
}
 }
 
-static void ksz_get_ethtool_stats(struct dsa_switch *ds, int port,
- uint64_t *buf)
-{
-   struct ksz_device *dev = ds->priv;
-   int i;
-   u32 data;
-   int timeout;
-
-   mutex_lock(&dev->stats_mutex);
-
-   for (i = 0; i < TOTAL_SWITCH_COUNTER_NUM; i++) {
-   data = MIB_COUNTER_READ;
-   data |= ((ksz9477_mib_names[i].index & 0xFF) <<
-   MIB_COUNTER_INDEX_S);
-   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, data);
-
-   timeout = 1000;
-   do {
-   ksz_pread32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
-   &data);
-   usleep_range(1, 10);
-   if (!(data & MIB_COUNTER_READ))
-   break;
-   } while (timeout-- > 0);
-
-   /* failed to read MIB. get out of loop */
-   if (!timeout) {
-   dev_dbg(dev->dev, "Failed to get MIB\n");
-   break;
-   }
-
-   /* count resets upon read */
-   ksz_pread32(dev, port, REG_PORT_MIB_DATA, &data);
-
-   dev->mib_value[i] += (uint64_t)data;
-   buf[i] = dev->mib_value[i];
-   }
-
-   mutex_unlock(&dev->stats_mutex);
-}
-
 static void ksz9477_cfg_port_member(struct ksz_device *dev, int port,
u8 member)
 {
@@ -1159,9 +1188,14 @@ static int ksz9477_setup

[PATCH v2 net-next 6/8] net: dsa: microchip: Break KSZ9477 DSA driver into two files

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Break KSZ9477 DSA driver into two files in preparation to add more KSZ
switch drivers.
Add common functions in ksz_common.h so that other KSZ switch drivers
can access code in ksz_common.c.
Add ksz_spi.h for common functions used by KSZ switch SPI drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/Makefile  |2 +-
 drivers/net/dsa/microchip/ksz9477.c | 1318 +++
 drivers/net/dsa/microchip/ksz9477_spi.c |  143 ++--
 drivers/net/dsa/microchip/ksz_common.c  | 1139 +++---
 drivers/net/dsa/microchip/ksz_common.h  |  227 ++
 drivers/net/dsa/microchip/ksz_priv.h|  229 +++---
 drivers/net/dsa/microchip/ksz_spi.h |   82 ++
 7 files changed, 1911 insertions(+), 1229 deletions(-)
 create mode 100644 drivers/net/dsa/microchip/ksz9477.c
 create mode 100644 drivers/net/dsa/microchip/ksz_common.h
 create mode 100644 drivers/net/dsa/microchip/ksz_spi.h

diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 5b6325b..13dd8f0 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz9477.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
new file mode 100644
index 000..2c407bd
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -0,0 +1,1318 @@
+/*
+ * Microchip KSZ9477 switch driver main logic
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz_9477_reg.h"
+
+static const struct {
+   int index;
+   char string[ETH_GSTRING_LEN];
+} ksz9477_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { 0x00, "rx_hi" },
+   { 0x01, "rx_undersize" },
+   { 0x02, "rx_fragments" },
+   { 0x03, "rx_oversize" },
+   { 0x04, "rx_jabbers" },
+   { 0x05, "rx_symbol_err" },
+   { 0x06, "rx_crc_err" },
+   { 0x07, "rx_align_err" },
+   { 0x08, "rx_mac_ctrl" },
+   { 0x09, "rx_pause" },
+   { 0x0A, "rx_bcast" },
+   { 0x0B, "rx_mcast" },
+   { 0x0C, "rx_ucast" },
+   { 0x0D, "rx_64_or_less" },
+   { 0x0E, "rx_65_127" },
+   { 0x0F, "rx_128_255" },
+   { 0x10, "rx_256_511" },
+   { 0x11, "rx_512_1023" },
+   { 0x12, "rx_1024_1522" },
+   { 0x13, "rx_1523_2000" },
+   { 0x14, "rx_2001" },
+   { 0x15, "tx_hi" },
+   { 0x16, "tx_late_col" },
+   { 0x17, "tx_pause" },
+   { 0x18, "tx_bcast" },
+   { 0x19, "tx_mcast" },
+   { 0x1A, "tx_ucast" },
+   { 0x1B, "tx_deferred" },
+   { 0x1C, "tx_total_col" },
+   { 0x1D, "tx_exc_col" },
+   { 0x1E, "tx_single_col" },
+   { 0x1F, "tx_mult_col" },
+   { 0x80, "rx_total" },
+   { 0x81, "tx_total" },
+   { 0x82, "rx_discards" },
+   { 0x83, "tx_discards" },
+};
+
+static void ksz9477_cfg32(struct ksz_device *dev, u32 addr, u32 bits, bool set)
+{
+   u32 data;
+
+   ksz_read32(dev, addr, &data);
+   if (set)
+   data |= bits;
+   else
+   data &= ~bits;
+   ksz_write32(dev, addr, data);
+}
+
+static void ksz9477_port_cfg32(struct ksz_device *dev, int port, int offset,
+  u32 bits, bool set)
+{
+   u32 addr;
+   u32 data;
+
+   addr = PORT_CTRL_ADDR(port, offset);
+   ksz_read32(dev, addr, &data);
+
+   if (set)
+   data |= bits;
+   else
+   data &= ~bits;
+
+   ksz_write32(dev, addr, data);
+}
+
+static int ksz9477_wait_vlan_ctrl_ready(struct ksz_device *dev, u32 waiton,
+   int timeout)
+{
+   u8 data;
+
+   do {
+   ksz_read8(dev, REG_SW_VLAN_CTRL, &data);
+   if (!(data & waiton))
+   break;
+   usleep_range(1, 10);
+   } while (timeout-- > 0);
+
+   if (timeout <

[PATCH v2 net-next 4/8] net: dsa: microchip: Rename some functions with ksz9477 prefix

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Rename some functions with ksz9477 prefix to separate chip specific code
from common code.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz_common.c | 114 +
 1 file changed, 58 insertions(+), 56 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index e656615..009ee5f 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -265,9 +265,8 @@ static int wait_alu_sta_ready(struct ksz_device *dev, u32 
waiton, int timeout)
return 0;
 }
 
-static int ksz_reset_switch(struct dsa_switch *ds)
+static int ksz9477_reset_switch(struct ksz_device *dev)
 {
-   struct ksz_device *dev = ds->priv;
u8 data8;
u16 data16;
u32 data32;
@@ -300,7 +299,7 @@ static int ksz_reset_switch(struct dsa_switch *ds)
return 0;
 }
 
-static void port_setup(struct ksz_device *dev, int port, bool cpu_port)
+static void ksz9477_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 {
u8 data8;
u16 data16;
@@ -346,7 +345,7 @@ static void port_setup(struct ksz_device *dev, int port, 
bool cpu_port)
ksz_pread16(dev, port, REG_PORT_PHY_INT_ENABLE, &data16);
 }
 
-static void ksz_config_cpu_port(struct dsa_switch *ds)
+static void ksz9477_config_cpu_port(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
int i;
@@ -358,12 +357,12 @@ static void ksz_config_cpu_port(struct dsa_switch *ds)
dev->cpu_port = i;
 
/* enable cpu port */
-   port_setup(dev, i, true);
+   ksz9477_port_setup(dev, i, true);
}
}
 }
 
-static int ksz_setup(struct dsa_switch *ds)
+static int ksz9477_setup(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
int ret = 0;
@@ -373,7 +372,7 @@ static int ksz_setup(struct dsa_switch *ds)
if (!dev->vlan_cache)
return -ENOMEM;
 
-   ret = ksz_reset_switch(ds);
+   ret = ksz9477_reset_switch(dev);
if (ret) {
dev_err(ds->dev, "failed to reset switch\n");
return ret;
@@ -382,7 +381,7 @@ static int ksz_setup(struct dsa_switch *ds)
/* accept packet up to 2000bytes */
ksz_cfg(dev, REG_SW_MAC_CTRL_1, SW_LEGAL_PACKET_DISABLE, true);
 
-   ksz_config_cpu_port(ds);
+   ksz9477_config_cpu_port(ds);
 
ksz_cfg(dev, REG_SW_MAC_CTRL_1, MULTICAST_STORM_DISABLE, true);
 
@@ -395,13 +394,13 @@ static int ksz_setup(struct dsa_switch *ds)
return 0;
 }
 
-static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
- int port)
+static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
+ int port)
 {
return DSA_TAG_PROTO_KSZ;
 }
 
-static int ksz_phy_read16(struct dsa_switch *ds, int addr, int reg)
+static int ksz9477_phy_read16(struct dsa_switch *ds, int addr, int reg)
 {
struct ksz_device *dev = ds->priv;
u16 val = 0;
@@ -411,7 +410,8 @@ static int ksz_phy_read16(struct dsa_switch *ds, int addr, 
int reg)
return val;
 }
 
-static int ksz_phy_write16(struct dsa_switch *ds, int addr, int reg, u16 val)
+static int ksz9477_phy_write16(struct dsa_switch *ds, int addr, int reg,
+  u16 val)
 {
struct ksz_device *dev = ds->priv;
 
@@ -426,7 +426,7 @@ static int ksz_enable_port(struct dsa_switch *ds, int port,
struct ksz_device *dev = ds->priv;
 
/* setup slave port */
-   port_setup(dev, port, false);
+   ksz9477_port_setup(dev, port, false);
 
return 0;
 }
@@ -445,7 +445,7 @@ static int ksz_sset_count(struct dsa_switch *ds)
return TOTAL_SWITCH_COUNTER_NUM;
 }
 
-static void ksz_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
+static void ksz9477_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
 {
int i;
 
@@ -495,7 +495,8 @@ static void ksz_get_ethtool_stats(struct dsa_switch *ds, 
int port,
mutex_unlock(&dev->stats_mutex);
 }
 
-static void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
+static void ksz9477_port_stp_state_set(struct dsa_switch *ds, int port,
+  u8 state)
 {
struct ksz_device *dev = ds->priv;
u8 data;
@@ -540,7 +541,8 @@ static void ksz_port_fast_age(struct dsa_switch *ds, int 
port)
ksz_write8(dev, REG_SW_LUE_CTRL_1, data8);
 }
 
-static int ksz_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag)
+static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port,
+  bool flag)
 {
struct ksz_device *dev = ds->priv;
 
@@ -567

[PATCH v2 net-next 8/8] net: dsa: microchip: Rename ksz_9477_reg.h to ksz9477_reg.h

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Rename ksz_9477_reg.h to ksz9477_reg.h for consistency as the product
name is always KSZ.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz9477.c | 2 +-
 drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} | 0
 drivers/net/dsa/microchip/ksz_priv.h| 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} (100%)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 537342a..801117a 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -31,7 +31,7 @@
 
 #include "ksz_priv.h"
 #include "ksz_common.h"
-#include "ksz_9477_reg.h"
+#include "ksz9477_reg.h"
 
 static const struct {
int index;
diff --git a/drivers/net/dsa/microchip/ksz_9477_reg.h 
b/drivers/net/dsa/microchip/ksz9477_reg.h
similarity index 100%
rename from drivers/net/dsa/microchip/ksz_9477_reg.h
rename to drivers/net/dsa/microchip/ksz9477_reg.h
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index d92a7c1..bfe9066 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -26,7 +26,7 @@
 #include 
 #include 
 
-#include "ksz_9477_reg.h"
+#include "ksz9477_reg.h"
 
 struct ksz_io_ops;
 
-- 
1.9.1

[PATCH v2 net-next 2/8] net: dsa: microchip: Clean up code according to patch check suggestions

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Clean up code according to patch check suggestions.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz_common.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index d662a9a..435c463 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -895,9 +895,9 @@ static void ksz_port_mdb_add(struct dsa_switch *ds, int 
port,
 
if (static_table[0] & ALU_V_STATIC_VALID) {
/* check this has same vid & mac address */
-   if (((static_table[2] >> ALU_V_FID_S) == (mdb->vid)) &&
+   if (((static_table[2] >> ALU_V_FID_S) == mdb->vid) &&
((static_table[2] & ALU_V_MAC_ADDR_HI) == mac_hi) &&
-   (static_table[3] == mac_lo)) {
+   static_table[3] == mac_lo) {
/* found matching one */
break;
}
@@ -968,9 +968,9 @@ static int ksz_port_mdb_del(struct dsa_switch *ds, int port,
if (static_table[0] & ALU_V_STATIC_VALID) {
/* check this has same vid & mac address */
 
-   if (((static_table[2] >> ALU_V_FID_S) == (mdb->vid)) &&
+   if (((static_table[2] >> ALU_V_FID_S) == mdb->vid) &&
((static_table[2] & ALU_V_MAC_ADDR_HI) == mac_hi) &&
-   (static_table[3] == mac_lo)) {
+   static_table[3] == mac_lo) {
/* found matching one */
break;
}
-- 
1.9.1

[PATCH v2 net-next 7/8] net: dsa: microchip: Prepare PHY for proper advertisement

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Prepare PHY for proper advertisement and get link status for the port.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/ksz9477.c| 13 +
 drivers/net/dsa/microchip/ksz_common.c | 20 
 drivers/net/dsa/microchip/ksz_common.h |  2 ++
 drivers/net/dsa/microchip/ksz_priv.h   |  2 ++
 4 files changed, 37 insertions(+)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 2c407bd..537342a 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -978,6 +978,17 @@ static void ksz9477_port_mirror_del(struct dsa_switch *ds, 
int port,
 PORT_MIRROR_SNIFFER, false);
 }
 
+static void ksz9477_phy_setup(struct ksz_device *dev, int port,
+ struct phy_device *phy)
+{
+   if (port < dev->phy_port_cnt) {
+   /* SUPPORTED_Asym_Pause and SUPPORTED_Pause can be removed to
+* disable flow control when rate limiting is used.
+*/
+   phy->advertising = phy->supported;
+   }
+}
+
 static void ksz9477_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 {
u8 data8;
@@ -1159,6 +1170,7 @@ static int ksz9477_setup(struct dsa_switch *ds)
.setup  = ksz9477_setup,
.phy_read   = ksz9477_phy_read16,
.phy_write  = ksz9477_phy_write16,
+   .adjust_link= ksz_adjust_link,
.port_enable= ksz_enable_port,
.port_disable   = ksz_disable_port,
.get_strings= ksz9477_get_strings,
@@ -1300,6 +1312,7 @@ static void ksz9477_switch_exit(struct ksz_device *dev)
.get_port_addr = ksz9477_get_port_addr,
.cfg_port_member = ksz9477_cfg_port_member,
.flush_dyn_mac_table = ksz9477_flush_dyn_mac_table,
+   .phy_setup = ksz9477_phy_setup,
.port_setup = ksz9477_port_setup,
.shutdown = ksz9477_reset_switch,
.detect = ksz9477_switch_detect,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index b535544..624abdd 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -69,6 +69,25 @@ int ksz_phy_write16(struct dsa_switch *ds, int addr, int 
reg, u16 val)
return 0;
 }
 
+void ksz_adjust_link(struct dsa_switch *ds, int port,
+struct phy_device *phydev)
+{
+   struct ksz_device *dev = ds->priv;
+   struct ksz_port *p = &dev->ports[port];
+
+   if (phydev->link) {
+   p->speed = phydev->speed;
+   p->duplex = phydev->duplex;
+   p->flow_ctrl = phydev->pause;
+   p->link_up = 1;
+   dev->live_ports |= (1 << port) & dev->on_ports;
+   } else if (p->link_up) {
+   p->link_up = 0;
+   p->link_down = 1;
+   dev->live_ports &= ~(1 << port);
+   }
+}
+
 int ksz_sset_count(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
@@ -235,6 +254,7 @@ int ksz_enable_port(struct dsa_switch *ds, int port, struct 
phy_device *phy)
 
/* setup slave port */
dev->dev_ops->port_setup(dev, port, false);
+   dev->dev_ops->phy_setup(dev, port, phy);
 
/* port_stp_state_set() will be called after to enable the port so
 * there is no need to do anything.
diff --git a/drivers/net/dsa/microchip/ksz_common.h 
b/drivers/net/dsa/microchip/ksz_common.h
index dd2a468..599d4e5 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -26,6 +26,8 @@
 
 int ksz_phy_read16(struct dsa_switch *ds, int addr, int reg);
 int ksz_phy_write16(struct dsa_switch *ds, int addr, int reg, u16 val);
+void ksz_adjust_link(struct dsa_switch *ds, int port,
+struct phy_device *phydev);
 int ksz_sset_count(struct dsa_switch *ds);
 int ksz_port_bridge_join(struct dsa_switch *ds, int port,
 struct net_device *br);
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index 4126749..d92a7c1 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -150,6 +150,8 @@ struct ksz_dev_ops {
u32 (*get_port_addr)(int port, int offset);
void (*cfg_port_member)(struct ksz_device *dev, int port, u8 member);
void (*flush_dyn_mac_table)(struct ksz_device *dev, int port);
+   void (*phy_setup)(struct ksz_device *dev, int port,
+ struct phy_device *phy);
void (*port_setup)(struct ksz_device *dev, int port, bool cpu_port);
void (*r_phy)(struct ksz_device *dev, u16 phy, u16 reg, u16 *val);
void (*w_phy)(struct ksz_device *dev, u16 phy, u16 reg, u16 val);
-- 
1.9.1

[PATCH v2 net-next 5/8] net: dsa: microchip: Rename ksz_spi.c to ksz9477_spi.c

2017-12-05 Thread Tristram.Ha

From: Tristram Ha 

Rename ksz_spi.c to ksz9477_spi.c and update Kconfig in preparation to add
more KSZ switch drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/Kconfig  | 12 ++--
 drivers/net/dsa/microchip/Makefile |  4 ++--
 drivers/net/dsa/microchip/{ksz_spi.c => ksz9477_spi.c} |  0
 3 files changed, 8 insertions(+), 8 deletions(-)
 rename drivers/net/dsa/microchip/{ksz_spi.c => ksz9477_spi.c} (100%)

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index a8b8f59..5a8660d 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -1,12 +1,12 @@
-menuconfig MICROCHIP_KSZ
-   tristate "Microchip KSZ series switch support"
+menuconfig MICROCHIP_KSZ9477
+   tristate "Microchip KSZ9477 series switch support"
depends on NET_DSA
select NET_DSA_TAG_KSZ
help
- This driver adds support for Microchip KSZ switch chips.
+ This driver adds support for Microchip KSZ9477 switch chips.
 
-config MICROCHIP_KSZ_SPI_DRIVER
-   tristate "KSZ series SPI connected switch driver"
-   depends on MICROCHIP_KSZ && SPI
+config MICROCHIP_KSZ9477_SPI_DRIVER
+   tristate "KSZ9477 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ9477 && SPI
help
  Select to enable support for registering switches configured through 
SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index ed335e2..5b6325b 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_MICROCHIP_KSZ)+= ksz_common.o
-obj-$(CONFIG_MICROCHIP_KSZ_SPI_DRIVER) += ksz_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
diff --git a/drivers/net/dsa/microchip/ksz_spi.c 
b/drivers/net/dsa/microchip/ksz9477_spi.c
similarity index 100%
rename from drivers/net/dsa/microchip/ksz_spi.c
rename to drivers/net/dsa/microchip/ksz9477_spi.c
-- 
1.9.1

Re: [PATCH iproute2] ip route: broken logic when using default word and family not specified

2017-12-05 Thread Stephen Hemminger

On Sat, 18 Nov 2017 17:56:32 +0100
Alexander Zubkov  wrote:

> I also opened earlier a ticket in bugzilla:
> https://bugzilla.kernel.org/show_bug.cgi?id=197899
> And Stephen Hemminger had couple of comments there which I want to argue:
> 
> > $ ip route list default
> > Means list all routes in any address family (ie same as any)
> > but
> >
> > $ ip route list 0/0
> > Means list all routes for IPv4 default route.  
> 
> This is not correct, because the first command does not show routes in any
> address family. Currently it does so only when table 0 is specified; otherwise
> only IPv4 routes are shown. Here is the code from iproute.c:
> 
> if (do_ipv6 == AF_UNSPEC && filter.tb)
> do_ipv6 = AF_INET;
> 
> > It probably is worth a man page warning, but changing semantics
> > that have existed for many years is more likely to break some existing 
> > user.  
> 
> Yes, backward compatibility is a reason. But as I remember, those
> semantics have already changed earlier. Probably it was showing IPv4
> and IPv6 routes together without family specified - I do not remember
> exactly. And I have doubts that such a feature could be relied
> upon.
> As an end user, I would prefer to make the behaviour more consistent and
> without such exceptions. But of course there may be other opinions.

As a conservative maintainer, my preference is always to receive acknowledgments
from others before accepting something that may break existing users.

Backward compatibility is more important than the surprising result you 
discovered.
A confused new user is less of an issue than breaking some existing user
who may not even have contact back to network developers.

Since it has been that way for many years, waiting a couple of weeks for review
is not going to hurt anything.

[PATCH iproute2] iplink: allow configuring GSO max values

2017-12-05 Thread Solio Sarabia

From: Stephen Hemminger 

This allows sending GSO maximum values when configuring a device.
The values are advisory. Most devices will ignore them but for some
pseudo devices such as veth pairs they can be set.

Example:
# ip link add dev vm1 type veth peer name vm2 gso_max_size 32768

Signed-off-by: Stephen Hemminger 
Signed-off-by: Solio Sarabia 
---
Rebases original patch against linux-next. Also fixes minor space in
ip-link.8.in, checks gso_max_size range, and fixes some style warnings.

 ip/iplink.c   | 23 ++-
 man/man8/ip-link.8.in | 13 +
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index 0a8eb56..94efa1f 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -97,7 +97,8 @@ void iplink_usage(void)
" [ master DEVICE ][ vrf NAME ]\n"
" [ nomaster ]\n"
" [ addrgenmode { eui64 | none | 
stable_secret | random } ]\n"
-   " [ protodown { on | off } ]\n"
+   " [ protodown { on | off } ]\n"
+   " [ gso_max_size BYTES ] | [ 
gso_max_segs PACKETS ]\n"
"\n"
"   ip link show [ DEVICE | group GROUP ] [up] [master DEV] 
[vrf NAME] [type TYPE]\n");
 
@@ -848,6 +849,26 @@ int iplink_parse(int argc, char **argv, struct iplink_req 
*req,
return on_off("protodown", *argv);
addattr8(&req->n, sizeof(*req), IFLA_PROTO_DOWN,
 proto_down);
+   } else if (strcmp(*argv, "gso_max_size") == 0) {
+   unsigned int max_size;
+
+   NEXT_ARG();
+   if (get_unsigned(&max_size, *argv, 0) ||
+   max_size > UINT16_MAX + 1)
+   invarg("Invalid \"gso_max_size\" value\n",
+   *argv);
+   addattr32(&req->n, sizeof(*req), IFLA_GSO_MAX_SIZE,
+   max_size);
+   } else if (strcmp(*argv, "gso_max_segs") == 0) {
+   unsigned int max_segs;
+
+   NEXT_ARG();
+   if (get_unsigned(&max_segs, *argv, 0) ||
+   max_segs > UINT16_MAX)
+   invarg("Invalid \"gso_max_segs\" value\n",
+   *argv);
+   addattr32(&req->n, sizeof(*req), IFLA_GSO_MAX_SEGS,
+   max_segs);
} else {
if (matches(*argv, "help") == 0)
usage();
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index a6a10e5..40f09b3 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -36,6 +36,11 @@ ip-link \- network device configuration
 .RB "[ " numrxqueues
 .IR QUEUE_COUNT " ]"
 .br
+.BR "[ " gso_max_size
+.IR BYTES " ]"
+.RB "[ " gso_max_segs
+.IR SEGMENTS " ]"
+.br
 .BI type " TYPE"
 .RI "[ " ARGS " ]"
 
@@ -343,6 +348,14 @@ specifies the number of transmit queues for new device.
 specifies the number of receive queues for new device.
 
 .TP
+.BI gso_max_size " BYTES "
+specifies the recommended maximum size of a Generic Segmentation Offload packet the 
new device should accept.
+
+.TP
+.BI gso_max_segs " SEGMENTS "
+specifies the recommended maximum number of Generic Segmentation Offload segments 
the new device should accept.
+
+.TP
 .BI index " IDX "
 specifies the desired index of the new virtual device. The link creation 
fails, if the index is busy.
 
-- 
2.7.4

Re: [PATCH v2 iproute2 net-next] gre6: add collect metadata support

2017-12-05 Thread Daniel Borkmann

On 12/06/2017 02:07 AM, Stephen Hemminger wrote:
> On Tue,  5 Dec 2017 15:10:37 -0800
> William Tu  wrote:
> 
>> diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
>> index a6a10e577b1f..eb04f887c940 100644
>> --- a/man/man8/ip-link.8.in
>> +++ b/man/man8/ip-link.8.in
>> @@ -755,6 +755,8 @@ the following additional arguments are supported:
>>  .BI "dscp inherit"
>>  ] [
>>  .BI dev " PHYS_DEV "
>> +] [
>> +.RB external
>>  ]
>>  
>>  .in +8
>> @@ -833,6 +835,10 @@ or
>>  .IR 00 ".." ff
>>  when tunneling non-IP packets. The default value is 00.
>>  
>> +.sp
>> +.RB external
>> +- make this tunnel externally controlled (or not, which is the default).
>> +
>>  .in -8
> 
> I don't have any  direct involvement in offload, so would like some feedback
> from others that are.
> 
> Not a big fan of opaque "metadata" what exactly does it mean?
> Also "external" is already used to mean something else on other parts of
> the link command. Also the option, and the value in JSON should be the same.

The keyword "external" to set the device into collect metadata mode
is already used heavily throughout iproute2, e.g. vxlan, geneve, ipip,
and many other device types. Special casing gre6 to not having it set
up in such way through "external" would seem quite confusing.

> Please reconsider the naming and resubmit
> 
> The wording in the man page here could be better
>

Re: [PATCH V11 4/5] vsprintf: add printk specifier %px

2017-12-05 Thread Sergey Senozhatsky

Hello,

On (12/05/17 13:22), Linus Torvalds wrote:
[..]
> It's not like those hex numbers were really helping people anyway.
> We've turned off most of them on x86 oops reports long ago (and
> entirely independently of the pointer hashing). Having stared at a lot
> of oopses in my time, the only hex numbers that tend to be really
> relevant are (a) the register contents (which aren't %p anyway), and
> things like the faulting address (which is not, and never has been, %p
> on x86, but might be on some other architecture).

I see some %p-s being used in _supposedly_ important output,
like arch/x86/mm/fault.c

show_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
...
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);


a quick %p grep gives me the following list:

arch/arm/mm/fault.c:pr_alert("pgd = %p\n", mm->pgd);
arch/arm64/mm/fault.c:  pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = 
%p\n",
arch/arm64/mm/fault.c:  pr_info_ratelimited("%s[%d]: %s exception: 
pc=%p sp=%p\n",
arch/m68k/mm/fault.c:   pr_debug("send_fault_sig: %p,%d,%d\n", siginfo.si_addr,
arch/m68k/mm/fault.c:   pr_cont(" at virtual address %p\n", 
siginfo.si_addr);
arch/m68k/mm/fault.c:   pr_debug("do page fault:\nregs->sr=%#x, regs->pc=%#lx, 
address=%#lx, %ld, %p\n",
arch/microblaze/mm/fault.c: pr_emerg("Page fault in user mode with 
faulthandler_disabled(), mm = %p\n",
arch/mn10300/mm/fault.c:printk(KERN_DEBUG "pgd entry %p: %016Lx\n",
arch/mn10300/mm/fault.c:printk(KERN_DEBUG "pmd entry %p: %016Lx\n",
arch/mn10300/mm/fault.c:printk(KERN_DEBUG "pte entry %p: %016Lx\n",
arch/mn10300/mm/fault.c:printk(KERN_DEBUG "--- 
do_page_fault(%p,%s:%04lx,%08lx)\n",
arch/powerpc/mm/fault.c:   " mm=%p\n",
arch/sh/mm/fault.c: printk(KERN_ALERT "pgd = %p\n", pgd);
arch/unicore32/mm/fault.c:  printk(KERN_ALERT "pgd = %p\n", mm->pgd);
arch/x86/mm/fault.c:printk(KERN_CONT " at %p\n", (void *) address);
arch/x86/mm/fault.c:printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
arch/x86/mm/fault.c:printk("%s%s[%d]: segfault at %lx ip %p sp %p error 
%lx",


or is it OK to show hashes instead of pgd or pmd pointers?

-ss

Re: [PATCH iproute2] iproute2: Fix undeclared __kernel_long_t type build error in RHEL 6.8

2017-12-05 Thread Stephen Hemminger

On Fri,  1 Dec 2017 13:04:51 +0200
Leon Romanovsky  wrote:

> From: Leon Romanovsky 
> 
> Add asm/posix_types.h header file to the list of needed includes,
> because the header files in RHEL 6.8 are too old and don't
> have a declaration of __kernel_long_t.
> 
> In file included from ../include/uapi/linux/kernel.h:5,
>  from ../include/uapi/linux/netfilter/x_tables.h:4,
>  from ../include/xtables.h:20,
>  from em_ipset.c:26:
> ../include/uapi/linux/sysinfo.h:9: error: expected specifier-qualifier-list 
> before ‘__kernel_long_t’
> 
> Cc: Riad Abo Raed 
> Cc: Guy Ergas 
> Signed-off-by: Leon Romanovsky 
> ---
> Stephen,
> I don't know how to properly solve this type of error and would like to
> hear your guidance on it.
> 
> Should I simply add kernel file? Or maybe I need to add HAVE_xxx checks
> to configure script to check __kernel_long_t existence and declare only
> this type?
> 
> I also have another build error on RHEL 6.8 system and looking for a
> solution.
> 
> In file included from em_ipset.c:26:
> ../include/xtables.h:35:29: error: xtables-version.h: No such file or 
> directory
> make[1]: *** [em_ipset.o] Error 1
> 
> The iptables-devel is iptables-devel-1.4.7-16.el6.x86_64 so check_xt()
> succeeds, but RH headers don't have xtables-version.h and the relevant defines
> are embedded in the main xtables.h header file.
> 
> Thanks
> ---
>  include/uapi/asm/posix_types.h | 97 
> +++
>  1 file changed, 97 insertions(+)
>  create mode 100644 include/uapi/asm/posix_types.h
> 
> diff --git a/include/uapi/asm/posix_types.h b/include/uapi/asm/posix_types.h
> new file mode 100644
> index ..5e6ea22b
> --- /dev/null
> +++ b/include/uapi/asm/posix_types.h
> @@ -0,0 +1,97 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef __ASM_GENERIC_POSIX_TYPES_H
> +#define __ASM_GENERIC_POSIX_TYPES_H
> +
> +#include 
> +/*
> + * This file is generally used by user-level software, so you need to
> + * be a little careful about namespace pollution etc.
> + *
> + * First the types that are often defined in different ways across
> + * architectures, so that you can override them.
> + */
> +
> +#ifndef __kernel_long_t
> +typedef long __kernel_long_t;
> +typedef unsigned long__kernel_ulong_t;
> +#endif
> +
> +#ifndef __kernel_ino_t
> +typedef __kernel_ulong_t __kernel_ino_t;
> +#endif
> +
> +#ifndef __kernel_mode_t
> +typedef unsigned int __kernel_mode_t;
> +#endif
> +
> +#ifndef __kernel_pid_t
> +typedef int  __kernel_pid_t;
> +#endif
> +
> +#ifndef __kernel_ipc_pid_t
> +typedef int  __kernel_ipc_pid_t;
> +#endif
> +
> +#ifndef __kernel_uid_t
> +typedef unsigned int __kernel_uid_t;
> +typedef unsigned int __kernel_gid_t;
> +#endif
> +
> +#ifndef __kernel_suseconds_t
> +typedef __kernel_long_t  __kernel_suseconds_t;
> +#endif
> +
> +#ifndef __kernel_daddr_t
> +typedef int  __kernel_daddr_t;
> +#endif
> +
> +#ifndef __kernel_uid32_t
> +typedef unsigned int __kernel_uid32_t;
> +typedef unsigned int __kernel_gid32_t;
> +#endif
> +
> +#ifndef __kernel_old_uid_t
> +typedef __kernel_uid_t   __kernel_old_uid_t;
> +typedef __kernel_gid_t   __kernel_old_gid_t;
> +#endif
> +
> +#ifndef __kernel_old_dev_t
> +typedef unsigned int __kernel_old_dev_t;
> +#endif
> +
> +/*
> + * Most 32 bit architectures use "unsigned int" size_t,
> + * and all 64 bit architectures use "unsigned long" size_t.
> + */
> +#ifndef __kernel_size_t
> +#if __BITS_PER_LONG != 64
> +typedef unsigned int __kernel_size_t;
> +typedef int  __kernel_ssize_t;
> +typedef int  __kernel_ptrdiff_t;
> +#else
> +typedef __kernel_ulong_t __kernel_size_t;
> +typedef __kernel_long_t  __kernel_ssize_t;
> +typedef __kernel_long_t  __kernel_ptrdiff_t;
> +#endif
> +#endif
> +
> +#ifndef __kernel_fsid_t
> +typedef struct {
> + int val[2];
> +} __kernel_fsid_t;
> +#endif
> +
> +/*
> + * anything below here should be completely generic
> + */
> +typedef __kernel_long_t  __kernel_off_t;
> +typedef long long__kernel_loff_t;
> +typedef __kernel_long_t  __kernel_time_t;
> +typedef __kernel_long_t  __kernel_clock_t;
> +typedef int  __kernel_timer_t;
> +typedef int  __kernel_clockid_t;
> +typedef char *   __kernel_caddr_t;
> +typedef unsigned short   __kernel_uid16_t;
> +typedef unsigned short   __kernel_gid16_t;
> +
> +#endif /* __ASM_GENERIC_POSIX_TYPES_H */
> --
> 2.15.1
> 

This isn't going to be supportable.  The headers in uapi are updated by
a script from kernel, and this version posix_types.h conflicts with what
the kernel creates by make headers_install.

Re: [PATCH 2/2] veth: allow configuring GSO maximums

2017-12-05 Thread Stephen Hemminger

On Tue,  5 Dec 2017 17:14:26 -0800
Solio Sarabia  wrote:

> From: Stephen Hemminger 
> 
> Veth's can be used in environments (like Azure) where the underlying
> network device is impacted by large GSO packets. This patch allows
> gso maximum values to be passed in when creating the device via
> netlink.
> 
> In theory, other pseudo devices could also use netlink attributes
> to set GSO maximums but for now veth is what has been observed
> to be an issue.
> 
> Signed-off-by: Stephen Hemminger 
> Signed-off-by: Solio Sarabia 

I am testing new version with changelink support

[PATCH 1/2] rtnetlink: allow GSO maximums to be passed to device

2017-12-05 Thread Solio Sarabia

From: Stephen Hemminger 

Allow GSO maximum segments and size as netlink parameters on input,
with 'ip link add' utils for example. Allow also updating these
attributes after rtnetlink devices are created.

Signed-off-by: Stephen Hemminger 
Signed-off-by: Solio Sarabia 
---
 net/core/rtnetlink.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a4faefd..a1ff2a8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1637,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PROMISCUITY]  = { .type = NLA_U32 },
[IFLA_NUM_TX_QUEUES]= { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES]= { .type = NLA_U32 },
+   [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
+   [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = 
MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES]  = { .type = NLA_U32 },  /* ignored */
[IFLA_PHYS_SWITCH_ID]   = { .type = NLA_BINARY, .len = 
MAX_PHYS_ITEM_ID_LEN },
@@ -2287,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb,
}
}
 
+   if (tb[IFLA_GSO_MAX_SIZE]) {
+   u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+   if (max_size > GSO_MAX_SIZE) {
+   err = -EINVAL;
+   goto errout;
+   }
+
+   if (dev->gso_max_size ^ max_size) {
+   netif_set_gso_max_size(dev, max_size);
+   status |= DO_SETLINK_MODIFIED;
+   }
+   }
+
+   if (tb[IFLA_GSO_MAX_SEGS]) {
+   u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+   if (max_segs > GSO_MAX_SEGS) {
+   err = -EINVAL;
+   goto errout;
+   }
+
+   if (dev->gso_max_segs ^ max_segs) {
+   dev->gso_max_segs = max_segs;
+   status |= DO_SETLINK_MODIFIED;
+   }
+   }
+
if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 
-- 
2.7.4

[PATCH 0/2] Allow changing device gso maximums

2017-12-05 Thread Solio Sarabia

Docker uses bridge/veth for its bridged network. veth sends tcp packets
as big as 65536 (its default gso value), even when lower physical or
synthetic devices expose a lower limit. This causes tcp fragmentation in
the host, spinning more cpu cycles.

The proposed solution is to allow user to tune gso settings, via iproute
utils for example. Note: this enables changing gso for all interfaces,
not limited to veth only.

This series rebases Stephen's original patches [1]. It also fixes a
minor issue when validating maximum gso_max_size, which can be in the
range [0,65536]. Changes are validated with and without docker use
cases.

[1] https://marc.info/?l=linux-netdev&m=151217101428494&w=2

Stephen Hemminger (2):
  rtnetlink: allow GSO maximums to be passed to device
  veth: allow configuring GSO maximums

 drivers/net/veth.c   | 20 
 net/core/rtnetlink.c | 30 ++
 2 files changed, 50 insertions(+)

-- 
2.7.4

[PATCH 2/2] veth: allow configuring GSO maximums

2017-12-05 Thread Solio Sarabia

From: Stephen Hemminger 

Veth's can be used in environments (like Azure) where the underlying
network device is impacted by large GSO packets. This patch allows
gso maximum values to be passed in when creating the device via
netlink.

In theory, other pseudo devices could also use netlink attributes
to set GSO maximums but for now veth is what has been observed
to be an issue.

Signed-off-by: Stephen Hemminger 
Signed-off-by: Solio Sarabia 
---
 drivers/net/veth.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f5438d0..510c058 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -410,6 +410,26 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
if (ifmp && (dev->ifindex != 0))
peer->ifindex = ifmp->ifi_index;
 
+   if (tbp[IFLA_GSO_MAX_SIZE]) {
+   u32 max_size = nla_get_u32(tbp[IFLA_GSO_MAX_SIZE]);
+
+   if (max_size > GSO_MAX_SIZE)
+   return -EINVAL;
+
+   peer->gso_max_size = max_size;
+   dev->gso_max_size = max_size;
+   }
+
+   if (tbp[IFLA_GSO_MAX_SEGS]) {
+   u32 max_segs = nla_get_u32(tbp[IFLA_GSO_MAX_SEGS]);
+
+   if (max_segs > GSO_MAX_SEGS)
+   return -EINVAL;
+
+   peer->gso_max_segs = max_segs;
+   dev->gso_max_segs = max_segs;
+   }
+
err = register_netdevice(peer);
put_net(net);
net = NULL;
-- 
2.7.4

Re: [PATCH v2 iproute2 net-next] gre6: add collect metadata support

2017-12-05 Thread Stephen Hemminger

On Tue,  5 Dec 2017 15:10:37 -0800
William Tu  wrote:

> diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
> index a6a10e577b1f..eb04f887c940 100644
> --- a/man/man8/ip-link.8.in
> +++ b/man/man8/ip-link.8.in
> @@ -755,6 +755,8 @@ the following additional arguments are supported:
>  .BI "dscp inherit"
>  ] [
>  .BI dev " PHYS_DEV "
> +] [
> +.RB external
>  ]
>  
>  .in +8
> @@ -833,6 +835,10 @@ or
>  .IR 00 ".." ff
>  when tunneling non-IP packets. The default value is 00.
>  
> +.sp
> +.RB external
> +- make this tunnel externally controlled (or not, which is the default).
> +
>  .in -8

I don't have any  direct involvement in offload, so would like some feedback
from others that are.

Not a big fan of opaque "metadata" what exactly does it mean?
Also "external" is already used to mean something else on other parts of
the link command. Also the option, and the value in JSON should be the same.

Please reconsider the naming and resubmit

The wording in the man page here could be better

Re: [PATCH v2 0/3] macb rx filter cleanups

2017-12-05 Thread David Miller

From: Julia Cartwright 
Date: Tue, 5 Dec 2017 18:02:47 -0600

> Here's a proper patchset based on net-next.
> 
> v1 -> v2:
>   - Rebased on net-next
>   - Add Nicolas's Acks
>   - Reorder commits, putting the list_empty() cleanups prior to the
> others.
>   - Added commit reverting the GFP_ATOMIC change.

Thanks for following up so quickly.

Series applied to net-next, thanks.

Re: [PATCH iproute2] iproute2: Fix undeclared __kernel_long_t type build error in RHEL 6.8

2017-12-05 Thread Stephen Hemminger

On Sat, 2 Dec 2017 10:28:33 +0200
Leon Romanovsky  wrote:

> On Fri, Dec 01, 2017 at 08:48:07AM -0800, Stephen Hemminger wrote:
> > On Fri,  1 Dec 2017 13:04:51 +0200
> > Leon Romanovsky  wrote:
> >  
> > > From: Leon Romanovsky 
> > >
> > > Add asm/posix_types.h header file to the list of needed includes,
> > > because the header files in RHEL 6.8 are too old and don't
> > > have a declaration of __kernel_long_t.
> > >
> > > In file included from ../include/uapi/linux/kernel.h:5,
> > >  from ../include/uapi/linux/netfilter/x_tables.h:4,
> > >  from ../include/xtables.h:20,
> > >  from em_ipset.c:26:
> > > ../include/uapi/linux/sysinfo.h:9: error: expected 
> > > specifier-qualifier-list before ‘__kernel_long_t’
> > >
> > > Cc: Riad Abo Raed 
> > > Cc: Guy Ergas 
> > > Signed-off-by: Leon Romanovsky   
> >
> > I see the problem, but the solution of dragging in posix_types.h
> > would be too much of a long term maintenance issue.
> > All the headers in uapi are regularly generated from upstream
> > kernel headers; I don't want to start making exceptions.
> >
> > Is it just the xtables stuff (which has always been problematic)?  
> 
> Yes, both failures are related to xtables. And this was my naive approach to
> solve the first one; the second, mentioned in the original commit log
> (missing xtables-version.h), is harder to fix.
> 
> Will it work if I test in configure script the existence of __kernel_long_t
> and fallback to xt-internal.h?
> 
> Thanks

Why not just modify the part of the configure script that checks if the xtables
build will work? It should fail if the header files won't work.


pgpTF4jCjD7EN.pgp
Description: OpenPGP digital signature

Re: [PATCH v2 3/3] ethtool: Add ETHTOOL_RESET support via --reset command

2017-12-05 Thread Scott Branden


Hi Michal,

Thanks - one question below hopefully someone can help with.


On 17-12-05 02:29 PM, Michal Kubecek wrote:

On Tue, Dec 05, 2017 at 02:06:09PM -0800, Scott Branden wrote:

On 17-12-05 01:30 PM, Michal Kubecek wrote:

On Tue, Dec 05, 2017 at 12:53:23PM -0800, Scott Branden wrote:

Add ETHTOOL_RESET support via --reset command.

ie.  ethtool --reset DEVNAME 

flagnames currently match the ETH_RESET_xxx names:
mgmt,irq,dma,filter,offload,mac,phy,ram,dedicated,all

Alternatively, you can specify the component bitfield directly using
ethtool --reset DEVNAME flags %x

IMHO it would be more consistent with e.g. msglvl without the keyword
"flags".

I don't see the consistency in ethtool of specifying a number without a
keyword in front of it.
I can only find --set-dump specify a number?
Others have keyword and number.  msglvl is the keyword after specifying -s -
same as flags is the keyword I use after specifying --reset.

What I meant is that you can write

 ethtool -s eth0 msglvl drv on probe off
 ethtool -s eth0 msglvl 0x7

i.e. either number or names (with on/off in this case) while your patch
has

 ethtool --reset eth0 mgmt,irq
 ethtool --reset eth0 flags 0x3

i.e. an extra keyword if a number is used.

But it's not really important, it doesn't seem I would be able to share
a parser for this with any other subcommand or parameter anyway.


   It would be also nice to provide a symbolic way to specify the
shared flags.

I'll change to allow -shared to be added to the end of each component
specified to use the shared bit.
  IE. mgmt-shared, irq-shared, dma-shared ?

Sounds good to me.


+   resetinfo.cmd = ETHTOOL_RESET;
+
+   if (send_ioctl(ctx, &resetinfo)) {
+   perror("Cannot issue RESET");
+   return 1;
+   }
+   fprintf(stdout, "RESET 0x%x issued\n", resetinfo.data);

According to documentation, driver is supposed to clear the flags
corresponding to components which were reset so that what is left are
those which were _not_ reset.

I'll move the print above the send_ioctl.

It might be even more useful if ethtool informed user what actually
happened, i.e. either change the message to saying these are bits for
components not reset (if resetinfo.data is not zero) or save the
original value of resetinfo.data and show  saved_data & ~resetinfo.data

In making the improvement I found a bug in the bnxt_en kernel driver.
The bnxt_en driver currently doesn't clear any of the component flags on 
success so I need to send in a fix for that.


Although in one case (RESET_ALL) in the driver it doesn't actually 
execute the reset until all necessary drivers are unloaded to prevent 
the PCIe bus from hanging.
So question: should the flags be cleared if the reset is "pending" but 
hasn't actually happened yet, but will reset once all the drivers are 
all properly unloaded?


Michal Kubecek

[Patch net-next v2] act_mirred: get rid of mirred_list_lock spinlock

2017-12-05 Thread Cong Wang

TC actions are no longer freed in RCU callbacks and we should
always have RTNL lock, so this spinlock is no longer needed.

Cc: Eric Dumazet 
Cc: Jiri Pirko 
Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 net/sched/act_mirred.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index ff497909c0ad..cee2d413bf57 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,7 +29,6 @@
 #include 
 
 static LIST_HEAD(mirred_list);
-static DEFINE_SPINLOCK(mirred_list_lock);
 
 static bool tcf_mirred_is_act_redirect(int action)
 {
@@ -55,13 +54,10 @@ static void tcf_mirred_release(struct tc_action *a)
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
 
-   /* We could be called either in a RCU callback or with RTNL lock held. 
*/
-   spin_lock_bh(&mirred_list_lock);
list_del(&m->tcfm_list);
-   dev = rcu_dereference_protected(m->tcfm_dev, 1);
+   dev = rtnl_dereference(m->tcfm_dev);
if (dev)
dev_put(dev);
-   spin_unlock_bh(&mirred_list_lock);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -147,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr 
*nla,
}
 
if (ret == ACT_P_CREATED) {
-   spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
-   spin_unlock_bh(&mirred_list_lock);
tcf_idr_insert(tn, *a);
}
 
@@ -293,7 +287,6 @@ static int mirred_device_event(struct notifier_block 
*unused,
 
ASSERT_RTNL();
if (event == NETDEV_UNREGISTER) {
-   spin_lock_bh(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
if (rcu_access_pointer(m->tcfm_dev) == dev) {
dev_put(dev);
@@ -303,7 +296,6 @@ static int mirred_device_event(struct notifier_block 
*unused,
RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
}
-   spin_unlock_bh(&mirred_list_lock);
}
 
return NOTIFY_DONE;
-- 
2.13.0

[Patch net-next v2] act_mirred: get rid of tcfm_ifindex from struct tcf_mirred

2017-12-05 Thread Cong Wang

tcfm_dev always points to the correct netdev and we already
hold a refcnt, so no need to use tcfm_ifindex to lookup again.

If we would support moving target netdev across netns, using
pointer would be better than ifindex.

This also fixes dumping obsolete ifindex, now after the
target device is gone we just dump 0 as ifindex.

Cc: Jiri Pirko 
Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c  |  6 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c  | 12 +---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c |  8 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  6 --
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |  5 ++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c|  5 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c |  3 +--
 drivers/net/ethernet/netronome/nfp/flower/action.c|  4 +---
 include/net/tc_act/tc_mirred.h|  6 ++
 net/dsa/slave.c   |  5 +
 net/sched/act_mirred.c|  7 +++
 11 files changed, 26 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 3d201d7324bd..b3ff5287aafe 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -54,12 +54,10 @@ static int bnxt_tc_parse_redir(struct bnxt *bp,
   struct bnxt_tc_actions *actions,
   const struct tc_action *tc_act)
 {
-   int ifindex = tcf_mirred_ifindex(tc_act);
-   struct net_device *dev;
+   struct net_device *dev = tcf_mirred_dev(tc_act);
 
-   dev = __dev_get_by_index(dev_net(bp->dev), ifindex);
if (!dev) {
-   netdev_info(bp->dev, "no dev for ifindex=%d", ifindex);
+   netdev_info(bp->dev, "no dev in mirred action");
return -EINVAL;
}
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index d4a548a6a55c..a12b894f135d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -405,9 +405,7 @@ static void cxgb4_process_flow_actions(struct net_device 
*in,
} else if (is_tcf_gact_shot(a)) {
fs->action = FILTER_DROP;
} else if (is_tcf_mirred_egress_redirect(a)) {
-   int ifindex = tcf_mirred_ifindex(a);
-   struct net_device *out = __dev_get_by_index(dev_net(in),
-   ifindex);
+   struct net_device *out = tcf_mirred_dev(a);
struct port_info *pi = netdev_priv(out);
 
fs->action = FILTER_SWITCH;
@@ -582,14 +580,14 @@ static int cxgb4_validate_flow_actions(struct net_device 
*dev,
/* Do nothing */
} else if (is_tcf_mirred_egress_redirect(a)) {
struct adapter *adap = netdev2adap(dev);
-   struct net_device *n_dev;
-   unsigned int i, ifindex;
+   struct net_device *n_dev, *target_dev;
+   unsigned int i;
bool found = false;
 
-   ifindex = tcf_mirred_ifindex(a);
+   target_dev = tcf_mirred_dev(a);
for_each_port(adap, i) {
n_dev = adap->port[i];
-   if (ifindex == n_dev->ifindex) {
+   if (target_dev == n_dev) {
found = true;
break;
}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
index cd0cd13a964d..ab174bcfbfb0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
@@ -114,14 +114,14 @@ static int fill_action_fields(struct adapter *adap,
 
/* Re-direct to specified port in hardware. */
if (is_tcf_mirred_egress_redirect(a)) {
-   struct net_device *n_dev;
-   unsigned int i, index;
+   struct net_device *n_dev, *target_dev;
bool found = false;
+   unsigned int i;
 
-   index = tcf_mirred_ifindex(a);
+   target_dev = tcf_mirred_dev(a);
for_each_port(adap, i) {
n_dev = adap->port[i];
-   if (index == n_dev->ifindex) {
+   if (target_dev == n_dev) {

[PATCH bpf-next 1/2] bpf, doc: add bpf trees and tps to maintainers entry

2017-12-05 Thread Daniel Borkmann

i) Add the bpf and bpf-next trees to the maintainers entry
   so they can be found easily and picked up by test bots
   etc that would integrate all trees from maintainers file.
   Suggested by Stephen while integrating the trees into
   linux-next.

ii) Add the two headers defining BPF/XDP tracepoints to the
list of files as well.

Suggested-by: Stephen Rothwell 
Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 MAINTAINERS | 4 
 1 file changed, 4 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4007fa2..77ad4bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2724,12 +2724,16 @@ M:  Alexei Starovoitov 
 M: Daniel Borkmann 
 L: netdev@vger.kernel.org
 L: linux-ker...@vger.kernel.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
 S: Supported
 F: arch/x86/net/bpf_jit*
 F: Documentation/networking/filter.txt
 F: Documentation/bpf/
 F: include/linux/bpf*
 F: include/linux/filter.h
+F: include/trace/events/bpf.h
+F: include/trace/events/xdp.h
 F: include/uapi/linux/bpf*
 F: include/uapi/linux/filter.h
 F: kernel/bpf/
-- 
2.9.5

[PATCH bpf-next 2/2] bpf, doc: add faq about bpf development process

2017-12-05 Thread Daniel Borkmann

In the same spirit of netdev FAQ, start a BPF FAQ as a collection
of expectations and/or workflow details in the context of BPF patch
processing.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 Documentation/bpf/bpf_devel_QA.txt | 519 +
 1 file changed, 519 insertions(+)
 create mode 100644 Documentation/bpf/bpf_devel_QA.txt

diff --git a/Documentation/bpf/bpf_devel_QA.txt 
b/Documentation/bpf/bpf_devel_QA.txt
new file mode 100644
index 000..b0472a4
--- /dev/null
+++ b/Documentation/bpf/bpf_devel_QA.txt
@@ -0,0 +1,519 @@
+This document provides information for the BPF subsystem about various
+workflows related to reporting bugs, submitting patches, and queueing
+patches for stable kernels.
+
+For general information about submitting patches, please refer to
+Documentation/process/. This document only describes additional specifics
+related to BPF.
+
+Reporting bugs:
+---
+
+Q: How do I report bugs for BPF kernel code?
+
+A: Since all BPF kernel development as well as bpftool and iproute2 BPF
+   loader development happens through the netdev kernel mailing list,
+   please report any found issues around BPF to the following mailing
+   list:
+
+ netdev@vger.kernel.org
+
+   This may also include issues related to XDP, BPF tracing, etc.
+
+   Given netdev has a high volume of traffic, please also add the BPF
+   maintainers to Cc (from kernel MAINTAINERS file):
+
+ Alexei Starovoitov 
+ Daniel Borkmann 
+
+   In case a buggy commit has already been identified, make sure to keep
+   the actual commit authors in Cc as well for the report. They can
+   typically be identified through the kernel's git tree.
+
+   Please do *not* report BPF issues to bugzilla.kernel.org since it
+   is a guarantee that the reported issue will be overlooked.
+
+Submitting patches:
+---
+
+Q: To which mailing list do I need to submit my BPF patches?
+
+A: Please submit your BPF patches to the netdev kernel mailing list:
+
+ netdev@vger.kernel.org
+
+   Historically, BPF came out of networking and has always been maintained
+   by the kernel networking community. Although these days BPF touches
+   many other subsystems as well, the patches are still routed mainly
+   through the networking community.
+
+   In case your patch has changes in various different subsystems (e.g.
+   tracing, security, etc), make sure to Cc the related kernel mailing
+   lists and maintainers from there as well, so they are able to review
+   the changes and provide their Acked-by's to the patches.
+
+Q: Where can I find patches currently under discussion for BPF subsystem?
+
+A: All patches that are Cc'ed to netdev are queued for review under netdev
+   patchwork project:
+
+ http://patchwork.ozlabs.org/project/netdev/list/
+
+   Those patches which target BPF, are assigned to a 'bpf' delegate for
+   further processing from BPF maintainers. The current queue with
+   patches under review can be found at:
+
+ https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+
+   Once the patches have been reviewed by the BPF community as a whole
+   and approved by the BPF maintainers, their status in patchwork will be
+   changed to 'Accepted' and the submitter will be notified by mail. This
+   means that the patches look good from a BPF perspective and have been
+   applied to one of the two BPF kernel trees.
+
+   In case feedback from the community requires a respin of the patches,
+   their status in patchwork will be set to 'Changes Requested', and purged
+   from the current review queue. Likewise for cases where patches would
+   get rejected or are not applicable to the BPF trees (but assigned to
+   the 'bpf' delegate).
+
+Q: How do the changes make their way into Linux?
+
+A: There are two BPF kernel trees (git repositories). Once patches have
+   been accepted by the BPF maintainers, they will be applied to one
+   of the two BPF trees:
+
+ https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
+ https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
+
+   The bpf tree itself is for fixes only, whereas bpf-next for features,
+   cleanups or other kind of improvements ("next-like" content). This is
+   analogous to net and net-next trees for networking. Both bpf and
+   bpf-next will only have a master branch in order to simplify against
+   which branch patches should get rebased to.
+
+   Accumulated BPF patches in the bpf tree will regularly get pulled
+   into the net kernel tree. Likewise, accumulated BPF patches accepted
+   into the bpf-next tree will make their way into net-next tree. net and
+   net-next are both run by David S. Miller. From there, they will go
+   into the kernel mainline tree run by Linus Torvalds. To read up on the
+   process of net and net-next being merged into the mainline tree, see
+   the netdev FAQ under:
+
+ Documentation/networking/netdev-FAQ.txt
+
+   Occasionally,

[PATCH bpf-next 0/2] Few BPF doc updates

2017-12-05 Thread Daniel Borkmann

Two changes, i) add BPF trees into maintainers file, and ii) add
a BPF doc around the development process, similarly as we have
with netdev FAQ, but just describing BPF specifics. Thanks!

Daniel Borkmann (2):
  bpf, doc: add bpf trees and tps to maintainers entry
  bpf, doc: add faq about bpf development process

 Documentation/bpf/bpf_devel_QA.txt | 519 +
 MAINTAINERS|   4 +
 2 files changed, 523 insertions(+)
 create mode 100644 Documentation/bpf/bpf_devel_QA.txt

-- 
2.9.5

[PATCH net] net: thunderx: Fix TCP/UDP checksum offload for IPv4 pkts

2017-12-05 Thread Florian Westphal

Offload IP header checksum to NIC.

This fixes a previous patch which disabled checksum offloading
for both IPv4 and IPv6 packets.  So L3 checksum offload was
getting disabled for IPv4 pkts.  And HW is dropping these pkts
for some reason.

Without this patch, IPv4 TSO appears to be broken:

Without this patch I get ~16kbyte/s, with patch close to 2mbyte/s
when copying files via scp from test box to my home workstation.

Looking at tcpdump on sender it looks like hardware drops IPv4 TSO skbs.
This patch restores performance for me, ipv6 looks good too.

Fixes: fa6d7cb5d76c ("net: thunderx: Fix TCP/UDP checksum offload for IPv6 
pkts")
Cc: Sunil Goutham 
Cc: Aleksey Makarov 
Cc: Eric Dumazet 
Signed-off-by: Florian Westphal 
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 8b2c31e2a2b0..a3d12dbde95b 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -1355,6 +1355,8 @@ nicvf_sq_add_hdr_subdesc(struct nicvf *nic, struct 
snd_queue *sq, int qentry,
 
/* Offload checksum calculation to HW */
if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   if (ip.v4->version == 4)
+   hdr->csum_l3 = 1; /* Enable IP csum calculation */
hdr->l3_offset = skb_network_offset(skb);
hdr->l4_offset = skb_transport_offset(skb);
 
-- 
2.13.6

[PATCH v2 0/3] macb rx filter cleanups

2017-12-05 Thread Julia Cartwright

Here's a proper patchset based on net-next.

v1 -> v2:
  - Rebased on net-next
  - Add Nicolas's Acks
  - Reorder commits, putting the list_empty() cleanups prior to the
others.
  - Added commit reverting the GFP_ATOMIC change.

Julia Cartwright (3):
  net: macb: kill useless use of list_empty()
  net: macb: reduce scope of rx_fs_lock-protected regions
  net: macb: change GFP_ATOMIC to GFP_KERNEL

 drivers/net/ethernet/cadence/macb_main.c | 47 
 1 file changed, 23 insertions(+), 24 deletions(-)

-- 
2.14.2

[PATCH v2 2/3] net: macb: reduce scope of rx_fs_lock-protected regions

2017-12-05 Thread Julia Cartwright

Commit ae8223de3df5 ("net: macb: Added support for RX filtering")
introduces a lock, rx_fs_lock which is intended to protect the list of
rx_flow items and synchronize access to the hardware rx filtering
registers.

However, the region protected by this lock is overscoped, unnecessarily
including things like slab allocation.  Reduce this lock scope to only
include operations which must be performed atomically: list traversal,
addition, and removal, and hitting the macb filtering registers.

This fixes the use of kmalloc w/ GFP_KERNEL in atomic context.

Fixes: ae8223de3df5 ("net: macb: Added support for RX filtering")
Cc: Rafal Ozieblo 
Cc: Julia Lawall 
Acked-by: Nicolas Ferre 
Signed-off-by: Julia Cartwright 
---
 drivers/net/ethernet/cadence/macb_main.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index b7644836aba1..758e8b3042b2 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2796,6 +2796,7 @@ static int gem_add_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_flow_spec *fs = &cmd->fs;
struct ethtool_rx_fs_item *item, *newfs;
+   unsigned long flags;
int ret = -EINVAL;
bool added = false;
 
@@ -2811,6 +2812,8 @@ static int gem_add_flow_filter(struct net_device *netdev,
htonl(fs->h_u.tcp_ip4_spec.ip4dst),
htons(fs->h_u.tcp_ip4_spec.psrc), 
htons(fs->h_u.tcp_ip4_spec.pdst));
 
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
+
/* find correct place to add in list */
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location > newfs->fs.location) {
@@ -2833,9 +2836,11 @@ static int gem_add_flow_filter(struct net_device *netdev,
if (netdev->features & NETIF_F_NTUPLE)
gem_enable_flow_filters(bp, 1);
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return 0;
 
 err:
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
kfree(newfs);
return ret;
 }
@@ -2846,6 +2851,9 @@ static int gem_del_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_fs_item *item;
struct ethtool_rx_flow_spec *fs;
+   unsigned long flags;
+
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
 
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location == cmd->fs.location) {
@@ -2862,12 +2870,14 @@ static int gem_del_flow_filter(struct net_device 
*netdev,
gem_writel_n(bp, SCRT2, fs->location, 0);
 
list_del(&item->list);
-   kfree(item);
bp->rx_fs_list.count--;
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
+   kfree(item);
return 0;
}
}
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return -EINVAL;
 }
 
@@ -2936,11 +2946,8 @@ static int gem_get_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd,
 static int gem_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
 {
struct macb *bp = netdev_priv(netdev);
-   unsigned long flags;
int ret;
 
-   spin_lock_irqsave(&bp->rx_fs_lock, flags);
-
switch (cmd->cmd) {
case ETHTOOL_SRXCLSRLINS:
if ((cmd->fs.location >= bp->max_tuples)
@@ -2959,7 +2966,6 @@ static int gem_set_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd)
ret = -EOPNOTSUPP;
}
 
-   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return ret;
 }
 
-- 
2.14.2

[PATCH v2 1/3] net: macb: kill useless use of list_empty()

2017-12-05 Thread Julia Cartwright

The list_for_each_entry() macro already handles the case where the list
is empty (by not executing the loop body).  It's not necessary to handle
this case specially, so stop doing so.

Cc: Rafal Ozieblo 
Acked-by: Nicolas Ferre 
Signed-off-by: Julia Cartwright 
---
 drivers/net/ethernet/cadence/macb_main.c | 31 ---
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index ebfeab853bf4..b7644836aba1 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2812,24 +2812,20 @@ static int gem_add_flow_filter(struct net_device 
*netdev,
htons(fs->h_u.tcp_ip4_spec.psrc), 
htons(fs->h_u.tcp_ip4_spec.pdst));
 
/* find correct place to add in list */
-   if (list_empty(&bp->rx_fs_list.list))
-   list_add(&newfs->list, &bp->rx_fs_list.list);
-   else {
-   list_for_each_entry(item, &bp->rx_fs_list.list, list) {
-   if (item->fs.location > newfs->fs.location) {
-   list_add_tail(&newfs->list, &item->list);
-   added = true;
-   break;
-   } else if (item->fs.location == fs->location) {
-   netdev_err(netdev, "Rule not added: location %d 
not free!\n",
-   fs->location);
-   ret = -EBUSY;
-   goto err;
-   }
+   list_for_each_entry(item, &bp->rx_fs_list.list, list) {
+   if (item->fs.location > newfs->fs.location) {
+   list_add_tail(&newfs->list, &item->list);
+   added = true;
+   break;
+   } else if (item->fs.location == fs->location) {
+   netdev_err(netdev, "Rule not added: location %d not 
free!\n",
+   fs->location);
+   ret = -EBUSY;
+   goto err;
}
-   if (!added)
-   list_add_tail(&newfs->list, &bp->rx_fs_list.list);
}
+   if (!added)
+   list_add_tail(&newfs->list, &bp->rx_fs_list.list);
 
gem_prog_cmp_regs(bp, fs);
bp->rx_fs_list.count++;
@@ -2851,9 +2847,6 @@ static int gem_del_flow_filter(struct net_device *netdev,
struct ethtool_rx_fs_item *item;
struct ethtool_rx_flow_spec *fs;
 
-   if (list_empty(&bp->rx_fs_list.list))
-   return -EINVAL;
-
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location == cmd->fs.location) {
/* disable screener regs for the flow entry */
-- 
2.14.2

[PATCH v2 3/3] net: macb: change GFP_ATOMIC to GFP_KERNEL

2017-12-05 Thread Julia Cartwright

Now that the rx_fs_lock is no longer held across allocation, it's safe
to use GFP_KERNEL for allocating new entries.

This reverts commit 81da3bf6e3f88 ("net: macb: change GFP_KERNEL to
GFP_ATOMIC").

Cc: Julia Lawall 
Signed-off-by: Julia Cartwright 
---
 drivers/net/ethernet/cadence/macb_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 758e8b3042b2..234667eaaa92 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2800,7 +2800,7 @@ static int gem_add_flow_filter(struct net_device *netdev,
int ret = -EINVAL;
bool added = false;
 
-   newfs = kmalloc(sizeof(*newfs), GFP_ATOMIC);
+   newfs = kmalloc(sizeof(*newfs), GFP_KERNEL);
if (newfs == NULL)
return -ENOMEM;
memcpy(&newfs->fs, fs, sizeof(newfs->fs));
-- 
2.14.2

Re: [PATCH v2 0/6] bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type

2017-12-05 Thread Daniel Borkmann

On 12/05/2017 04:53 PM, Daniel Borkmann wrote:
> On 12/04/2017 10:56 AM, Hendrik Brueckner wrote:
>> Perf tool bpf selftests revealed a broken uapi for s390 and arm64.
>> With the BPF_PROG_TYPE_PERF_EVENT program type the bpf_perf_event
>> structure exports the pt_regs structure for all architectures.
>>
>> This fails for s390 and arm64 because pt_regs are not part of the
>> user api and kept in-kernel only.  To mitigate the broken uapi,
>> introduce a wrapper that exports pt_regs in an asm-generic way.
> >> For arm64, export the existing user_pt_regs structure.  For s390,
>> introduce a user_pt_regs structure that exports the beginning of
>> pt_regs.
>>
>> Note that user_pt_regs must export from the beginning of pt_regs
>> as BPF_PROG_TYPE_PERF_EVENT program type is not the only type for
>> running BPF programs.
>>
>> Some more background:
>>  For the bpf_perf_event, there is a uapi definition that is
>>  passed to the BPF program.  For other "probe" points like
>>  trace points, kprobes, and uprobes, there is no uapi and the
>>  BPF program is always passed pt_regs (which is OK as the BPF
>>  program runs in the kernel context).  The perf tool can attach
>>  BPF programs to all of these "probe" points and, optionally,
>>  can create a BPF prologue to access particular arguments
>>  (passed as registers).  For this, it uses DWARF/CFI
>>  information to obtain the register and calls a perf-arch
>>  backend function, regs_query_register_offset().  This function
>>  returns the index into (user_)pt_regs for a particular
>>  register.  Then, perf creates a BPF prologue that accesses
> >>  this register based on the passed structure from the "probe"
>>  point.
>>
>> Part of this series, are also updates to the testing and bpf selftest
>> to deal with asm-specifics.  To complete the bpf support in perf, the
> >> regs_query_register_offset function is added for s390 to support
>> BPF prologue creation.
>>
>> Changelog v1 -> v2:
>> - Correct kbuild test bot issues by including
> >>   asm-generic/bpf_perf_event.h for architectures that do not have
>>   their own asm version.
>> - Added patch to clean-up whitespace and coding style issues in s390
>>   asm/ptrace.h (#4/6) as suggested by Alexei.
>>
>>
>> Hendrik Brueckner (6):
>>   bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type
>>   s390/bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program
>> type
>>   arm64/bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program
>> type
>>   s390/uapi: correct whitespace & coding style in asm/ptrace.h
>>   selftests/bpf: sync kernel headers and introduce arch support in
>> Makefile
>>   perf s390: add regs_query_register_offset()
> 
> Series looks good to me, thanks for working on this Hendrik! If nobody
> hollers, I would take the fixes via bpf tree later tonight.

Done, applied to bpf, thanks Hendrik!

Re: [PATCH 3/3] make sock_alloc_file() do sock_release() on failures

2017-12-05 Thread David Miller

From: Al Viro 
Date: Tue, 5 Dec 2017 23:29:09 +

> This changes calling conventions (and simplifies the hell out of
> the callers).  New rules: once struct socket had been passed
> to sock_alloc_file(), it's been consumed either by struct file
> or by sock_release() done by sock_alloc_file().  Either way
> the caller should not do sock_release() after that point.
> 
> Reviewed-by: Eric Dumazet 
> Signed-off-by: Al Viro 

Applied.

Re: [PATCH tip/core/rcu 21/21] drivers/vhost: Remove now-redundant read_barrier_depends()

2017-12-05 Thread Paul E. McKenney

On Wed, Dec 06, 2017 at 12:09:36AM +0200, Michael S. Tsirkin wrote:
> On Tue, Dec 05, 2017 at 10:57:00PM +0100, Peter Zijlstra wrote:
> > On Tue, Dec 05, 2017 at 11:24:49PM +0200, Michael S. Tsirkin wrote:
> > > READ_ONCE is really all over the place (some code literally replaced all
> > > memory accesses with READ/WRITE ONCE).
> > 
> > Yeah, so?
> 
> Oh my point was I can't just look for READ_ONCE and go
> *that's the pair*. there are too many of these.
> At Paul's suggestion I will document the pairing *this read once has a
> barrier that is paired with that barrier*.
> 
> > Complain to the compiler people for forcing us into that.
> 
> In some cases when you end up with all accesses
> going through read/write once volatile just might be better.

That is in fact what the jiffies counter does.  But you lose READ_ONCE()'s
automatic handling of DEC Alpha when you take that approach.

> > > Would an API like WRITE_POINTER()/smp_store_pointer make sense,
> > > and READ_POINTER for symmetry?
> > 
> > No, the whole point of the exercise was to get away from the fact that
> > dependent loads are special.
> 
> It's a pity that dependent stores are still special.

We can make READ_ONCE() not be special at zero cost on non-Alpha
systems, but both smp_wmb() and smp_store_release() are decidedly
not free of added overhead.

Thanx, Paul

Re: [PATCH 1/3] fix kcm_clone()

2017-12-05 Thread David Miller

From: Al Viro 
Date: Tue, 5 Dec 2017 23:27:57 +

> 1) it's fput() or sock_release(), not both
> 2) don't do fd_install() until the last failure exit.
> 3) not a bug per se, but... don't attach socket to struct file
>until it's set up.
> 
> Take reserving descriptor into the caller, move fd_install() to the
> caller, sanitize failure exits and calling conventions.
> 
> Cc: sta...@vger.kernel.org # v4.6+
> Acked-by: Tom Herbert 
> Signed-off-by: Al Viro 

Applied.

Re: [PATCH 2/3] socketpair(): allocate descriptors first

2017-12-05 Thread David Miller

From: Al Viro 
Date: Tue, 5 Dec 2017 23:28:38 +

> simplifies failure exits considerably...
> 
> Reviewed-by: Eric Dumazet 
> Signed-off-by: Al Viro 

Applied.

Re: [PATCH V11 3/5] printk: hash addresses printed with %p

2017-12-05 Thread Linus Torvalds

On Tue, Dec 5, 2017 at 2:57 PM, Geert Uytterhoeven  wrote:
> Lowest 3 is good enough for all natural types, up to long long.
> We may still receive complaints from people who care about seeing if
> a pointer is cacheline-aligned or not. Fixing that may need up to 7 bits, I'm
> afraid, which is a bit too much to give up.

I really think even the lowest three is a bit too much.

Who really cares? If it's something that is particularly _about_
alignment (ie an alignment trap), maybe just print out those bits
separately.

And for everything else? It's purely about getting used to it.

I will just cut-and-paste what I wrote in another thread about the hashing:

  I'm going to require some actual proof of an actual case where it
  mattered that a particular value was hashed.

  Not hand-waving.

  Not "it surprised and confused me" because it looked different. You'll
  get used to it.

  So an actual "this was critical information that mattered for this
  particular bug, and it was missing due to the hashing of this
  particular value and debugging was harder in actual reality due to
  that".

  Because the actual example I have seen so far, not only didn't the
  hashing matter AT ALL, most of the _unhashed_ values shouldn't have
  been there either, and were due to arm still printing stuff that
  shouldn't have been printed at all and just made the oops more complex
  and harder to read and report.

This subject is really easy to bike-shed around. Everybody can have an
opinion. I want actual hard data and facts, not opinions.

And if the hashing _really_ is a problem, we'll just change that
particular thing to %px. But it needs actual hard data and real
reasons first, ok?

  Linus

RE: [PATCH net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-12-05 Thread Tristram.Ha

> > Sorry to be this late for the reply.  I finally got hold of a KSZ8895 board 
> > that
> > works with my SoC board to confirm the network communication.
> >
> > As expected the KSZ8895 board works correctly as the chip uses the same
> > tail tagging feature in KSZ8795, and I did verify that board is working.
> >
> > One thing to debug this problem is to dump the MIB counters.  Use the
> ethtool
> > utility to show MIB counters of both ports:
> >
> > ethtool -S lan3
> > ethtool -S eth0
> >
> > Assuming eth0 is the MAC controller that drives the switch, the receive
> counters of
> > the host port of the switch should match the transmit counters of
> > lan3, and vice versa.
> 
> Thanks for reply. I'll get to the tests shortly. Could I get .dts
> snippet that works for you and commands you are using for testing?
> 

You said your previous patch works, so I do not think there is anything wrong
with the device tree, unless you are using a completely different one.

The tricky part of network communication is the RMII/MII interface where the
host port connects to the MAC controller.  But again your patch works so there 
is
nothing wrong with the hardware.

I also use a simple setup to test the network:

ifconfig eth0 up
ifconfig lan1 192.168.0.1
ping -c 2 192.168.0.100

If I want a complete test I setup a bridge:

ifconfig eth0 up
ifconfig lan1 up
ifconfig lan2 up
ifconfig lan3 up
ifconfig lan4 up
brctl addbr br0
brctl addif br0 lan1
brctl addif br0 lan2
brctl addif br0 lan3
brctl addif br0 lan4
ifconfig br0 192.168.0.1
ping -c 2 192.168.0.100

Re: [RFC][PATCHES] sock_alloc_file() cleanups and fixes

2017-12-05 Thread Al Viro

On Tue, Dec 05, 2017 at 02:44:43PM -0500, David Miller wrote:
> From: Al Viro 
> Date: Mon, 4 Dec 2017 16:41:01 +
> 
> > On Mon, Dec 04, 2017 at 10:35:24AM -0500, David Miller wrote:
> >> From: Al Viro 
> >> Date: Fri, 1 Dec 2017 00:20:27 +
> >> 
> >> >  1) massage sys_socketpair() (should be a pure cleanup)
> >> >  2) fix and clean up kcm_clone() (-stable fodder)
> >> >  3) switch sock_alloc_file() to new calling conventions.
> >> > 
> >> >  It got some local testing, but it certainly needs more review.
> >> > Diffstat for the entire thing is
> >> 
> >> Series looks great to me:
> >> 
> >> Acked-by: David S. Miller 
> > 
> > How do you prefer it to be handled?  KCM one should go into everything
> > since 4.6 (with trivial modifications in 4.11 and 4.12 - both had
> > massaged the place around the call of kcm_clone() a bit, but this fix
> > overwrites the entire area and that can be dropped into earlier
> > kernels without any problems).  I've put that into vfs.git#net-fixes
> > and have the other two in vfs.git#for-davem on top of that, with
> > you merging the latter into net-next.git and the former - into net.git.
> > Is that OK with you, or would you prefer some other way of handling
> > that kind of stuff?
> 
> Why don't you resubmit this to netdev as a non-RFC, I'll queue it up to
> 'net' and -stable as well.

Sent...

[PATCH 3/3] make sock_alloc_file() do sock_release() on failures

2017-12-05 Thread Al Viro

This changes calling conventions (and simplifies the hell out of
the callers).  New rules: once struct socket had been passed
to sock_alloc_file(), it's been consumed either by struct file
or by sock_release() done by sock_alloc_file().  Either way
the caller should not do sock_release() after that point.

Reviewed-by: Eric Dumazet 
Signed-off-by: Al Viro 
---
 drivers/staging/lustre/lnet/lnet/lib-socket.c |  8 ++--
 net/9p/trans_fd.c |  1 -
 net/kcm/kcmsock.c |  7 +--
 net/sctp/socket.c |  1 -
 net/socket.c  | 25 -
 5 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c 
b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index 539a26444f31..7d49d4865298 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -71,16 +71,12 @@ lnet_sock_ioctl(int cmd, unsigned long arg)
}
 
sock_filp = sock_alloc_file(sock, 0, NULL);
-   if (IS_ERR(sock_filp)) {
-   sock_release(sock);
-   rc = PTR_ERR(sock_filp);
-   goto out;
-   }
+   if (IS_ERR(sock_filp))
+   return PTR_ERR(sock_filp);
 
rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg);
 
fput(sock_filp);
-out:
return rc;
 }
 
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 985046ae4231..80f5c79053a4 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -839,7 +839,6 @@ static int p9_socket_open(struct p9_client *client, struct 
socket *csocket)
if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n",
   __func__, task_pid_nr(current));
-   sock_release(csocket);
kfree(p);
return PTR_ERR(file);
}
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index c5fa634e63ca..d4e98f20fc2a 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1629,7 +1629,6 @@ static struct file *kcm_clone(struct socket *osock)
 {
struct socket *newsock;
struct sock *newsk;
-   struct file *file;
 
newsock = sock_alloc();
if (!newsock)
@@ -1649,11 +1648,7 @@ static struct file *kcm_clone(struct socket *osock)
sock_init_data(newsock, newsk);
init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
 
-   file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
-   if (IS_ERR(file))
-   sock_release(newsock);
-
-   return file;
+   return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
 }
 
 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3204a9b29407..8bb5163d6331 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5080,7 +5080,6 @@ static int sctp_getsockopt_peeloff_common(struct sock 
*sk, sctp_peeloff_arg_t *p
*newfile = sock_alloc_file(newsock, 0, NULL);
if (IS_ERR(*newfile)) {
put_unused_fd(retval);
-   sock_release(newsock);
retval = PTR_ERR(*newfile);
*newfile = NULL;
return retval;
diff --git a/net/socket.c b/net/socket.c
index 2df83c0bfde9..05f361faec45 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -406,8 +406,10 @@ struct file *sock_alloc_file(struct socket *sock, int 
flags, const char *dname)
name.len = strlen(name.name);
}
path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
-   if (unlikely(!path.dentry))
+   if (unlikely(!path.dentry)) {
+   sock_release(sock);
return ERR_PTR(-ENOMEM);
+   }
path.mnt = mntget(sock_mnt);
 
d_instantiate(path.dentry, SOCK_INODE(sock));
@@ -415,9 +417,11 @@ struct file *sock_alloc_file(struct socket *sock, int 
flags, const char *dname)
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
  &socket_file_ops);
if (IS_ERR(file)) {
-   /* drop dentry, keep inode */
+   /* drop dentry, keep inode for a bit */
ihold(d_inode(path.dentry));
path_put(&path);
+   /* ... and now kill it properly */
+   sock_release(sock);
return file;
}
 
@@ -1330,19 +1334,9 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, 
protocol)
 
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
-   goto out;
-
-   retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
-   if (retval < 0)
-   goto out_release;
-
-out:
-   /* It may be already another descriptor 8) Not kernel problem. */
-   return retval;
+   return retval;
 
-out_release:
-   sock_release(sock);
-   return retval;
+   return sock_map_fd(s

[PATCH 1/3] fix kcm_clone()

2017-12-05 Thread Al Viro

1) it's fput() or sock_release(), not both
2) don't do fd_install() until the last failure exit.
3) not a bug per se, but... don't attach socket to struct file
   until it's set up.

Take reserving descriptor into the caller, move fd_install() to the
caller, sanitize failure exits and calling conventions.

Cc: sta...@vger.kernel.org # v4.6+
Acked-by: Tom Herbert 
Signed-off-by: Al Viro 
---
 net/kcm/kcmsock.c | 71 +--
 1 file changed, 27 insertions(+), 44 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 0b750a22c4b9..c5fa634e63ca 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1625,60 +1625,35 @@ static struct proto kcm_proto = {
 };
 
 /* Clone a kcm socket. */
-static int kcm_clone(struct socket *osock, struct kcm_clone *info,
-struct socket **newsockp)
+static struct file *kcm_clone(struct socket *osock)
 {
struct socket *newsock;
struct sock *newsk;
-   struct file *newfile;
-   int err, newfd;
+   struct file *file;
 
-   err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
-   goto out;
+   return ERR_PTR(-ENFILE);
 
newsock->type = osock->type;
newsock->ops = osock->ops;
 
__module_get(newsock->ops->owner);
 
-   newfd = get_unused_fd_flags(0);
-   if (unlikely(newfd < 0)) {
-   err = newfd;
-   goto out_fd_fail;
-   }
-
-   newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
-   if (IS_ERR(newfile)) {
-   err = PTR_ERR(newfile);
-   goto out_sock_alloc_fail;
-   }
-
newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
 &kcm_proto, true);
if (!newsk) {
-   err = -ENOMEM;
-   goto out_sk_alloc_fail;
+   sock_release(newsock);
+   return ERR_PTR(-ENOMEM);
}
-
sock_init_data(newsock, newsk);
init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
 
-   fd_install(newfd, newfile);
-   *newsockp = newsock;
-   info->fd = newfd;
-
-   return 0;
+   file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+   if (IS_ERR(file))
+   sock_release(newsock);
 
-out_sk_alloc_fail:
-   fput(newfile);
-out_sock_alloc_fail:
-   put_unused_fd(newfd);
-out_fd_fail:
-   sock_release(newsock);
-out:
-   return err;
+   return file;
 }
 
 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
@@ -1708,17 +1683,25 @@ static int kcm_ioctl(struct socket *sock, unsigned int 
cmd, unsigned long arg)
}
case SIOCKCMCLONE: {
struct kcm_clone info;
-   struct socket *newsock = NULL;
-
-   err = kcm_clone(sock, &info, &newsock);
-   if (!err) {
-   if (copy_to_user((void __user *)arg, &info,
-sizeof(info))) {
-   err = -EFAULT;
-   sys_close(info.fd);
-   }
-   }
+   struct file *file;
+
+   info.fd = get_unused_fd_flags(0);
+   if (unlikely(info.fd < 0))
+   return info.fd;
 
+   file = kcm_clone(sock);
+   if (IS_ERR(file)) {
+   put_unused_fd(info.fd);
+   return PTR_ERR(file);
+   }
+   if (copy_to_user((void __user *)arg, &info,
+sizeof(info))) {
+   put_unused_fd(info.fd);
+   fput(file);
+   return -EFAULT;
+   }
+   fd_install(info.fd, file);
+   err = 0;
break;
}
default:
-- 
2.11.0

[PATCH 2/3] socketpair(): allocate descriptors first

2017-12-05 Thread Al Viro

simplifies failure exits considerably...

Reviewed-by: Eric Dumazet 
Signed-off-by: Al Viro 
---
 net/socket.c | 89 ++--
 1 file changed, 38 insertions(+), 51 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index 42d8e9c9ccd5..2df83c0bfde9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1366,87 +1366,74 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, 
int, protocol,
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
/*
+* reserve descriptors and make sure we won't fail
+* to return them to userland.
+*/
+   fd1 = get_unused_fd_flags(flags);
+   if (unlikely(fd1 < 0))
+   return fd1;
+
+   fd2 = get_unused_fd_flags(flags);
+   if (unlikely(fd2 < 0)) {
+   put_unused_fd(fd1);
+   return fd2;
+   }
+
+   err = put_user(fd1, &usockvec[0]);
+   if (err)
+   goto out;
+
+   err = put_user(fd2, &usockvec[1]);
+   if (err)
+   goto out;
+
+   /*
 * Obtain the first socket and check if the underlying protocol
 * supports the socketpair call.
 */
 
err = sock_create(family, type, protocol, &sock1);
-   if (err < 0)
+   if (unlikely(err < 0))
goto out;
 
err = sock_create(family, type, protocol, &sock2);
-   if (err < 0)
-   goto out_release_1;
-
-   err = sock1->ops->socketpair(sock1, sock2);
-   if (err < 0)
-   goto out_release_both;
-
-   fd1 = get_unused_fd_flags(flags);
-   if (unlikely(fd1 < 0)) {
-   err = fd1;
-   goto out_release_both;
+   if (unlikely(err < 0)) {
+   sock_release(sock1);
+   goto out;
}
 
-   fd2 = get_unused_fd_flags(flags);
-   if (unlikely(fd2 < 0)) {
-   err = fd2;
-   goto out_put_unused_1;
+   err = sock1->ops->socketpair(sock1, sock2);
+   if (unlikely(err < 0)) {
+   sock_release(sock2);
+   sock_release(sock1);
+   goto out;
}
 
newfile1 = sock_alloc_file(sock1, flags, NULL);
if (IS_ERR(newfile1)) {
err = PTR_ERR(newfile1);
-   goto out_put_unused_both;
+   sock_release(sock1);
+   sock_release(sock2);
+   goto out;
}
 
newfile2 = sock_alloc_file(sock2, flags, NULL);
if (IS_ERR(newfile2)) {
err = PTR_ERR(newfile2);
-   goto out_fput_1;
+   sock_release(sock2);
+   fput(newfile1);
+   goto out;
}
 
-   err = put_user(fd1, &usockvec[0]);
-   if (err)
-   goto out_fput_both;
-
-   err = put_user(fd2, &usockvec[1]);
-   if (err)
-   goto out_fput_both;
-
audit_fd_pair(fd1, fd2);
 
fd_install(fd1, newfile1);
fd_install(fd2, newfile2);
-   /* fd1 and fd2 may be already another descriptors.
-* Not kernel problem.
-*/
-
return 0;
 
-out_fput_both:
-   fput(newfile2);
-   fput(newfile1);
-   put_unused_fd(fd2);
-   put_unused_fd(fd1);
-   goto out;
-
-out_fput_1:
-   fput(newfile1);
-   put_unused_fd(fd2);
-   put_unused_fd(fd1);
-   sock_release(sock2);
-   goto out;
-
-out_put_unused_both:
+out:
put_unused_fd(fd2);
-out_put_unused_1:
put_unused_fd(fd1);
-out_release_both:
-   sock_release(sock2);
-out_release_1:
-   sock_release(sock1);
-out:
return err;
 }
 
-- 
2.11.0

[PATCH net-next 2/2] samples/bpf: add ip6erspan sample code

2017-12-05 Thread William Tu

Extend the existing tests for ip6erspan.

Signed-off-by: William Tu 
---
 samples/bpf/tcbpf2_kern.c  | 58 ++
 samples/bpf/test_tunnel_bpf.sh | 37 +++
 2 files changed, 95 insertions(+)

diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index 15a469220e19..79ad061079dd 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -181,6 +181,64 @@ int _erspan_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
 }
 
+SEC("ip4ip6erspan_set_tunnel")
+int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
+{
+   struct bpf_tunnel_key key;
+   struct erspan_metadata md;
+   int ret;
+
+   __builtin_memset(&key, 0x0, sizeof(key));
+   key.remote_ipv6[3] = _htonl(0x11);
+   key.tunnel_id = 2;
+   key.tunnel_tos = 0;
+   key.tunnel_ttl = 64;
+
+   ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+BPF_F_TUNINFO_IPV6);
+   if (ret < 0) {
+   ERROR(ret);
+   return TC_ACT_SHOT;
+   }
+
+   md.index = htonl(123);
+   ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+   if (ret < 0) {
+   ERROR(ret);
+   return TC_ACT_SHOT;
+   }
+
+   return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_get_tunnel")
+int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
+{
+   char fmt[] = "key %d remote ip6 ::%x erspan index 0x%x\n";
+   struct bpf_tunnel_key key;
+   struct erspan_metadata md;
+   u32 index;
+   int ret;
+
+   ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 
BPF_F_TUNINFO_IPV6);
+   if (ret < 0) {
+   ERROR(ret);
+   return TC_ACT_SHOT;
+   }
+
+   ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+   if (ret < 0) {
+   ERROR(ret);
+   return TC_ACT_SHOT;
+   }
+
+   index = bpf_ntohl(md.index);
+   bpf_trace_printk(fmt, sizeof(fmt),
+   key.tunnel_id, key.remote_ipv6[0], index);
+
+   return TC_ACT_OK;
+}
+
 SEC("vxlan_set_tunnel")
 int _vxlan_set_tunnel(struct __sk_buff *skb)
 {
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
index 226f45381b76..f53efb62f699 100755
--- a/samples/bpf/test_tunnel_bpf.sh
+++ b/samples/bpf/test_tunnel_bpf.sh
@@ -70,6 +70,28 @@ function add_erspan_tunnel {
ip addr add dev $DEV 10.1.1.200/24
 }
 
+function add_ip6erspan_tunnel {
+
+   # assign ipv6 address
+   ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+   ip netns exec at_ns0 ip link set dev veth0 up
+   ip addr add dev veth1 ::22/96
+   ip link set dev veth1 up
+
+   # in namespace
+   ip netns exec at_ns0 \
+   ip link add dev $DEV_NS type $TYPE seq key 2 erspan 123 \
+   local ::11 remote ::22
+
+   ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+   ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+   # out of namespace
+   ip link add dev $DEV type $TYPE external
+   ip addr add dev $DEV 10.1.1.200/24
+   ip link set dev $DEV up
+}
+
 function add_vxlan_tunnel {
# Set static ARP entry here because iptables set-mark works
# on L3 packet, as a result not applying to ARP packets,
@@ -184,6 +206,18 @@ function test_erspan {
cleanup
 }
 
+function test_ip6erspan {
+   TYPE=ip6erspan
+   DEV_NS=ip6erspan00
+   DEV=ip6erspan11
+   config_device
+   add_ip6erspan_tunnel
+   attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
+   ping6 -c 3 ::11
+   ip netns exec at_ns0 ping -c 1 10.1.1.200
+   cleanup
+}
+
 function test_vxlan {
TYPE=vxlan
DEV_NS=vxlan00
@@ -239,6 +273,7 @@ function cleanup {
ip link del vxlan11
ip link del geneve11
ip link del erspan11
+   ip link del ip6erspan11
pkill tcpdump
pkill cat
set -ex
@@ -254,6 +289,8 @@ echo "Testing IP6GRETAP tunnel..."
 test_ip6gretap
 echo "Testing ERSPAN tunnel..."
 test_erspan
+echo "Testing IP6ERSPAN tunnel..."
+test_ip6erspan
 echo "Testing VXLAN tunnel..."
 test_vxlan
 echo "Testing GENEVE tunnel..."
-- 
2.7.4

[PATCH net-next 1/2] ip6_gre: add ip6 erspan collect_md mode

2017-12-05 Thread William Tu

Similar to ip6 gretap and ip4 gretap, the patch allows
erspan tunnel to operate in collect metadata mode.
bpf_skb_[gs]et_tunnel_key() helpers can make use of
it right away.

Signed-off-by: William Tu 
---
 net/ipv6/ip6_gre.c | 110 +
 1 file changed, 85 insertions(+), 25 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 1510ce9a4e4e..4562579797d1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -524,8 +524,37 @@ static int ip6erspan_rcv(struct sk_buff *skb, int 
gre_hdr_len,
   false, false) < 0)
return PACKET_REJECT;
 
-   tunnel->parms.index = ntohl(index);
-   ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+   if (tunnel->parms.collect_md) {
+   struct metadata_dst *tun_dst;
+   struct ip_tunnel_info *info;
+   struct erspan_metadata *md;
+   __be64 tun_id;
+   __be16 flags;
+
+   tpi->flags |= TUNNEL_KEY;
+   flags = tpi->flags;
+   tun_id = key32_to_tunnel_id(tpi->key);
+
+   tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
+ sizeof(*md));
+   if (!tun_dst)
+   return PACKET_REJECT;
+
+   info = &tun_dst->u.tun_info;
+   md = ip_tunnel_info_opts(info);
+   if (!md)
+   return PACKET_REJECT;
+
+   md->index = index;
+   info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+   info->options_len = sizeof(*md);
+
+   ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+
+   } else {
+   tunnel->parms.index = ntohl(index);
+   ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+   }
 
return PACKET_RCVD;
}
@@ -857,42 +886,73 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff 
*skb,
if (gre_handle_offloads(skb, false))
goto tx_err;
 
-   switch (skb->protocol) {
-   case htons(ETH_P_IP):
-   memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-   prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
-&dsfield, &encap_limit);
-   break;
-   case htons(ETH_P_IPV6):
-   if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
-   goto tx_err;
-   if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
-&dsfield, &encap_limit))
-   goto tx_err;
-   break;
-   default:
-   memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-   break;
-   }
-
if (skb->len > dev->mtu + dev->hard_header_len) {
pskb_trim(skb, dev->mtu + dev->hard_header_len);
truncate = true;
}
 
-   erspan_build_header(skb, t->parms.o_key, t->parms.index,
-   truncate, false);
t->parms.o_flags &= ~TUNNEL_KEY;
-
IPCB(skb)->flags = 0;
-   fl6.daddr = t->parms.raddr;
+
+   /* For collect_md mode, derive fl6 from the tunnel key,
+* for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
+*/
+   if (t->parms.collect_md) {
+   struct ip_tunnel_info *tun_info;
+   const struct ip_tunnel_key *key;
+   struct erspan_metadata *md;
+
+   tun_info = skb_tunnel_info(skb);
+   if (unlikely(!tun_info ||
+!(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ip_tunnel_info_af(tun_info) != AF_INET6))
+   return -EINVAL;
+
+   key = &tun_info->key;
+   memset(&fl6, 0, sizeof(fl6));
+   fl6.flowi6_proto = IPPROTO_GRE;
+   fl6.daddr = key->u.ipv6.dst;
+   fl6.flowlabel = key->label;
+   fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+   dsfield = key->tos;
+   md = ip_tunnel_info_opts(tun_info);
+   if (!md)
+   goto tx_err;
+
+   erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+   ntohl(md->index), truncate, false);
+
+   } else {
+   switch (skb->protocol) {
+   case htons(ETH_P_IP):
+   memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+   prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+&dsfield, &encap_limit);
+   break;
+   case htons(ETH_P_IPV6):
+

[PATCH net-next 0/2] add ip6erspan collect_md mode

2017-12-05 Thread William Tu

Similar to erspan collect_md mode in ipv4, the first patch adds
support for ip6erspan collect metadata mode.  The second patch
adds the test case using bpf_skb_[gs]et_tunnel_key helpers.

The corresponding iproute2 patch:
https://marc.info/?l=linux-netdev&m=151251545410047&w=2

William Tu (2):
  ip6_gre: add ip6 erspan collect_md mode
  samples/bpf: add ip6erspan sample code

 net/ipv6/ip6_gre.c | 110 +++--
 samples/bpf/tcbpf2_kern.c  |  58 ++
 samples/bpf/test_tunnel_bpf.sh |  37 ++
 3 files changed, 180 insertions(+), 25 deletions(-)

-- 
2.7.4

[PATCH v2 iproute2 net-next] gre6: add collect metadata support

2017-12-05 Thread William Tu

The patch adds 'external' option to support collect metadata
gre6 tunnel. Example of L3 and L2 gre device:
bash:~# ip link add dev ip6gre123 type ip6gre external
bash:~# ip link add dev ip6gretap123 type ip6gretap external

Signed-off-by: William Tu 
---
change in v2:
  - remove "noexternal" in man page
---
 ip/link_gre6.c| 55 ---
 man/man8/ip-link.8.in |  6 ++
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/ip/link_gre6.c b/ip/link_gre6.c
index 0a82eaecf2cd..2cb46ca116d0 100644
--- a/ip/link_gre6.c
+++ b/ip/link_gre6.c
@@ -105,6 +105,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, 
char **argv,
__u16 encapflags = TUNNEL_ENCAP_FLAG_CSUM6;
__u16 encapsport = 0;
__u16 encapdport = 0;
+   __u8 metadata = 0;
int len;
__u32 fwmark = 0;
__u32 erspan_idx = 0;
@@ -178,6 +179,9 @@ get_failed:
if (greinfo[IFLA_GRE_ENCAP_SPORT])
encapsport = 
rta_getattr_u16(greinfo[IFLA_GRE_ENCAP_SPORT]);
 
+   if (greinfo[IFLA_GRE_COLLECT_METADATA])
+   metadata = 1;
+
if (greinfo[IFLA_GRE_ENCAP_DPORT])
encapdport = 
rta_getattr_u16(greinfo[IFLA_GRE_ENCAP_DPORT]);
 
@@ -355,6 +359,8 @@ get_failed:
encapflags |= TUNNEL_ENCAP_FLAG_REMCSUM;
} else if (strcmp(*argv, "noencap-remcsum") == 0) {
encapflags &= ~TUNNEL_ENCAP_FLAG_REMCSUM;
+   } else if (strcmp(*argv, "external") == 0) {
+   metadata = 1;
} else if (strcmp(*argv, "fwmark") == 0) {
NEXT_ARG();
if (strcmp(*argv, "inherit") == 0) {
@@ -388,26 +394,30 @@ get_failed:
argc--; argv++;
}
 
-   addattr32(n, 1024, IFLA_GRE_IKEY, ikey);
-   addattr32(n, 1024, IFLA_GRE_OKEY, okey);
-   addattr_l(n, 1024, IFLA_GRE_IFLAGS, &iflags, 2);
-   addattr_l(n, 1024, IFLA_GRE_OFLAGS, &oflags, 2);
-   addattr_l(n, 1024, IFLA_GRE_LOCAL, &laddr, sizeof(laddr));
-   addattr_l(n, 1024, IFLA_GRE_REMOTE, &raddr, sizeof(raddr));
-   if (link)
-   addattr32(n, 1024, IFLA_GRE_LINK, link);
-   addattr_l(n, 1024, IFLA_GRE_TTL, &hop_limit, 1);
-   addattr_l(n, 1024, IFLA_GRE_ENCAP_LIMIT, &encap_limit, 1);
-   addattr_l(n, 1024, IFLA_GRE_FLOWINFO, &flowinfo, 4);
-   addattr32(n, 1024, IFLA_GRE_FLAGS, flags);
-   addattr32(n, 1024, IFLA_GRE_FWMARK, fwmark);
-   if (erspan_idx != 0)
-   addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx);
-
-   addattr16(n, 1024, IFLA_GRE_ENCAP_TYPE, encaptype);
-   addattr16(n, 1024, IFLA_GRE_ENCAP_FLAGS, encapflags);
-   addattr16(n, 1024, IFLA_GRE_ENCAP_SPORT, htons(encapsport));
-   addattr16(n, 1024, IFLA_GRE_ENCAP_DPORT, htons(encapdport));
+   if (!metadata) {
+   addattr32(n, 1024, IFLA_GRE_IKEY, ikey);
+   addattr32(n, 1024, IFLA_GRE_OKEY, okey);
+   addattr_l(n, 1024, IFLA_GRE_IFLAGS, &iflags, 2);
+   addattr_l(n, 1024, IFLA_GRE_OFLAGS, &oflags, 2);
+   addattr_l(n, 1024, IFLA_GRE_LOCAL, &laddr, sizeof(laddr));
+   addattr_l(n, 1024, IFLA_GRE_REMOTE, &raddr, sizeof(raddr));
+   if (link)
+   addattr32(n, 1024, IFLA_GRE_LINK, link);
+   addattr_l(n, 1024, IFLA_GRE_TTL, &hop_limit, 1);
+   addattr_l(n, 1024, IFLA_GRE_ENCAP_LIMIT, &encap_limit, 1);
+   addattr_l(n, 1024, IFLA_GRE_FLOWINFO, &flowinfo, 4);
+   addattr32(n, 1024, IFLA_GRE_FLAGS, flags);
+   addattr32(n, 1024, IFLA_GRE_FWMARK, fwmark);
+   if (erspan_idx != 0)
+   addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx);
+
+   addattr16(n, 1024, IFLA_GRE_ENCAP_TYPE, encaptype);
+   addattr16(n, 1024, IFLA_GRE_ENCAP_FLAGS, encapflags);
+   addattr16(n, 1024, IFLA_GRE_ENCAP_SPORT, htons(encapsport));
+   addattr16(n, 1024, IFLA_GRE_ENCAP_DPORT, htons(encapdport));
+   } else {
+   addattr_l(n, 1024, IFLA_GRE_COLLECT_METADATA, NULL, 0);
+   }
 
return 0;
 }
@@ -426,6 +436,11 @@ static void gre_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
if (!tb)
return;
 
+   if (tb[IFLA_GRE_COLLECT_METADATA]) {
+   print_bool(PRINT_ANY, "collect_metadata", "external", true);
+   return;
+   }
+
if (tb[IFLA_GRE_FLAGS])
flags = rta_getattr_u32(tb[IFLA_GRE_FLAGS]);
 
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index a6a10e577b1f..eb04f887c940 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -755,6 +755,8 @@ the following additional arguments are supported:
 .BI "dscp inherit"
 ] [
 .BI dev " PHYS_DEV "
+] [
+.RB

Re: [PATCH] dccp: CVE-2017-8824: use-after-free in DCCP code

2017-12-05 Thread David Miller

From: simo.ghan...@gmail.com
Date: Tue,  5 Dec 2017 20:58:35 +

> From: Mohamed Ghannam 
> 
> Whenever the sock object is in DCCP_CLOSED state,
> dccp_disconnect() must free dccps_hc_tx_ccid and
> dccps_hc_rx_ccid and set to NULL.
> 
> Signed-off-by: Mohamed Ghannam 

Applied and queued up for -stable, thanks.

Re: [Patch net-next] net_sched: remove unused parameter from act cleanup ops

2017-12-05 Thread David Miller

From: Cong Wang 
Date: Tue,  5 Dec 2017 12:53:07 -0800

> No one actually uses it.
> 
> Cc: Jiri Pirko 
> Cc: Jamal Hadi Salim 
> Signed-off-by: Cong Wang 

Nice cleanup, applied, thanks Cong.

Re: [PATCH net] net: remove hlist_nulls_add_tail_rcu()

2017-12-05 Thread David Miller

From: Eric Dumazet 
Date: Tue, 05 Dec 2017 12:45:56 -0800

> From: Eric Dumazet 
> 
> Alexander Potapenko reported use of uninitialized memory [1]
> 
> This happens when inserting a request socket into TCP ehash,
> in __sk_nulls_add_node_rcu(), since sk_reuseport is not initialized.
> 
> Bug was added by commit d894ba18d4e4 ("soreuseport: fix ordering for
> mixed v4/v6 sockets")
> 
> Note that d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6
> ordering fix") missed the opportunity to get rid of
> hlist_nulls_add_tail_rcu() :
> 
> Both UDP sockets and TCP/DCCP listeners no longer use
> __sk_nulls_add_node_rcu() for their hash insertion.
> 
> Since all other sockets have unique 4-tuple, the reuseport status
> has no special meaning, so we can always use hlist_nulls_add_head_rcu()
> for them and save few cycles/instructions.
> 
> [1]
 ...
> Fixes: d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets")
> Fixes: d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering 
> fix")
> Signed-off-by: Eric Dumazet 
> Reported-by: Alexander Potapenko 
> Acked-by: Craig Gallek 

I was just talking with Craig and Willem about this change the other
day, what a coincidence :-)

Applied and queued up for -stable, thanks Eric.

Re: [PATCH net 0/2] net: qualcomm: rmnet: Fix leaks in failure scenarios

2017-12-05 Thread David Miller

From: Subash Abhinov Kasiviswanathan 
Date: Tue,  5 Dec 2017 13:41:16 -0700

> Patch 1 fixes a leak in transmit path where a skb cannot be
> transmitted due to insufficient headroom to stamp the map header.
> Patch 2 fixes a leak in rmnet_newlink() failure because the
> rmnet endpoint was never freed

Series applied, thank you.

Re: [PATCH v2 net-next] netlink: optimize err assignment

2017-12-05 Thread Stephen Hemminger

On Sun, 03 Dec 2017 07:20:09 -0800
Eric Dumazet  wrote:

> On Sun, 2017-12-03 at 21:10 +0800, yuan linyu wrote:
> > From: yuan linyu 
> > 
> > Signed-off-by: yuan linyu 
> > ---
> > v2: fix kbuild test warning
> > ---
> >  net/netlink/af_netlink.c | 52 
> > 
> >  1 file changed, 22 insertions(+), 30 deletions(-)
> >   
> 
> I see no reason why we should accept this code churn.
> 
> This kind of change makes future fix backports harder.
> 

I'm more worried about the possibility of latent bugs.
Humans aren't perfect at following all code paths.

Re: [PATCH net-next v2 0/5] net: dsa: use per-port upstream port

2017-12-05 Thread David Miller

From: Vivien Didelot 
Date: Tue,  5 Dec 2017 15:34:08 -0500

> An upstream port is a local switch port used to reach a CPU port.
> 
> DSA still considers a unique CPU port in the whole switch fabric and
> thus return a unique upstream port for a given switch. This is wrong in
> a multiple CPU ports environment.
> 
> We are now switching to using the dedicated CPU port assigned to each
> port in order to get rid of the deprecated unique tree CPU port.
> 
> This patchset makes the dsa_upstream_port() helper take a port argument
> and goes one step closer to complete support for multiple CPU ports.
> 
> Changes in v2:
>   - reverse-christmas-tree-fy variables

Series applied, thanks Vivien.

Re: [PATCH 1/2] net: macb: reduce scope of rx_fs_lock-protected regions

2017-12-05 Thread David Miller

From: Julia Cartwright 
Date: Tue, 5 Dec 2017 14:17:11 -0600

> While Julia Lawall's cocci-generated patch fixes the problem, the right
> solution is to obviate the problem altogether.

I already applied Julia's patch.  And I hope that if you generated
this against current net-next you would have seen that.

So you'll need to redo this series and put the GFP_KERNEL back.

Re: [PATCH net] netlink: Relax attr validation for fixed length types

2017-12-05 Thread David Miller

From: David Ahern 
Date: Tue,  5 Dec 2017 12:55:40 -0700

> Commit 28033ae4e0f5 ("net: netlink: Update attr validation to require
> exact length for some types") requires attributes using types NLA_U* and
> NLA_S* to have an exact length. This change is exposing bugs in various
> userspace commands that are sending attributes with an invalid length
> (e.g., attribute has type NLA_U8 and userspace sends NLA_U32). While
> the commands are clearly broken and need to be fixed, users are arguing
> that the sudden change in enforcement is breaking older commands on
> newer kernels for use cases that otherwise "worked".
> 
> Relax the validation to print a warning message similar to what is done
> for messages containing extra bytes after parsing.
> 
> Fixes: 28033ae4e0f5 ("net: netlink: Update attr validation to require exact 
> length for some types")
> Signed-off-by: David Ahern 

Johannes, please review.

> ---
>  lib/nlattr.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/nlattr.c b/lib/nlattr.c
> index 8bf78b4b78f0..6122662906c8 100644
> --- a/lib/nlattr.c
> +++ b/lib/nlattr.c
> @@ -28,8 +28,16 @@ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
>  };
>  
>  static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
> + [NLA_U8]= sizeof(u8),
> + [NLA_U16]   = sizeof(u16),
> + [NLA_U32]   = sizeof(u32),
> + [NLA_U64]   = sizeof(u64),
>   [NLA_MSECS] = sizeof(u64),
>   [NLA_NESTED]= NLA_HDRLEN,
> + [NLA_S8]= sizeof(s8),
> + [NLA_S16]   = sizeof(s16),
> + [NLA_S32]   = sizeof(s32),
> + [NLA_S64]   = sizeof(s64),
>  };
>  
>  static int validate_nla_bitfield32(const struct nlattr *nla,
> @@ -70,10 +78,9 @@ static int validate_nla(const struct nlattr *nla, int 
> maxtype,
>   BUG_ON(pt->type > NLA_TYPE_MAX);
>  
>   /* for data types NLA_U* and NLA_S* require exact length */
> - if (nla_attr_len[pt->type]) {
> - if (attrlen != nla_attr_len[pt->type])
> - return -ERANGE;
> - return 0;
> + if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
> + pr_warn_ratelimited("netlink: '%s': attribute type %d has an 
> invalid length.\n",
> + current->comm, type);
>   }
>  
>   switch (pt->type) {
> -- 
> 2.11.0
>

Re: [PATCH V11 3/5] printk: hash addresses printed with %p

2017-12-05 Thread Geert Uytterhoeven

Hi Tobin,

On Tue, Dec 5, 2017 at 9:44 PM, Tobin C. Harding  wrote:
> On Tue, Dec 05, 2017 at 09:20:57PM +0100, Geert Uytterhoeven wrote:
>> On Wed, Nov 29, 2017 at 3:05 AM, Tobin C. Harding  wrote:
>> > Currently there exist approximately 14 000 places in the kernel where
>> > addresses are being printed using an unadorned %p. This potentially
>> > leaks sensitive information regarding the Kernel layout in memory. Many
>> > of these calls are stale, instead of fixing every call lets hash the
>> > address by default before printing. This will of course break some
>> > users, forcing code printing needed addresses to be updated.
>> >
>> > Code that _really_ needs the address will soon be able to use the new
>> > printk specifier %px to print the address.
>>
>> > --- a/lib/vsprintf.c
>> > +++ b/lib/vsprintf.c
>>
>> > +/* Maps a pointer to a 32 bit unique identifier. */
>> > +static char *ptr_to_id(char *buf, char *end, void *ptr, struct 
>> > printf_spec spec)
>> > +{
>> > +   unsigned long hashval;
>> > +   const int default_width = 2 * sizeof(ptr);
>> > +
>> > +   if (unlikely(!have_filled_random_ptr_key)) {
>> > +   spec.field_width = default_width;
>> > +   /* string length must be less than default_width */
>> > +   return string(buf, end, "(ptrval)", spec);
>> > +   }
>> > +
>> > +#ifdef CONFIG_64BIT
>> > +   hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
>> > +   /*
>> > +* Mask off the first 32 bits, this makes explicit that we have
>> > +* modified the address (and 32 bits is plenty for a unique ID).
>> > +*/
>> > +   hashval = hashval & 0xffffffff;
>> > +#else
>> > +   hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
>> > +#endif
>>
>> Would it make sense to keep the 3 lowest bits of the address?
>>
>> Currently printed pointers no longer have any correlation with the actual
>> alignment in memory of the object, which is a typical cause of a class of 
>> bugs.
>
> We'd have to keep the lowest 4 since we are printing in hex, right? This
> is easy enough to add. I wasn't the architect behind the hashing but I
> can do up a patch and see if anyone who knows crypto objects.

Lowest 3 is good enough for all natural types, up to long long.
We may still receive complaints from people who care about seeing if
a pointer is cacheline-aligned or not. Fixing that may need up to 7 bits, I'm
afraid, which is a bit too much to give up.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

[PATCH] netlink: Add netns check on taps

2017-12-05 Thread Kevin Cernekee

Currently, a nlmon link inside a child namespace can observe systemwide
netlink activity.  Filter the traffic so that in a non-init netns,
nlmon can only sniff netlink messages from its own netns.

Test case:

vpnns -- bash -c "ip link add nlmon0 type nlmon; \
  ip link set nlmon0 up; \
  tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" &
sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \
spi 0x1 mode transport \
auth sha1 0x616263313233 \
enc aes 0x00000000000000000000000000000000
grep abc123 /tmp/nlmon.pcap

Signed-off-by: Kevin Cernekee 
---
 net/netlink/af_netlink.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b9e0ee4..88381a2 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -253,6 +253,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
struct sock *sk = skb->sk;
int ret = -ENOMEM;
 
+   if (!net_eq(dev_net(dev), sock_net(sk)) &&
+   !net_eq(dev_net(dev), &init_net)) {
+   return 0;
+   }
+
dev_hold(dev);
 
if (is_vmalloc_addr(skb->head))
-- 
2.7.4

Re: [RFC] virtio-net: help live migrate SR-IOV devices

2017-12-05 Thread Stephen Hemminger

On Tue, 5 Dec 2017 14:29:28 -0800
Jakub Kicinski  wrote:

> On Tue, 5 Dec 2017 11:59:17 +0200, achiad shochat wrote:
> >  I second Jacob - having a netdev of one device driver enslave a netdev
> >  of another device driver is an awkward a-symmetric model.
> >  Regardless of whether they share the same backend device.
> >  Only I am not sure the Linux Bond is the right choice.
> >  e.g one may well want to use the virtio device also when the
> >  pass-through device is available, e.g for multicasts, east-west
> >  traffic, etc.
> >  I'm not sure the Linux Bond fits that functionality.
> >  And, as I hear in this thread, it is hard to make it work out of the 
> >  box.
> >  So I think the right thing would be to write a new dedicated module
> >  for this purpose.
> > >
> > > This part I can sort of agree with. What if we were to look at
> > > providing a way to somehow advertise that the two devices were meant
> > > to be bonded for virtualization purposes? For now let's call it a
> > > "virt-bond". Basically we could look at providing a means for virtio
> > > and VF drivers to advertise that they want this sort of bond. Then it
> > > would just be a matter of providing some sort of side channel to
> > > indicate where you want things like multicast/broadcast/east-west
> > > traffic to go.  
> > 
> > I like this approach.  
> 
> +1 on a separate driver, just enslaving devices to virtio may break
> existing setups.  If people are bonding from user space today, if they
> update their kernel it may surprise them how things get auto-mangled.
> 
> Is what Alex is suggesting a separate PV device that says "I would
> like to be a bond of those two interfaces"?  That would make the HV
> intent explicit and kernel decisions more understandable.

So far, in my experience it still works.
As long as the kernel slaving happens first, it will work.
The attempt to bond an already slaved device will fail and no scripts seem
to check the error return.

[PATCHv3 0/2] capability controlled user-namespaces

2017-12-05 Thread Mahesh Bandewar

From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage into. e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take advantage of the above-mentioned
or similar issues, discovered or undiscovered, with malicious intent. Note
that this is just an example and the problem/solution is not limited
to the NET_RAW capability *only*.

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by the
sysctl which is available to a (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of the whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  7 ++
 include/linux/user_namespace.h  | 25 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 122 insertions(+)

-- 
2.15.0.531.g2ccb3012c9-goog

[PATCHv3 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-12-05 Thread Mahesh Bandewar

From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v3:
  Added couple of comments as requested by Serge Hallyn
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap_to_u32arr

Re: [PATCH v2 3/3] ethtool: Add ETHTOOL_RESET support via --reset command

2017-12-05 Thread Scott Branden


Hi Andrew,


On 17-12-05 02:26 PM, Andrew Lunn wrote:

On Tue, Dec 05, 2017 at 12:53:23PM -0800, Scott Branden wrote:

Add ETHTOOL_RESET support via --reset command.

ie.  ethtool --reset DEVNAME 

flagnames currently match the ETH_RESET_xxx names:
mgmt,irq,dma,filter,offload,mac,phy,ram,dedicated,all

Yes, I missed adding ap to the commit message here.

[Snip]


+.B ethtool \-\-reset
+.I devname
+.BN flags
+.RB [ mgmt ]
+.RB [ irq ]
+.RB [ dma ]
+.RB [ filter ]
+.RB [ offload ]
+.RB [ mac ]
+.RB [ phy ]
+.RB [ ram ]
+.RB [ ap ]
+.RB [ dedicated ]
+.RB [ all ]

Hi Scott

Just a nit pick. You don't list ap above, which is kind of why you
are doing this, if i remember correctly.
Yes - I added ap to v2 of this patch but didn't add it to the commit 
message. Will update in v3.


 Andrew

Re: [RfC net-next 0/3] RTL8211F Ethernet PHY "documentation"

2017-12-05 Thread Andrew Lunn

> I do not expect that this series is applied. if someone is interested
> in testing this: it applies on top of my other series:
> "Realtek Ethernet PHY driver improvements" [1]

Hi Martin

Thanks for the patches. Documentation like this is often useful.

   Andrew

[PATCHv3 2/2] userns: control capabilities of some user namespaces

2017-12-05 Thread Mahesh Bandewar

From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v3:
  Rebase
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  4 
 include/linux/user_namespace.h | 25 +
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 46 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..383f31f066f0 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, 
struct user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+/* Controlled capability is capability that is missing from the capability-mask
+ * controlled_userns_caps_whitelist controlled via sysctl.
+ */
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..a5c48684b317 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+/* Controlled user-ns is the one that is created by a process that does not
+ * have CAP_SYS_ADMIN (or descended from such an user-ns).
+ * For more details please see the sysctl description of
+ * controlled_userns_caps_whitelist.
+ */
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..ca0556d466b6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -141,6 +141,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index 4f8e09340956..5454e9c03ee8 100644
--- a/security/com

Re: [RFC] virtio-net: help live migrate SR-IOV devices

2017-12-05 Thread Jakub Kicinski

On Tue, 5 Dec 2017 11:59:17 +0200, achiad shochat wrote:
>  I second Jacob - having a netdev of one device driver enslave a netdev
>  of another device driver is an awkward asymmetric model.
>  Regardless of whether they share the same backend device.
>  Only I am not sure the Linux Bond is the right choice.
>  e.g one may well want to use the virtio device also when the
>  pass-through device is available, e.g for multicasts, east-west
>  traffic, etc.
>  I'm not sure the Linux Bond fits that functionality.
>  And, as I hear in this thread, it is hard to make it work out of the box.
>  So I think the right thing would be to write a new dedicated module
>  for this purpose.  
> >
> > This part I can sort of agree with. What if we were to look at
> > providing a way to somehow advertise that the two devices were meant
> > to be bonded for virtualization purposes? For now let's call it a
> > "virt-bond". Basically we could look at providing a means for virtio
> > and VF drivers to advertise that they want this sort of bond. Then it
> > would just be a matter of providing some sort of side channel to
> > indicate where you want things like multicast/broadcast/east-west
> > traffic to go.
> 
> I like this approach.

+1 on a separate driver, just enslaving devices to virtio may break
existing setups.  If people are bonding from user space today, if they
update their kernel it may surprise them how things get auto-mangled.

Is what Alex is suggesting a separate PV device that says "I would
like to be a bond of those two interfaces"?  That would make the HV
intent explicit and kernel decisions more understandable.

Re: [PATCH v2 3/3] ethtool: Add ETHTOOL_RESET support via --reset command

2017-12-05 Thread Michal Kubecek

On Tue, Dec 05, 2017 at 02:06:09PM -0800, Scott Branden wrote:
> On 17-12-05 01:30 PM, Michal Kubecek wrote:
> > On Tue, Dec 05, 2017 at 12:53:23PM -0800, Scott Branden wrote:
> > > Add ETHTOOL_RESET support via --reset command.
> > > 
> > > ie.  ethtool --reset DEVNAME 
> > > 
> > > flagnames currently match the ETH_RESET_xxx names:
> > > mgmt,irq,dma,filter,offload,mac,phy,ram,dedicated,all
> > > 
> > > Alternatively, you can specify the component bitfield directly using
> > > ethtool --reset DEVNAME flags %x
> > IMHO it would be more consistent with e.g. msglvl without the keyword
> > "flags".
> I don't see the consistency in ethtool of specifying a number without a
> keyword in front of it.
> I can only find --set-dump specify a number?
> Others have keyword and number.  msglvl is the keyword after specifying -s -
> same as flags is the keyword I use after specifying --reset.

What I meant is that you can write

ethtool -s eth0 msglvl drv on probe off
ethtool -s eth0 msglvl 0x7

i.e. either number or names (with on/off in this case) while your patch
has

ethtool --reset eth0 mgmt,irq
ethtool --reset eth0 flags 0x3

i.e. an extra keyword if a number is used.

But it's not really important, it doesn't seem I would be able to share
a parser for this with any other subcommand or parameter anyway.

> >   It would be also nice to provide a symbolic way to specify the
> > shared flags.
> 
> I'll change to allow -shared to be added to the end of each component
> specified to use the shared bit.
>  IE. mgmt-shared, irq-shared, dma-shared ?

Sounds good to me.

> > > + resetinfo.cmd = ETHTOOL_RESET;
> > > +
> > > + if (send_ioctl(ctx, &resetinfo)) {
> > > + perror("Cannot issue RESET");
> > > + return 1;
> > > + }
> > > + fprintf(stdout, "RESET 0x%x issued\n", resetinfo.data);
> > 
> > According to documentation, driver is supposed to clear the flags
> > corresponding to components which were reset so that what is left are
> > those which were _not_ reset.
> 
> I'll move the print above the send_ioctl.

It might be even more useful if ethtool informed user what actually
happened, i.e. either change the message to say these are bits for
components not reset (if resetinfo.data is not zero) or save the
original value of resetinfo.data and show  saved_data & ~resetinfo.data

Michal Kubecek

Re: [PATCH v2 3/3] ethtool: Add ETHTOOL_RESET support via --reset command

2017-12-05 Thread Andrew Lunn

On Tue, Dec 05, 2017 at 12:53:23PM -0800, Scott Branden wrote:
> Add ETHTOOL_RESET support via --reset command.
> 
> ie.  ethtool --reset DEVNAME 
> 
> flagnames currently match the ETH_RESET_xxx names:
> mgmt,irq,dma,filter,offload,mac,phy,ram,dedicated,all

[Snip]

> +.B ethtool \-\-reset
> +.I devname
> +.BN flags
> +.RB [ mgmt ]
> +.RB [ irq ]
> +.RB [ dma ]
> +.RB [ filter ]
> +.RB [ offload ]
> +.RB [ mac ]
> +.RB [ phy ]
> +.RB [ ram ]
> +.RB [ ap ]
> +.RB [ dedicated ]
> +.RB [ all ]

Hi Scott

Just a nit pick. You don't list ap above, which is kind of why you
are doing this, if I remember correctly.

Andrew

Re: [PATCH net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-12-05 Thread Pavel Machek

On Tue 2017-12-05 22:16:45, tristram...@microchip.com wrote:
> > Thanks for patches. I installed whole series on top of net-next.
> > 
> > Hardware is:
> > 
> > root@miro:~# cat /proc/cpuinfo
> > model name   : ARM926EJ-S rev 5 (v5l)
> > Hardware  : Freescale MXS (Device Tree)
> > 
> > I added devicetree chunks, and enabled DSA in the config. It seems
> > switch is detected:
> > 
> > [4.775934] Micrel KSZ8051 dsa-0.0:00: attached PHY driver [Micrel
> > KSZ8051] (mii_bus:phy_addr=dsa-0.0:00, irq=POLL)
> > [4.885952] Micrel KSZ8051 dsa-0.0:01: attached PHY driver [Micrel
> > KSZ8051] (mii_bus:phy_addr=dsa-0.0:01, irq=POLL)
> > [4.995934] Micrel KSZ8051 dsa-0.0:02: attached PHY driver [Micrel
> > KSZ8051] (mii_bus:phy_addr=dsa-0.0:02, irq=POLL)
> > [5.011484] DSA: tree 0 setup
> > 
> > root@miro:~# ifconfig lan3 192.168.20.103 netmask 255.255.0.0 up
> > [  131.196667] IPv6: ADDRCONF(NETDEV_UP): lan3: link is not ready
> > root@miro:~# [  132.225863] ksz8895-switch spi2.0 lan3: Link is Up -
> > 100Mbps/Full - flow control rx/tx
> > [  132.233939] IPv6: ADDRCONF(NETDEV_CHANGE): lan3: link becomes ready
> > 
> > root@miro:~# ping 192.168.1.1
> > PING 192.168.1.1 (192.168.1.1): 56 data bytes
> > ^C
> > --- 192.168.1.1 ping statistics ---
> > 7 packets transmitted, 0 packets received, 100% packet loss
> > root@miro:~# ifconfig [  149.904234] random: crng init done
> > 
> > But packets do not go through, and there is nothing helpful in
> > dmesg. Dts part is:
> > 
> > spi@0 {
> > compatible = "microchip,ksz8895";
> > spi-max-frequency = <2500>;
> > reg = <0>;
> > // reset-gpios = <&gpio2 8 0>;
> > status = "okay";
> > 
> > spi-cpha;
> > spi-cpol;
> >ports {
> >  #address-cells = <1>;
> >  #size-cells = <0>;
> >  port@0 {
> > reg = <0>;
> > label = "lan1";
> >  };
> >  port@1 {
> > reg = <1>;
> > label = "lan2";
> >  };
> >  port@2 {
> > reg = <2>;
> > label = "lan3";
> >  };
> >  port@4 {
> > reg = <4>;
> > label = "cpu";
> > ethernet = <&mac0>;
> > fixed-link {
> >speed = <100>;
> >full-duplex;
> > };
> >  };
> >};
> > };
> > 
> > I went back to my version of dsa patches, and test above works as
> > expected.
> 
> Sorry to be this late for the reply.  I finally got hold of a KSZ8895 board 
> that
> works with my SoC board to confirm the network communication.
> 
> As expected the KSZ8895 board works correctly as the chip uses the same
> tail tagging feature in KSZ8795, and I did verify that board is working.
> 
> One thing to debug this problem is to dump the MIB counters.  Use the ethtool
> utility to show MIB counters of both ports:
> 
> ethtool -S lan3
> ethtool -S eth0
> 
> Assuming eth0 is the MAC controller that drives the switch, the receive 
> counters of
> the host port of the switch should match the transmit counters of
> lan3, and vice versa.

Thanks for reply. I'll get to the tests shortly. Could I get .dts
snippet that works for you and commands you are using for testing?

Thanks,
Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature

RE: [PATCH net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-12-05 Thread Tristram.Ha

> Thanks for patches. I installed whole series on top of net-next.
> 
> Hardware is:
> 
> root@miro:~# cat /proc/cpuinfo
> model name   : ARM926EJ-S rev 5 (v5l)
> Hardware: Freescale MXS (Device Tree)
> 
> I added devicetree chunks, and enabled DSA in the config. It seems
> switch is detected:
> 
> [4.775934] Micrel KSZ8051 dsa-0.0:00: attached PHY driver [Micrel
> KSZ8051] (mii_bus:phy_addr=dsa-0.0:00, irq=POLL)
> [4.885952] Micrel KSZ8051 dsa-0.0:01: attached PHY driver [Micrel
> KSZ8051] (mii_bus:phy_addr=dsa-0.0:01, irq=POLL)
> [4.995934] Micrel KSZ8051 dsa-0.0:02: attached PHY driver [Micrel
> KSZ8051] (mii_bus:phy_addr=dsa-0.0:02, irq=POLL)
> [5.011484] DSA: tree 0 setup
> 
> root@miro:~# ifconfig lan3 192.168.20.103 netmask 255.255.0.0 up
> [  131.196667] IPv6: ADDRCONF(NETDEV_UP): lan3: link is not ready
> root@miro:~# [  132.225863] ksz8895-switch spi2.0 lan3: Link is Up -
> 100Mbps/Full - flow control rx/tx
> [  132.233939] IPv6: ADDRCONF(NETDEV_CHANGE): lan3: link becomes ready
> 
> root@miro:~# ping 192.168.1.1
> PING 192.168.1.1 (192.168.1.1): 56 data bytes
> ^C
> --- 192.168.1.1 ping statistics ---
> 7 packets transmitted, 0 packets received, 100% packet loss
> root@miro:~# ifconfig [  149.904234] random: crng init done
> 
> But packets do not go through, and there is nothing helpful in
> dmesg. Dts part is:
> 
> spi@0 {
>   compatible = "microchip,ksz8895";
> spi-max-frequency = <2500>;
> reg = <0>;
>   // reset-gpios = <&gpio2 8 0>;
> status = "okay";
> 
> spi-cpha;
>   spi-cpol;
>ports {
>  #address-cells = <1>;
>  #size-cells = <0>;
>  port@0 {
> reg = <0>;
> label = "lan1";
>  };
>  port@1 {
> reg = <1>;
> label = "lan2";
>  };
>  port@2 {
> reg = <2>;
> label = "lan3";
>  };
>  port@4 {
> reg = <4>;
> label = "cpu";
> ethernet = <&mac0>;
> fixed-link {
>speed = <100>;
>full-duplex;
> };
>  };
>};
>   };
> 
> I went back to my version of dsa patches, and test above works as
> expected.

Sorry to be this late for the reply.  I finally got hold of a KSZ8895 board that
works with my SoC board to confirm the network communication.

As expected the KSZ8895 board works correctly as the chip uses the same
tail tagging feature in KSZ8795, and I did verify that board is working.

One thing to debug this problem is to dump the MIB counters.  Use the ethtool
utility to show MIB counters of both ports:

ethtool -S lan3
ethtool -S eth0

Assuming eth0 is the MAC controller that drives the switch, the receive 
counters of
the host port of the switch should match the transmit counters of lan3, and 
vice versa.

1 2 3 4 >

1 - 100 of 339 matches

Mail list logo