Re: [PATCH iproute2 1/1] tc: use get_u32() in psample action to match types

2018-03-13 Thread yotam gigi
On Tue, Mar 13, 2018 at 11:16 PM, Roman Mashak  wrote:

Makes sense :)

Acked-by: Yotam Gigi 

> Signed-off-by: Roman Mashak 
> ---
>  tc/m_sample.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/tc/m_sample.c b/tc/m_sample.c
> index ff5ee6bd1ef6..dff986f5 100644
> --- a/tc/m_sample.c
> +++ b/tc/m_sample.c
> @@ -65,7 +65,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
> char ***argv_p,
> while (argc > 0) {
> if (matches(*argv, "rate") == 0) {
> NEXT_ARG();
> -   if (get_unsigned(, *argv, 10) != 0) {
> +   if (get_u32(, *argv, 10) != 0) {
> fprintf(stderr, "Illegal rate %s\n", *argv);
> usage();
> return -1;
> @@ -73,7 +73,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
> char ***argv_p,
> rate_set = true;
> } else if (matches(*argv, "group") == 0) {
> NEXT_ARG();
> -   if (get_unsigned(, *argv, 10) != 0) {
> +   if (get_u32(, *argv, 10) != 0) {
> fprintf(stderr, "Illegal group num %s\n",
> *argv);
> usage();
> @@ -82,7 +82,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
> char ***argv_p,
> group_set = true;
> } else if (matches(*argv, "trunc") == 0) {
> NEXT_ARG();
> -   if (get_unsigned(, *argv, 10) != 0) {
> +   if (get_u32(, *argv, 10) != 0) {
> fprintf(stderr, "Illegal truncation size 
> %s\n",
> *argv);
> usage();
> --
> 2.7.4
>


Re: [PATCH 7/7] ixgbevf: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Timur Tabi

On 3/13/18 10:20 PM, Sinan Kaya wrote:

+/* Assumes caller has executed a write barrier to order memory and device
+ * requests.
+ */
  static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value)
  {
-   writel(value, ring->tail);
+   writel_relaxed(value, ring->tail);
  }


Why not put the wmb() in this function, or just get rid of the wmb() in 
the rest of the file and keep this as writel?  That way, you can avoid 
the comment and the risk that comes with it.


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


[PATCH net-next] liquidio: Add support for liquidio 10GBase-T NIC

2018-03-13 Thread Felix Manlunas
From: Veerasenareddy Burru 

Added ethtool changes to show port type as TP (Twisted Pair) for
10GBASE-T ports. Same driver and firmware works for liquidio NIC with
SFP+ ports or TP ports.

Signed-off-by: Veerasenareddy Burru 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 24 --
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 12 +--
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c 
b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index a63ddf0..550ac29 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -232,10 +232,16 @@ static int lio_get_link_ksettings(struct net_device 
*netdev,
 
linfo = >linfo;
 
-   if (linfo->link.s.if_mode == INTERFACE_MODE_XAUI ||
-   linfo->link.s.if_mode == INTERFACE_MODE_RXAUI ||
-   linfo->link.s.if_mode == INTERFACE_MODE_XLAUI ||
-   linfo->link.s.if_mode == INTERFACE_MODE_XFI) {
+   switch (linfo->link.s.phy_type) {
+   case LIO_PHY_PORT_TP:
+   ecmd->base.port = PORT_TP;
+   supported = (SUPPORTED_1baseT_Full |
+SUPPORTED_TP | SUPPORTED_Pause);
+   advertising = (ADVERTISED_1baseT_Full | ADVERTISED_Pause);
+   ecmd->base.autoneg = AUTONEG_DISABLE;
+   break;
+
+   case LIO_PHY_PORT_FIBRE:
ecmd->base.port = PORT_FIBRE;
 
if (linfo->link.s.speed == SPEED_1) {
@@ -245,12 +251,18 @@ static int lio_get_link_ksettings(struct net_device 
*netdev,
 
supported |= SUPPORTED_FIBRE | SUPPORTED_Pause;
advertising |= ADVERTISED_Pause;
+   ecmd->base.autoneg = AUTONEG_DISABLE;
+   break;
+   }
+
+   if (linfo->link.s.if_mode == INTERFACE_MODE_XAUI ||
+   linfo->link.s.if_mode == INTERFACE_MODE_RXAUI ||
+   linfo->link.s.if_mode == INTERFACE_MODE_XLAUI ||
+   linfo->link.s.if_mode == INTERFACE_MODE_XFI) {
ethtool_convert_legacy_u32_to_link_mode(
ecmd->link_modes.supported, supported);
ethtool_convert_legacy_u32_to_link_mode(
ecmd->link_modes.advertising, advertising);
-   ecmd->base.autoneg = AUTONEG_DISABLE;
-
} else {
dev_err(>pci_dev->dev, "Unknown link interface reported 
%d\n",
linfo->link.s.if_mode);
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 522dcc4..ae8566c 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -675,9 +675,11 @@ struct octeon_instr_rdp {
u64 if_mode:5;
u64 pause:1;
u64 flashing:1;
-   u64 reserved:15;
+   u64 phy_type:5;
+   u64 reserved:10;
 #else
-   u64 reserved:15;
+   u64 reserved:10;
+   u64 phy_type:5;
u64 flashing:1;
u64 pause:1;
u64 if_mode:5;
@@ -690,6 +692,12 @@ struct octeon_instr_rdp {
} s;
 };
 
+enum lio_phy_type {
+   LIO_PHY_PORT_TP = 0x0,
+   LIO_PHY_PORT_FIBRE = 0x1,
+   LIO_PHY_PORT_UNKNOWN,
+};
+
 /** The txpciq info passed to host from the firmware */
 
 union oct_txpciq {
-- 
1.8.3.1



[PATCH v2 2/2] doc: Change the udp/sctp rmem/wmem default value.

2018-03-13 Thread Tonghao Zhang
The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096.

Signed-off-by: Tonghao Zhang 
---
 Documentation/networking/ip-sysctl.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 783675a..1d11207 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -755,13 +755,13 @@ udp_rmem_min - INTEGER
Minimal size of receive buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for receiving data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 udp_wmem_min - INTEGER
Minimal size of send buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for sending data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 CIPSOv4 Variables:
 
@@ -2101,7 +2101,7 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
It is guaranteed to each SCTP socket (but not association) even
under moderate memory pressure.
 
-   Default: 1 page
+   Default: 4K
 
 sctp_wmem  - vector of 3 INTEGERs: min, default, max
Currently this tunable has no effect.
-- 
1.8.3.1



[PATCH v2 1/2] udp: Move the udp sysctl to namespace.

2018-03-13 Thread Tonghao Zhang
This patch moves the udp_rmem_min, udp_wmem_min
to namespace and init the udp_l3mdev_accept explicitly.

The udp_rmem_min/udp_wmem_min affect udp rx/tx queue,
with this patch namespaces can set them differently.

Signed-off-by: Tonghao Zhang 
---
 v2: use a common helper to avoid the code duplication.
---
 include/net/netns/ipv4.h   |  3 ++
 net/ipv4/sysctl_net_ipv4.c | 32 -
 net/ipv4/udp.c | 86 +++---
 net/ipv6/udp.c | 52 ++--
 4 files changed, 96 insertions(+), 77 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3a970e4..382bfd7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -168,6 +168,9 @@ struct netns_ipv4 {
atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp;
 
+   int sysctl_udp_wmem_min;
+   int sysctl_udp_rmem_min;
+
 #ifdef CONFIG_NET_L3_MASTER_DEV
int sysctl_udp_l3mdev_accept;
 #endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 011de9a..5b72d97 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -520,22 +520,6 @@ static int proc_fib_multipath_hash_policy(struct ctl_table 
*table, int write,
.mode   = 0644,
.proc_handler   = proc_doulongvec_minmax,
},
-   {
-   .procname   = "udp_rmem_min",
-   .data   = _udp_rmem_min,
-   .maxlen = sizeof(sysctl_udp_rmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = 
-   },
-   {
-   .procname   = "udp_wmem_min",
-   .data   = _udp_wmem_min,
-   .maxlen = sizeof(sysctl_udp_wmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = 
-   },
{ }
 };
 
@@ -1167,6 +1151,22 @@ static int proc_fib_multipath_hash_policy(struct 
ctl_table *table, int write,
.proc_handler   = proc_dointvec_minmax,
.extra1 = ,
},
+   {
+   .procname   = "udp_rmem_min",
+   .data   = _net.ipv4.sysctl_udp_rmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = 
+   },
+   {
+   .procname   = "udp_wmem_min",
+   .data   = _net.ipv4.sysctl_udp_wmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = 
+   },
{ }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3013404..908fc02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,12 +122,6 @@
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-int sysctl_udp_rmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_rmem_min);
-
-int sysctl_udp_wmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_wmem_min);
-
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
@@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err)
 EXPORT_SYMBOL_GPL(udp_abort);
 
 struct proto udp_prot = {
-   .name  = "UDP",
-   .owner = THIS_MODULE,
-   .close = udp_lib_close,
-   .connect   = ip4_datagram_connect,
-   .disconnect= udp_disconnect,
-   .ioctl = udp_ioctl,
-   .init  = udp_init_sock,
-   .destroy   = udp_destroy_sock,
-   .setsockopt= udp_setsockopt,
-   .getsockopt= udp_getsockopt,
-   .sendmsg   = udp_sendmsg,
-   .recvmsg   = udp_recvmsg,
-   .sendpage  = udp_sendpage,
-   .release_cb= ip4_datagram_release_cb,
-   .hash  = udp_lib_hash,
-   .unhash= udp_lib_unhash,
-   .rehash= udp_v4_rehash,
-   .get_port  = udp_v4_get_port,
-   .memory_allocated  = _memory_allocated,
-   .sysctl_mem= sysctl_udp_mem,
-   .sysctl_wmem   = _udp_wmem_min,
-   .sysctl_rmem   = _udp_rmem_min,
-   .obj_size  = sizeof(struct udp_sock),
-   .h.udp_table   = _table,
+   .name   = "UDP",
+   .owner  = THIS_MODULE,
+   .close  = udp_lib_close,
+   .connect= ip4_datagram_connect,
+   .disconnect = udp_disconnect,
+   .ioctl  = udp_ioctl,
+   .init   = udp_init_sock,
+   .destroy= 

Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-13 Thread Siwei Liu
On Tue, Mar 13, 2018 at 5:28 PM, Samudrala, Sridhar
 wrote:
> On 3/12/2018 3:44 PM, Siwei Liu wrote:
>>
>> Apologies, still some comments going. Please see inline.
>>
>> On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
>>  wrote:
>>>
>>> This patch enables virtio_net to switch over to a VF datapath when a VF
>>> netdev is present with the same MAC address. It allows live migration
>>> of a VM with a direct attached VF without the need to setup a bond/team
>>> between a VF and virtio net device in the guest.
>>>
>>> The hypervisor needs to enable only one datapath at any time so that
>>> packets don't get looped back to the VM over the other datapath. When a
>>> VF
>>> is plugged, the virtio datapath link state can be marked as down. The
>>> hypervisor needs to unplug the VF device from the guest on the source
>>> host
>>> and reset the MAC filter of the VF to initiate failover of datapath to
>>> virtio before starting the migration. After the migration is completed,
>>> the destination hypervisor sets the MAC filter on the VF and plugs it
>>> back
>>> to the guest to switch over to VF datapath.
>>>
>>> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>>> created that acts as a master device and tracks the state of the 2 lower
>>> netdevs. The original virtio_net netdev is marked as 'backup' netdev and
>>> a
>>> passthru device with the same MAC is registered as 'active' netdev.
>>>
>>> This patch is based on the discussion initiated by Jesse on this thread.
>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>>
>>> Signed-off-by: Sridhar Samudrala 
>>> Signed-off-by: Alexander Duyck 
>>> Reviewed-by: Jesse Brandeburg 
>>> ---
>>>   drivers/net/virtio_net.c | 683
>>> ++-
>>>   1 file changed, 682 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index bcd13fe906ca..f2860d86c952 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -30,6 +30,8 @@
>>>   #include 
>>>   #include 
>>>   #include 
>>> +#include 
>>> +#include 
>>>   #include 
>>>   #include 
>>>
>>> @@ -206,6 +208,9 @@ struct virtnet_info {
>>>  u32 speed;
>>>
>>>  unsigned long guest_offloads;
>>> +
>>> +   /* upper netdev created when BACKUP feature enabled */
>>> +   struct net_device *bypass_netdev;
>>>   };
>>>
>>>   struct padded_vnet_hdr {
>>> @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev,
>>> struct netdev_bpf *xdp)
>>>  }
>>>   }
>>>
>>> +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
>>> + size_t len)
>>> +{
>>> +   struct virtnet_info *vi = netdev_priv(dev);
>>> +   int ret;
>>> +
>>> +   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
>>> +   return -EOPNOTSUPP;
>>> +
>>> +   ret = snprintf(buf, len, "_bkup");
>>> +   if (ret >= len)
>>> +   return -EOPNOTSUPP;
>>> +
>>> +   return 0;
>>> +}
>>> +
>>>   static const struct net_device_ops virtnet_netdev = {
>>>  .ndo_open= virtnet_open,
>>>  .ndo_stop= virtnet_close,
>>> @@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev =
>>> {
>>>  .ndo_xdp_xmit   = virtnet_xdp_xmit,
>>>  .ndo_xdp_flush  = virtnet_xdp_flush,
>>>  .ndo_features_check = passthru_features_check,
>>> +   .ndo_get_phys_port_name = virtnet_get_phys_port_name,
>>>   };
>>>
>>>   static void virtnet_config_changed_work(struct work_struct *work)
>>> @@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device
>>> *vdev)
>>>  return 0;
>>>   }
>>>
>>> +/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
>>> + * When BACKUP feature is enabled, an additional netdev(bypass netdev)
>>> + * is created that acts as a master device and tracks the state of the
>>> + * 2 lower netdevs. The original virtio_net netdev is registered as
>>> + * 'backup' netdev and a passthru device with the same MAC is registered
>>> + * as 'active' netdev.
>>> + */
>>> +
>>> +/* bypass state maintained when BACKUP feature is enabled */
>>> +struct virtnet_bypass_info {
>>> +   /* passthru netdev with same MAC */
>>> +   struct net_device __rcu *active_netdev;
>>> +
>>> +   /* virtio_net netdev */
>>> +   struct net_device __rcu *backup_netdev;
>>> +
>>> +   /* active netdev stats */
>>> +   struct rtnl_link_stats64 active_stats;
>>> +
>>> +   /* backup netdev stats */
>>> +   struct rtnl_link_stats64 backup_stats;
>>> +
>>> +   /* aggregated stats */
>>> +   struct rtnl_link_stats64 bypass_stats;
>>> +
>>> +   /* spinlock while updating stats */
>>> +   spinlock_t stats_lock;
>>> +};
>>> +
>>> 

Re: [PATCH 3/7] RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Jason Gunthorpe
On Tue, Mar 13, 2018 at 11:20:24PM -0400, Sinan Kaya wrote:
> Code includes wmb() followed by writel() in multiple places. writel()
> already has a barrier on some architectures like arm64.
> 
> This ends up CPU observing two barriers back to back before executing the
> register write.
> 
> Since code already has an explicit barrier call, changing writel() to
> writel_relaxed().
> 
> Signed-off-by: Sinan Kaya 
>  drivers/infiniband/hw/qedr/verbs.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Sure matches my understanding of writel_relaxed

This is part of a series, should we take just this patch through the
rdma tree? If not:

Acked-by: Jason Gunthorpe 

Thanks,
Jason


Re: [bug, bisected] pfifo_fast causes packet reordering

2018-03-13 Thread John Fastabend
On 03/13/2018 11:35 AM, Dave Taht wrote:
> On Tue, Mar 13, 2018 at 11:24 AM, Jakob Unterwurzacher
>  wrote:
>> During stress-testing our "ucan" USB/CAN adapter SocketCAN driver on Linux
>> v4.16-rc4-383-ged58d66f60b3 we observed that a small fraction of packets are
>> delivered out-of-order.
>>

Is the stress-testing tool available somewhere? What type of packets
are being sent?


>> We have tracked the problem down to the driver interface level, and it seems
>> that the driver's net_device_ops.ndo_start_xmit() function gets the packets
>> handed over in the wrong order.
>>
>> This behavior was not observed on Linux v4.15 and I have bisected the
>> problem down to this patch:
>>
>>> commit c5ad119fb6c09b0297446be05bd66602fa564758
>>> Author: John Fastabend 
>>> Date:   Thu Dec 7 09:58:19 2017 -0800
>>>
>>>net: sched: pfifo_fast use skb_array
>>>
>>>This converts the pfifo_fast qdisc to use the skb_array data structure
>>>and set the lockless qdisc bit. pfifo_fast is the first qdisc to
>>> support
>>>the lockless bit that can be a child of a qdisc requiring locking. So
>>>we add logic to clear the lock bit on initialization in these cases
>>> when
>>>the qdisc graft operation occurs.
>>>
>>>This also removes the logic used to pick the next band to dequeue from
>>>and instead just checks a per priority array for packets from top
>>> priority
>>>to lowest. This might need to be a bit more clever but seems to work
>>>for now.
>>>
>>>Signed-off-by: John Fastabend 
>>>Signed-off-by: David S. Miller 
>>
>>
>> The patch does not revert cleanly, but moving to one commit earlier makes
>> the problem go away.
>>
>> Selecting the "fq" scheduler instead of "pfifo_fast" makes the problem go
>> away as well.
> 

Is this a single queue device or a multiqueue device? Running
'tc -s qdisc show dev foo' would help some.

> I am of course, a fan of obsoleting pfifo_fast. There's no good reason
> for it anymore.
> 
>>
>> Is this an unintended side-effect of the patch or is there something the
>> driver has to do to request in-order delivery?
>>
If we introduced a OOO edge case somewhere that was not
intended so I'll take a look into it. But, if you can provide
a bit more details on how stress testing is done to cause the
issue that would help.

Thanks,
John

>> Thanks,
>> Jakob
> 
> 
> 



Re: [RESEND] rsi: Remove stack VLA usage

2018-03-13 Thread Tobin C. Harding
Added Konstantin in case he is in charge of administering patchwork.kernel.org?

On Tue, Mar 13, 2018 at 07:53:34PM -0700, Kees Cook wrote:
> On Tue, Mar 13, 2018 at 7:11 PM, Tobin C. Harding  wrote:
> > On Tue, Mar 13, 2018 at 11:00:47PM +0200, Andy Shevchenko wrote:
> >> On Tue, Mar 13, 2018 at 10:17 PM, tcharding  wrote:
> >> > On Mon, Mar 12, 2018 at 09:46:06AM +, Kalle Valo wrote:
> >> >> tcharding  wrote:
> >>
> >> I'm pretty much sure it depends on the original email headers, like
> >> above ^^^ — no name.
> >> Perhaps git config on your side should be done.
> >
> > Thanks for the suggestion Andy but the 'tcharding' as the name was
> > munged by either Kalle or patchwork.  I'm guessing patchwork.
> 
> Something you're sending from is using "tcharding" (see the email Andy
> quotes). I see the headers as:
> 
> Date: Wed, 14 Mar 2018 07:17:57 +1100
> From: tcharding 
> ...
> Message-ID: <20180313201757.GK8631@eros>
> X-Mailer: Mutt 1.5.24 (2015-08-30)
> User-Agent: Mutt/1.5.24 (2015-08-30)
> 
> Your most recently email shows "Tobin C. Harding" though, and also
> sent with Mutt...
>
> Do you have multiple Mutt configurations? Is something lacking a
> "From" header insertion and your MTA is filling it in for you from
> your username?

Thanks for taking the time to respond Kees (and Tycho).  I have mutt
configured to reply from whichever email address I receive from so if
patchwork sent an email to 'tcharding ' (which is the
details it has) and I hit reply it would have come from 'tcharding',
hence Andy's reply.  I wouldn't bet my life on it but I'm kinda
confident that I cannot initiate an email from 'tcharding' with my
current set up.

Super bad form to blame someone (or something else) but I think this is
a problem with how my patchwork account is configured.  Either way, that
is still my fault I should have added my real name to patchwork when I
signed up (not just username 'tcharding').

Is patchwork.kernel.org administered by Konstantin Ryabitsev? Added
Konstantin to CC's.

thanks,
Tobin.


[PATCH RFC bpf-next 6/6] bpf: Post-hooks for sys_bind

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

"Post-hooks" are hooks that are called right before returning from
sys_bind. At this time IP and port are already allocated and no further
changes to `struct sock` can happen before returning from sys_bind but
BPF program has a chance to inspect the socket and change sys_bind
result.

Specifically it can e.g. inspect what port was allocated and if it
doesn't satisfy some policy, BPF program can force sys_bind to release
that port and return an error to user.

Another example of usage is recording the IP:port pair to some map to
use it in later calls to sys_connect. E.g. if some TCP server inside
cgroup was bound to some IP:port and then some TCP client inside same
cgroup is trying to connect to 127.0.0.1:port then BPF hook for
sys_connect can override the destination and connect application to
IP:port instead of 127.0.0.1:port. That helps forcing all applications
inside a cgroup to use desired IP and not break those applications if
they use e.g. localhost to communicate between each other.

== Implementation details ==

Post-hooks are implemented as two new prog types
`BPF_PROG_TYPE_CGROUP_INET4_POST_BIND` and
`BPF_PROG_TYPE_CGROUP_INET6_POST_BIND` and corresponding attach types
`BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND`.

Separate prog types for IPv4 and IPv6 are introduced to avoid access to
IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from
`inet6_bind()` since those fields might not make sense in such cases.

`BPF_PROG_TYPE_CGROUP_SOCK` prog type is not reused because it provides
write access to some `struct sock` fields, but socket must not be
changed in post-hooks for sys_bind.

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h |  16 -
 include/linux/bpf_types.h  |   2 +
 include/uapi/linux/bpf.h   |  13 
 kernel/bpf/syscall.c   |  14 
 kernel/bpf/verifier.c  |   2 +
 net/core/filter.c  | 170 -
 net/ipv4/af_inet.c |   3 +-
 net/ipv6/af_inet6.c|   3 +-
 8 files changed, 202 insertions(+), 21 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 6b5c25ef1482..693c542632e3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+#define BPF_CGROUP_RUN_SK_PROG(sk, type)  \
 ({\
int __ret = 0; \
if (cgroup_bpf_enabled) {  \
-   __ret = __cgroup_bpf_run_filter_sk(sk, \
-BPF_CGROUP_INET_SOCK_CREATE); \
+   __ret = __cgroup_bpf_run_filter_sk(sk, type);  \
}  \
__ret; \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)
   \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)
   \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)   \
 ({\
int __ret = 0; \
@@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { 
return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 52a571827b9f..23a97978b544 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -10,6 +10,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_BIND, cg_inet4_bind)
 

[PATCH RFC bpf-next 2/6] selftests/bpf: Selftest for sys_bind hooks

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

Add selftest to work with bpf_sock_addr context from
`BPF_PROG_TYPE_CGROUP_INET4_BIND` and BPF_PROG_TYPE_CGROUP_INET6_BIND`
programs.

Try to bind(2) on IP:port and apply:
* loads to make sure context can be read correctly, including narrow
  loads (byte, half) for IP and full-size loads (word) for all fields;
* stores to those fields allowed by verifier.

All combination from IPv4/IPv6 and TCP/UDP are tested.

Test passes when expected data can be read from context in the
BPF-program, and after the call to bind(2) socket is bound to IP:port
pair that was written by BPF-program to the context.

Example:
  # ./test_sock_addr
  Attached bind4 program.
  Test case #1 (IPv4/TCP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Test case #2 (IPv4/UDP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Attached bind6 program.
  Test case #3 (IPv6/TCP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Test case #4 (IPv6/UDP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  ### SUCCESS

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h   |  24 ++
 tools/testing/selftests/bpf/Makefile |   3 +-
 tools/testing/selftests/bpf/test_sock_addr.c | 463 +++
 3 files changed, 489 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sock_addr.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db6bdc375126..a6af06bb5efb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -133,6 +133,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_CGROUP_INET4_BIND,
+   BPF_PROG_TYPE_CGROUP_INET6_BIND,
 };
 
 enum bpf_attach_type {
@@ -143,6 +145,8 @@ enum bpf_attach_type {
BPF_SK_SKB_STREAM_PARSER,
BPF_SK_SKB_STREAM_VERDICT,
BPF_CGROUP_DEVICE,
+   BPF_CGROUP_INET4_BIND,
+   BPF_CGROUP_INET6_BIND,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -952,6 +956,26 @@ struct bpf_map_info {
__u64 netns_ino;
 } __attribute__((aligned(8)));
 
+/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
+ * by user and intended to be used by socket (e.g. to bind to, depends on
+ * attach type).
+ */
+struct bpf_sock_addr {
+   __u32 user_family;  /* Allows 4-byte read, but no write. */
+   __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
+* Stored in network byte order.
+*/
+   __u32 user_ip6[4];  /* Allows 1,2,4-byte read and 4-byte write.
+* Stored in network byte order.
+*/
+   __u32 user_port;/* Allows 4-byte read and write.
+* Stored in network byte order
+*/
+   __u32 family;   /* Allows 4-byte read, but no write */
+   __u32 type; /* Allows 4-byte read, but no write */
+   __u32 protocol; /* Allows 4-byte read, but no write */
+};
+
 /* User bpf_sock_ops struct to access socket values and specify request ops
  * and their replies.
 * Some of these fields are in network (bigendian) byte order and may need
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 8567a858b789..f319b67fd0f6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -15,7 +15,7 @@ LDLIBS += -lcap -lelf -lrt -lpthread
 
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs \
-   test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
+   test_align test_verifier_log test_dev_cgroup test_tcpbpf_user 
test_sock_addr
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o 
test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
@@ -42,6 +42,7 @@ $(TEST_GEN_PROGS): $(BPFOBJ)
 $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
 
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
+$(OUTPUT)/test_sock_addr: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c 
b/tools/testing/selftests/bpf/test_sock_addr.c
new file mode 100644
index ..18ea250484dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 

[PATCH RFC bpf-next 0/6] bpf: introduce cgroup-bpf bind, connect, post-bind hooks

2018-03-13 Thread Alexei Starovoitov
For our container management we've been using complicated and fragile setup
consisting of LD_PRELOAD wrapper intercepting bind and connect calls from
all containerized applications.
The setup involves per-container IPs, policy, etc, so traditional
network-only solutions that involve VRFs, netns, acls are not applicable.
Changing apps is not possible and LD_PRELOAD doesn't work
for apps that don't use glibc like java and golang.
BPF+cgroup looks to be the best solution for this problem.
Hence we introduce 3 hooks:
- at entry into sys_bind and sys_connect
  to let bpf prog look and modify 'struct sockaddr' provided
  by user space and fail bind/connect when appropriate
- post sys_bind after port is allocated

The approach works great and has zero overhead for anyone who doesn't
use it and very low overhead when deployed.

The main question for Daniel and Dave is what approach to take
with prog types...

In this patch set we introduce 6 new program types to make user
experience easier:
  BPF_PROG_TYPE_CGROUP_INET4_BIND,
  BPF_PROG_TYPE_CGROUP_INET6_BIND,
  BPF_PROG_TYPE_CGROUP_INET4_CONNECT,
  BPF_PROG_TYPE_CGROUP_INET6_CONNECT,
  BPF_PROG_TYPE_CGROUP_INET4_POST_BIND,
  BPF_PROG_TYPE_CGROUP_INET6_POST_BIND,

since v4 programs should not be using 'struct bpf_sock_addr'->user_ip6 fields
and different prog type for v4 and v6 helps verifier reject such access
at load time.
Similarly bind vs connect are two different prog types too,
since only sys_connect programs can call new bpf_bind() helper.

This approach is very different from tcp-bpf where single
'struct bpf_sock_ops' and single prog type is used for different hooks.
The field checks are done at run-time instead of load time.

I think the approach taken by this patch set is justified,
but we may do better if we extend BPF_PROG_ATTACH cmd
with log_buf + log_size, then we should be able to combine
bind+connect+v4+v6 into single program type.
The idea that at load time the verifier will remember a bitmask
of fields in bpf_sock_addr used by the program and helpers
that program used, then at attach time we can check that
hook is compatible with features used by the program and
report human readable error message back via log_buf.
We cannot do this right now with just EINVAL, since combinations
of errors like 'using user_ip6 field but attaching to v4 hook'
are too high to express as errno.
This would be bigger change. If you folks think it's worth it
we can go with this approach or if you think 6 new prog types
is not too bad, we can leave the patch as-is.
Comments?
Other comments on patches are welcome.

Andrey Ignatov (6):
  bpf: Hooks for sys_bind
  selftests/bpf: Selftest for sys_bind hooks
  net: Introduce __inet_bind() and __inet6_bind
  bpf: Hooks for sys_connect
  selftests/bpf: Selftest for sys_connect hooks
  bpf: Post-hooks for sys_bind

 include/linux/bpf-cgroup.h|  68 +++-
 include/linux/bpf_types.h |   6 +
 include/linux/filter.h|  10 +
 include/net/inet_common.h |   2 +
 include/net/ipv6.h|   2 +
 include/net/sock.h|   3 +
 include/net/udp.h |   1 +
 include/uapi/linux/bpf.h  |  52 ++-
 kernel/bpf/cgroup.c   |  36 ++
 kernel/bpf/syscall.c  |  42 ++
 kernel/bpf/verifier.c |   6 +
 net/core/filter.c | 479 ++-
 net/ipv4/af_inet.c|  60 ++-
 net/ipv4/tcp_ipv4.c   |  16 +
 net/ipv4/udp.c|  14 +
 net/ipv6/af_inet6.c   |  47 ++-
 net/ipv6/tcp_ipv6.c   |  16 +
 net/ipv6/udp.c|  20 +
 tools/include/uapi/linux/bpf.h|  39 +-
 tools/testing/selftests/bpf/Makefile  |   8 +-
 tools/testing/selftests/bpf/bpf_helpers.h |   2 +
 tools/testing/selftests/bpf/connect4_prog.c   |  45 +++
 tools/testing/selftests/bpf/connect6_prog.c   |  61 +++
 tools/testing/selftests/bpf/test_sock_addr.c  | 541 ++
 tools/testing/selftests/bpf/test_sock_addr.sh |  57 +++
 25 files changed, 1580 insertions(+), 53 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/connect4_prog.c
 create mode 100644 tools/testing/selftests/bpf/connect6_prog.c
 create mode 100644 tools/testing/selftests/bpf/test_sock_addr.c
 create mode 100755 tools/testing/selftests/bpf/test_sock_addr.sh

-- 
2.9.5



[PATCH RFC bpf-next 4/6] bpf: Hooks for sys_connect

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

== The problem ==

See description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides much more reliable in-kernel solution for the 2nd
part of the problem: making outgoing connection from desired IP.

It adds new program types `BPF_PROG_TYPE_CGROUP_INET4_CONNECT` and
`BPF_PROG_TYPE_CGROUP_INET6_CONNECT` and corresponding attach types that
can be used to override both source and destination of a connection at
connect(2) time.

Local end of connection can be bound to desired IP using newly
introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though,
and doesn't support binding to port, i.e. leverages
`IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:
* looking for a free port is expensive and can affect performance
  significantly;
* there is no use-case for port.

As for remote end (`struct sockaddr *` passed by user), both parts of it
can be overridden, remote IP and remote port. It's useful if an
application inside cgroup wants to connect to another application inside
same cgroup or to itself, but knows nothing about IP assigned to the
cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP.

IPv4 and IPv6 have separate program types for same reason as sys_bind
hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields
when user passes sockaddr_in since it'd be out-of-bound.

Program types for sys_bind itself can't be reused for sys_connect either
since there is a difference in allowed helpers to call: `bpf_bind()` is
useful to call from sys_connect hooks, but doesn't make sense in
sys_bind hooks.

== Implementation notes ==

The patch introduces new field in `struct proto`: `pre_connect` that is
a pointer to a function with same signature as `connect` but is called
before it. The reason is in some cases BPF hooks should be called way
before control is passed to `sk->sk_prot->connect`. Specifically
`inet_dgram_connect` autobinds socket before calling
`sk->sk_prot->connect` and there is no way to call `bpf_bind()` from
hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since
it'd cause double-bind. On the other hand `proto.pre_connect` provides a
flexible way to add BPF hooks for connect only for necessary `proto` and
call them at desired time before `connect`. Since `bpf_bind()` is
allowed to bind only to IP and autobind in `inet_dgram_connect` binds
only port there is no chance of double-bind.

`bpf_bind()` sets `force_bind_address_no_port` to bind to only IP despite
of value of `bind_address_no_port` socket field.

`bpf_bind()` sets `with_lock` to `false` when calling to `__inet_bind()`
and `__inet6_bind()` since all call-sites, where `bpf_bind()` is called,
already hold socket lock.

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h | 31 +
 include/linux/bpf_types.h  |  2 ++
 include/net/sock.h |  3 +++
 include/net/udp.h  |  1 +
 include/uapi/linux/bpf.h   | 15 ++-
 kernel/bpf/syscall.c   | 14 ++
 kernel/bpf/verifier.c  |  2 ++
 net/core/filter.c  | 67 ++
 net/ipv4/af_inet.c | 13 +
 net/ipv4/tcp_ipv4.c| 16 +++
 net/ipv4/udp.c | 14 ++
 net/ipv6/tcp_ipv6.c| 16 +++
 net/ipv6/udp.c | 20 ++
 13 files changed, 213 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index dd0cfbddcfbe..6b5c25ef1482 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)  \
+({\
+   int __ret = 0; \
+   if (cgroup_bpf_enabled) {  \
+   lock_sock(sk); \
+   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);\
+   release_sock(sk);  \
+   }  \
+   __ret; \
+})
+
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
 
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
 
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
+   

[PATCH RFC bpf-next 5/6] selftests/bpf: Selftest for sys_connect hooks

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

Add selftest for BPF_CGROUP_INET4_CONNECT and BPF_CGROUP_INET6_CONNECT
attach types.

Try to connect(2) to specified IP:port and test that:
* remote IP:port pair is overridden;
* local end of connection is bound to specified IP.

All combinations of IPv4/IPv6 and TCP/UDP are tested.

Example:
  # tcpdump -pn -i lo -w connect.pcap 2>/dev/null &
  [1] 271
  # strace -qqf -e connect -o connect.trace ./test_sock_addr.sh
  Wait for testing IPv4/IPv6 to become available ... OK
  Attached bind4 program.
  Attached connect4 program.
  Test case #1 (IPv4/TCP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Requested: connect(192.168.1.254, 4040) from (*, *) ..
 Actual: connect(127.0.0.1, ) from (127.0.0.4, 45780)
  Test case #2 (IPv4/UDP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Requested: connect(192.168.1.254, 4040) from (*, *) ..
 Actual: connect(127.0.0.1, ) from (127.0.0.4, 44708)
  Attached bind6 program.
  Attached connect6 program.
  Test case #3 (IPv6/TCP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Requested: connect(face:b00c:1234:5678::abcd, 6060) from (*, *) .
 Actual: connect(::1, ) from (::6, 37510)
  Test case #4 (IPv6/UDP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Requested: connect(face:b00c:1234:5678::abcd, 6060) from (*, *) .
 Actual: connect(::1, ) from (::6, 51749)
  ### SUCCESS
  # egrep 'connect\(.*AF_INET' connect.trace | \
  egrep -vw 'htons\(1025\)' | fold -b -s -w 72
  295   connect(7, {sa_family=AF_INET, sin_port=htons(4040),
  sin_addr=inet_addr("192.168.1.254")}, 128) = 0
  295   connect(8, {sa_family=AF_INET, sin_port=htons(4040),
  sin_addr=inet_addr("192.168.1.254")}, 128) = 0
  295   connect(9, {sa_family=AF_INET6, sin6_port=htons(6060),
  inet_pton(AF_INET6, "face:b00c:1234:5678::abcd", _addr),
  sin6_flowinfo=0, sin6_scope_id=0}, 128) = 0
  295   connect(10, {sa_family=AF_INET6, sin6_port=htons(6060),
  inet_pton(AF_INET6, "face:b00c:1234:5678::abcd", _addr),
  sin6_flowinfo=0, sin6_scope_id=0}, 128) = 0
  # fg
  tcpdump -pn -i lo -w connect.pcap 2> /dev/null
  # tcpdump -r connect.pcap -n tcp | cut -c 1-72
  reading from file connect.pcap, link-type EN10MB (Ethernet)
  17:20:03.346047 IP 127.0.0.4.45780 > 127.0.0.1.: Flags [S], seq 2460
  17:20:03.346084 IP 127.0.0.1. > 127.0.0.4.45780: Flags [S.], seq 320
  17:20:03.346110 IP 127.0.0.4.45780 > 127.0.0.1.: Flags [.], ack 1, w
  17:20:03.347218 IP 127.0.0.1. > 127.0.0.4.45780: Flags [R.], seq 1,
  17:20:03.356698 IP6 ::6.37510 > ::1.: Flags [S], seq 2155424486, win
  17:20:03.356733 IP6 ::1. > ::6.37510: Flags [S.], seq 1308562754, ac
  17:20:03.356752 IP6 ::6.37510 > ::1.: Flags [.], ack 1, win 342, opt
  17:20:03.357639 IP6 ::1. > ::6.37510: Flags [R.], seq 1, ack 1, win

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h| 15 +-
 tools/testing/selftests/bpf/Makefile  |  5 +-
 tools/testing/selftests/bpf/bpf_helpers.h |  2 +
 tools/testing/selftests/bpf/connect4_prog.c   | 45 
 tools/testing/selftests/bpf/connect6_prog.c   | 61 +
 tools/testing/selftests/bpf/test_sock_addr.c  | 78 +++
 tools/testing/selftests/bpf/test_sock_addr.sh | 57 
 7 files changed, 260 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/connect4_prog.c
 create mode 100644 tools/testing/selftests/bpf/connect6_prog.c
 create mode 100755 tools/testing/selftests/bpf/test_sock_addr.sh

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a6af06bb5efb..11e1b633808a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -135,6 +135,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_DEVICE,
BPF_PROG_TYPE_CGROUP_INET4_BIND,
BPF_PROG_TYPE_CGROUP_INET6_BIND,
+   BPF_PROG_TYPE_CGROUP_INET4_CONNECT,
+   BPF_PROG_TYPE_CGROUP_INET6_CONNECT,
 };
 
 enum bpf_attach_type {
@@ -147,6 +149,8 @@ enum bpf_attach_type {
BPF_CGROUP_DEVICE,
BPF_CGROUP_INET4_BIND,
BPF_CGROUP_INET6_BIND,
+   BPF_CGROUP_INET4_CONNECT,
+   BPF_CGROUP_INET6_CONNECT,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -700,6 +704,14 @@ union bpf_attr {
  * int bpf_override_return(pt_regs, rc)
  * @pt_regs: pointer to struct pt_regs
  * @rc: the return value to set
+ *
+ * int bpf_bind(ctx, addr, addr_len)
+ * Bind socket to address. Only binding to IP is supported, no port can be
+ * set in addr.
+ * @ctx: pointer to context of type bpf_sock_addr
+ * @addr: pointer to struct 

[PATCH RFC bpf-next 1/6] bpf: Hooks for sys_bind

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

== The problem ==

There is a use-case when all processes inside a cgroup should use one
single IP address on a host that has multiple IP configured.  Those
processes should use the IP for both ingress and egress, for TCP and UDP
traffic. So TCP/UDP servers should be bound to that IP to accept
incoming connections on it, and TCP/UDP clients should make outgoing
connections from that IP. It should not require changing application
code since it's often not possible.

Currently it's solved by intercepting glibc wrappers around syscalls
such as `bind(2)` and `connect(2)`. It's done by a shared library that
is preloaded for every process in a cgroup so that whenever TCP/UDP
server calls `bind(2)`, the library replaces IP in sockaddr before
passing arguments to syscall. When application calls `connect(2)` the
library transparently binds the local end of connection to that IP
(`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty).

Shared library approach is fragile though, e.g.:
* some applications clear env vars (incl. `LD_PRELOAD`);
* `/etc/ld.so.preload` doesn't help since some applications are linked
  with option `-z nodefaultlib`;
* other applications don't use glibc and there is nothing to intercept.

== The solution ==

The patch provides much more reliable in-kernel solution for the 1st
part of the problem: binding TCP/UDP servers on desired IP. It does not
depend on application environment and implementation details (whether
glibc is used or not).

It adds new eBPF program types `BPF_PROG_TYPE_CGROUP_INET4_BIND` and
`BPF_PROG_TYPE_CGROUP_INET6_BIND` and corresponding attach types
`BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already
existing `BPF_CGROUP_INET_SOCK_CREATE`).

The new program types are intended to be used with sockets (`struct sock`)
in a cgroup and provided by user `struct sockaddr`. Pointers to both of
them are parts of the context passed to programs of newly added types.

The new attach types provides hooks in `bind(2)` system call for both
IPv4 and IPv6 so that one can write a program to override IP addresses
and ports user program tries to bind to and apply such a program for
whole cgroup.

== Implementation notes ==

[1]
Separate prog/attach types for `AF_INET` and `AF_INET6` are added
intentionally to prevent reading/writing to offsets that don't make
sense for corresponding socket family. E.g. if user passes `sockaddr_in`
it doesn't make sense to read from / write to `user_ip6[]` context
fields.

[2]
The write access to `struct bpf_sock_addr_kern` is implemented using
special field as an additional "register".

There are just two registers in `sock_addr_convert_ctx_access`: `src`
with value to write and `dst` with pointer to context that can't be
changed not to break later instructions. But the fields, allowed to
write to, are not available directly and to access them address of
corresponding pointer has to be loaded first. To get additional register
the 1st not used by `src` and `dst` one is taken, its content is saved
to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load
address of pointer field, and finally the register's content is restored
from the temporary field after writing `src` value.

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h |  21 
 include/linux/bpf_types.h  |   2 +
 include/linux/filter.h |  10 ++
 include/uapi/linux/bpf.h   |  24 +
 kernel/bpf/cgroup.c|  36 +++
 kernel/bpf/syscall.c   |  14 +++
 kernel/bpf/verifier.c  |   2 +
 net/core/filter.c  | 242 +
 net/ipv4/af_inet.c |   7 ++
 net/ipv6/af_inet6.c|   7 ++
 10 files changed, 365 insertions(+)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8a4566691c8f..dd0cfbddcfbe 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include 
 
 struct sock;
+struct sockaddr;
 struct cgroup;
 struct sk_buff;
 struct bpf_sock_ops_kern;
@@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
   enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+ struct sockaddr *uaddr,
+ enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 struct bpf_sock_ops_kern *sock_ops,
 enum bpf_attach_type type);
@@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)   \
+({  

[PATCH RFC bpf-next 3/6] net: Introduce __inet_bind() and __inet6_bind

2018-03-13 Thread Alexei Starovoitov
From: Andrey Ignatov 

Refactor `bind()` code to make it ready to be called from BPF helper
function `bpf_bind()` (will be added soon). Implementation of
`inet_bind()` and `inet6_bind()` is separated into `__inet_bind()` and
`__inet6_bind()` correspondingly. These function can be used from both
`sk_prot->bind` and `bpf_bind()` contexts.

New functions have two additional arguments.

`force_bind_address_no_port` forces binding to IP only w/o checking
`inet_sock.bind_address_no_port` field. It'll allow to bind local end of
a connection to desired IP in `bpf_bind()` w/o changing
`bind_address_no_port` field of a socket. It's useful since `bpf_bind()`
can return an error and we'd need to restore original value of
`bind_address_no_port` in that case if we changed this before calling to
the helper.

`with_lock` specifies whether to lock socket when working with `struct
sk` or not. The argument is set to `true` for `sk_prot->bind`, i.e. old
behavior is preserved. But it will be set to `false` for `bpf_bind()`
use-case. The reason is all call-sites, where `bpf_bind()` will be
called, already hold that socket lock.

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
Signed-off-by: Alexei Starovoitov 
---
 include/net/inet_common.h |  2 ++
 include/net/ipv6.h|  2 ++
 net/ipv4/af_inet.c| 39 ---
 net/ipv6/af_inet6.c   | 37 -
 4 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 500f81375200..384b90c62c0b 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -32,6 +32,8 @@ int inet_shutdown(struct socket *sock, int how);
 int inet_listen(struct socket *sock, int backlog);
 void inet_sock_destruct(struct sock *sk);
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+   bool force_bind_address_no_port, bool with_lock);
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 int peer);
 int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 50a6f0ddb878..2e5fedc56e59 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1066,6 +1066,8 @@ void ipv6_local_error(struct sock *sk, int err, struct 
flowi6 *fl6, u32 info);
 void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
 
 int inet6_release(struct socket *sock);
+int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len,
+bool force_bind_address_no_port, bool with_lock);
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
  int peer);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2dec266507dc..e203a39d6988 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -432,30 +432,37 @@ EXPORT_SYMBOL(inet_release);
 
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-   struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
-   struct inet_sock *inet = inet_sk(sk);
-   struct net *net = sock_net(sk);
-   unsigned short snum;
-   int chk_addr_ret;
-   u32 tb_id = RT_TABLE_LOCAL;
int err;
 
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
-   err = sk->sk_prot->bind(sk, uaddr, addr_len);
-   goto out;
+   return sk->sk_prot->bind(sk, uaddr, addr_len);
}
-   err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
-   goto out;
+   return -EINVAL;
 
/* BPF prog is run before any checks are done so that if the prog
 * changes context in a wrong way it will be caught.
 */
err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
if (err)
-   goto out;
+   return err;
+
+   return __inet_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet_bind);
+
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+   bool force_bind_address_no_port, bool with_lock)
+{
+   struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+   struct inet_sock *inet = inet_sk(sk);
+   struct net *net = sock_net(sk);
+   unsigned short snum;
+   int chk_addr_ret;
+   u32 tb_id = RT_TABLE_LOCAL;
+   int err;
 
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
@@ -499,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
 *  would be illegal to use them (multicast/broadcast) in
 *  which case the sending 

Re: [PATCH net-next] tuntap: XDP_TX can use native XDP

2018-03-13 Thread Michael S. Tsirkin
On Wed, Mar 14, 2018 at 11:23:40AM +0800, Jason Wang wrote:
> Now we have ndo_xdp_xmit, switch to use it instead of the slow generic
> XDP TX routine. XDP_TX on TAP gets ~20% improvements from ~1.5Mpps to
> ~1.8Mpps on 2.60GHz Core(TM) i7-5600U.
> 
> Signed-off-by: Jason Wang 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/net/tun.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 475088f..baeafa0 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1613,7 +1613,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
> *tun,
>   unsigned int delta = 0;
>   char *buf;
>   size_t copied;
> - bool xdp_xmit = false;
>   int err, pad = TUN_RX_PAD;
>  
>   rcu_read_lock();
> @@ -1671,8 +1670,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
> *tun,
>   preempt_enable();
>   return NULL;
>   case XDP_TX:
> - xdp_xmit = true;
> - /* fall through */
> + get_page(alloc_frag->page);
> + alloc_frag->offset += buflen;
> + if (tun_xdp_xmit(tun->dev, &xdp))
> + goto err_redirect;
> + tun_xdp_flush(tun->dev);

Why do we have to flush here though?
It might be a good idea to document the reason in a code comment.

> + rcu_read_unlock();
> + preempt_enable();
> + return NULL;
>   case XDP_PASS:
>   delta = orig_data - xdp.data;
>   break;
> @@ -1699,14 +1704,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
> *tun,
>   get_page(alloc_frag->page);
>   alloc_frag->offset += buflen;
>  
> - if (xdp_xmit) {
> - skb->dev = tun->dev;
> - generic_xdp_tx(skb, xdp_prog);
> - rcu_read_unlock();
> - preempt_enable();
> - return NULL;
> - }
> -
>   rcu_read_unlock();
>   preempt_enable();
>  
> -- 
> 2.7.4


[PATCH net-next] tuntap: XDP_TX can use native XDP

2018-03-13 Thread Jason Wang
Now we have ndo_xdp_xmit, switch to use it instead of the slow generic
XDP TX routine. XDP_TX on TAP gets ~20% improvements from ~1.5Mpps to
~1.8Mpps on 2.60GHz Core(TM) i7-5600U.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 19 ---
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 475088f..baeafa0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1613,7 +1613,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
unsigned int delta = 0;
char *buf;
size_t copied;
-   bool xdp_xmit = false;
int err, pad = TUN_RX_PAD;
 
rcu_read_lock();
@@ -1671,8 +1670,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
preempt_enable();
return NULL;
case XDP_TX:
-   xdp_xmit = true;
-   /* fall through */
+   get_page(alloc_frag->page);
+   alloc_frag->offset += buflen;
+   if (tun_xdp_xmit(tun->dev, &xdp))
+   goto err_redirect;
+   tun_xdp_flush(tun->dev);
+   rcu_read_unlock();
+   preempt_enable();
+   return NULL;
case XDP_PASS:
delta = orig_data - xdp.data;
break;
@@ -1699,14 +1704,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
 
-   if (xdp_xmit) {
-   skb->dev = tun->dev;
-   generic_xdp_tx(skb, xdp_prog);
-   rcu_read_unlock();
-   preempt_enable();
-   return NULL;
-   }
-
rcu_read_unlock();
preempt_enable();
 
-- 
2.7.4



[PATCH 1/7] i40e/i40evf: Eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index e554aa6cf..7028516 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1375,7 +1375,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring 
*rx_ring, u32 val)
 * such as IA-64).
 */
wmb();
-   writel(val, rx_ring->tail);
+   writel_relaxed(val, rx_ring->tail);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 357d605..2d323fc 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -667,7 +667,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring 
*rx_ring, u32 val)
 * such as IA-64).
 */
wmb();
-   writel(val, rx_ring->tail);
+   writel_relaxed(val, rx_ring->tail);
 }
 
 /**
-- 
2.7.4



[PATCH 2/7] ixgbe: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0da5aa2..35ca1d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1692,7 +1692,7 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, 
u16 cleaned_count)
 * such as IA-64).
 */
wmb();
-   writel(i, rx_ring->tail);
+   writel_relaxed(i, rx_ring->tail);
}
 }
 
@@ -2453,7 +2453,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector 
*q_vector,
 * know there are new descriptors to fetch.
 */
wmb();
-   writel(ring->next_to_use, ring->tail);
+   writel_relaxed(ring->next_to_use, ring->tail);
 
xdp_do_flush_map();
}
@@ -10014,7 +10014,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 * are new descriptors to fetch.
 */
wmb();
-   writel(ring->next_to_use, ring->tail);
+   writel_relaxed(ring->next_to_use, ring->tail);
 
return;
 }
-- 
2.7.4



[PATCH 5/7] igb: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index b88fae7..ba8ccb5 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8072,7 +8072,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 
cleaned_count)
 * such as IA-64).
 */
wmb();
-   writel(i, rx_ring->tail);
+   writel_relaxed(i, rx_ring->tail);
}
 }
 
-- 
2.7.4



[PATCH 1/7] i40e/i40evf: Eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index e554aa6cf..7028516 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1375,7 +1375,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring 
*rx_ring, u32 val)
 * such as IA-64).
 */
wmb();
-   writel(val, rx_ring->tail);
+   writel_relaxed(val, rx_ring->tail);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 357d605..2d323fc 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -667,7 +667,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring 
*rx_ring, u32 val)
 * such as IA-64).
 */
wmb();
-   writel(val, rx_ring->tail);
+   writel_relaxed(val, rx_ring->tail);
 }
 
 /**
-- 
2.7.4



[PATCH 3/7] RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/infiniband/hw/qedr/verbs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/verbs.c 
b/drivers/infiniband/hw/qedr/verbs.c
index 53f00db..ccd55f4 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -1870,7 +1870,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
 
if (rdma_protocol_roce(>ibdev, 1)) {
wmb();
-   writel(qp->rq.db_data.raw, qp->rq.db);
+   writel_relaxed(qp->rq.db_data.raw, qp->rq.db);
/* Make sure write takes effect */
mmiowb();
}
@@ -3247,7 +3247,7 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr 
*wr,
 * redundant doorbell.
 */
wmb();
-   writel(qp->sq.db_data.raw, qp->sq.db);
+   writel_relaxed(qp->sq.db_data.raw, qp->sq.db);
 
/* Make sure write sticks */
mmiowb();
-- 
2.7.4



[PATCH 4/7] igbvf: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/igbvf/netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c 
b/drivers/net/ethernet/intel/igbvf/netdev.c
index 4214c15..fe3441b 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -251,7 +251,7 @@ static void igbvf_alloc_rx_buffers(struct igbvf_ring 
*rx_ring,
 * such as IA-64).
*/
wmb();
-   writel(i, adapter->hw.hw_addr + rx_ring->tail);
+   writel_relaxed(i, adapter->hw.hw_addr + rx_ring->tail);
}
 }
 
-- 
2.7.4



[PATCH 7/7] ixgbevf: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  | 5 -
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index f695242..64d0e0b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -244,9 +244,12 @@ static inline u16 ixgbevf_desc_unused(struct ixgbevf_ring 
*ring)
return ((ntc > ntu) ? 0 : ring->count) + ntc - ntu - 1;
 }
 
+/* Assumes caller has executed a write barrier to order memory and device
+ * requests.
+ */
 static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value)
 {
-   writel(value, ring->tail);
+   writel_relaxed(value, ring->tail);
 }
 
 #define IXGBEVF_RX_DESC(R, i)  \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 9b3d43d..0ba7f59 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3643,6 +3643,13 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 
tx_ring->next_to_use = i;
 
+   /* Force memory writes to complete before letting h/w
+* know there are new descriptors to fetch.  (Only
+* applicable for weak-ordered memory model archs,
+* such as IA-64).
+*/
+   wmb();
+
/* notify HW of packet */
ixgbevf_write_tail(tx_ring, i);
 
-- 
2.7.4



[PATCH 6/7] e1000: eliminate duplicate barriers on weakly-ordered archs

2018-03-13 Thread Sinan Kaya
Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya 
---
 drivers/net/ethernet/intel/e1000/e1000_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c 
b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 3dd4aeb..e0e583a 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -4573,7 +4573,7 @@ e1000_alloc_jumbo_rx_buffers(struct e1000_adapter 
*adapter,
 * such as IA-64).
 */
wmb();
-   writel(i, adapter->hw.hw_addr + rx_ring->rdt);
+   writel_relaxed(i, adapter->hw.hw_addr + rx_ring->rdt);
}
 }
 
@@ -4688,7 +4688,7 @@ static void e1000_alloc_rx_buffers(struct e1000_adapter 
*adapter,
 * such as IA-64).
 */
wmb();
-   writel(i, hw->hw_addr + rx_ring->rdt);
+   writel_relaxed(i, hw->hw_addr + rx_ring->rdt);
}
 }
 
-- 
2.7.4



RE: [Intel-wired-lan] [next-queue PATCH v4 2/8] igb: Fix queue selection on MAC filters on i210 and i211

2018-03-13 Thread Brown, Aaron F
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Vinicius Costa Gomes
> Sent: Wednesday, March 7, 2018 4:37 PM
> To: intel-wired-...@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus  palen...@intel.com>
> Subject: [Intel-wired-lan] [next-queue PATCH v4 2/8] igb: Fix queue selection
> on MAC filters on i210 and i211
> 
> On the RAH registers there are semantic differences on the meaning of
> the "queue" parameter for traffic steering depending on the controller
> model: there is the 82575 meaning, which "queue" means a RX Hardware
> Queue, and the i350 meaning, where it is a reception pool.
> 
> The previous behaviour was having no effect for i210 and i211 based
> controllers because the QSEL bit of the RAH register wasn't being set.
> 
> This patch separates the condition in discrete cases, so the different
> handling is clearer.
> 
> Fixes: 83c21335c876 ("igb: improve MAC filter handling")
> Signed-off-by: Vinicius Costa Gomes 
> ---
>  drivers/net/ethernet/intel/igb/e1000_defines.h |  1 +
>  drivers/net/ethernet/intel/igb/igb_main.c  | 15 +++
>  2 files changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h
> b/drivers/net/ethernet/intel/igb/e1000_defines.h
> index 83cabff1e0ab..573bf177fd08 100644
> --- a/drivers/net/ethernet/intel/igb/e1000_defines.h
> +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
> @@ -490,6 +490,7 @@
>   * manageability enabled, allowing us room for 15 multicast addresses.
>   */
>  #define E1000_RAH_AV  0x8000/* Receive descriptor valid */
> +#define E1000_RAH_QSEL_ENABLE 0x1000
>  #define E1000_RAL_MAC_ADDR_LEN 4
>  #define E1000_RAH_MAC_ADDR_LEN 2
>  #define E1000_RAH_POOL_MASK 0x03FC
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c
> b/drivers/net/ethernet/intel/igb/igb_main.c
> index 715bb32e6901..eabedc6b6518 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -8747,12 +8747,19 @@ static void igb_rar_set_index(struct igb_adapter
> *adapter, u32 index)
>   if (is_valid_ether_addr(addr))
>   rar_high |= E1000_RAH_AV;
> 
> - if (hw->mac.type == e1000_82575)
> + switch (hw->mac.type) {
> + case e1000_82575:
> + case e1000_i210:
> + case e1000_i211:
> + rar_high |= E1000_RAH_QSEL_ENABLE;
>   rar_high |= E1000_RAH_POOL_1 *
> - adapter->mac_table[index].queue;
> - else
> +   adapter->mac_table[index].queue;
> + break;
> + default:
>   rar_high |= E1000_RAH_POOL_1 <<
> - adapter->mac_table[index].queue;
> + adapter->mac_table[index].queue;

Small nit.  Shouldn't this line be indented more to be a few spaces past the 
"|=" operator as above?

> + break;
> + }
>   }
> 
>   wr32(E1000_RAL(index), rar_low);
> --
> 2.16.2
> 
> ___
> Intel-wired-lan mailing list
> intel-wired-...@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan


RE: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address support for ethtool nftuple filters

2018-03-13 Thread Brown, Aaron F
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Vinicius Costa Gomes
> Sent: Wednesday, March 7, 2018 4:37 PM
> To: intel-wired-...@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus  palen...@intel.com>
> Subject: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address
> support for ethtool nftuple filters
> 
> This adds the capability of configuring the queue steering of arriving
> packets based on their source and destination MAC addresses.
> 
> In practical terms this adds support for the following use cases,
> characterized by these examples:
> 
> $ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
> (this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
> to the RX queue 0)
> 
> $ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
> (this will direct packets with source address "44:44:44:44:44:44" to
> the RX queue 3)

This seems to work fine on i210, and the patch series allows me to set the rx 
filters on the i350, i354 and i211, but it is not directing the packets to the 
queue I request.

With the exception of i210 the rx_queues number does not seem to be affected by 
setting the filter.  In the case of i211 the rx packets stay on rx_queue 0 with 
or without an ether src or dst filter.  The first example one seems to work at 
first since it's directing to queue 0, but changing the filter to "action 1" 
does not change the behavior.  With the i350 and i354 ports the packets are 
spread across the rx_queues with or without the filter set.

> 
> Signed-off-by: Vinicius Costa Gomes 
> ---
>  drivers/net/ethernet/intel/igb/igb_ethtool.c | 35
> 
>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c
> b/drivers/net/ethernet/intel/igb/igb_ethtool.c
> index 94fc9a4bed8b..3f98299d4cd0 100644
> --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
> +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
> @@ -2494,6 +2494,23 @@ static int igb_get_ethtool_nfc_entry(struct
> igb_adapter *adapter,
>   fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
>   fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
>   }
> + if (rule->filter.match_flags &
> IGB_FILTER_FLAG_DST_MAC_ADDR) {
> + ether_addr_copy(fsp->h_u.ether_spec.h_dest,
> + rule->filter.dst_addr);
> + /* As we only support matching by the full
> +  * mask, return the mask to userspace
> +  */
> + eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
> + }
> + if (rule->filter.match_flags &
> IGB_FILTER_FLAG_SRC_MAC_ADDR) {
> + ether_addr_copy(fsp->h_u.ether_spec.h_source,
> + rule->filter.src_addr);
> + /* As we only support matching by the full
> +  * mask, return the mask to userspace
> +  */
> + eth_broadcast_addr(fsp-
> >m_u.ether_spec.h_source);
> + }
> +
>   return 0;
>   }
>   return -EINVAL;
> @@ -2932,10 +2949,6 @@ static int igb_add_ethtool_nfc_entry(struct
> igb_adapter *adapter,
>   if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
>   return -EINVAL;
> 
> - if (fsp->m_u.ether_spec.h_proto != ETHER_TYPE_FULL_MASK &&
> - fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK))
> - return -EINVAL;
> -
>   input = kzalloc(sizeof(*input), GFP_KERNEL);
>   if (!input)
>   return -ENOMEM;
> @@ -2945,6 +2958,20 @@ static int igb_add_ethtool_nfc_entry(struct
> igb_adapter *adapter,
>   input->filter.match_flags = IGB_FILTER_FLAG_ETHER_TYPE;
>   }
> 
> + /* Only support matching addresses by the full mask */
> + if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
> + input->filter.match_flags |=
> IGB_FILTER_FLAG_SRC_MAC_ADDR;
> + ether_addr_copy(input->filter.src_addr,
> + fsp->h_u.ether_spec.h_source);
> + }
> +
> + /* Only support matching addresses by the full mask */
> + if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
> + input->filter.match_flags |=
> IGB_FILTER_FLAG_DST_MAC_ADDR;
> + ether_addr_copy(input->filter.dst_addr,
> + fsp->h_u.ether_spec.h_dest);
> + }
> +
>   if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
>   if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
>   err = -EINVAL;
> --
> 2.16.2
> 
> ___
> Intel-wired-lan mailing list
> intel-wired-...@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan


Re: [PATCH] pktgen: use dynamic allocation for debug print buffer

2018-03-13 Thread Wang Jian
>> +  kfree(buf);
free tb? buf is an array.

On Wed, Mar 14, 2018 at 8:25 AM, David Miller  wrote:
> From: Arnd Bergmann 
> Date: Tue, 13 Mar 2018 21:58:39 +0100
>
>> After the removal of the VLA, we get a harmless warning about a large
>> stack frame:
>>
>> net/core/pktgen.c: In function 'pktgen_if_write':
>> net/core/pktgen.c:1710:1: error: the frame size of 1076 bytes is larger than 
>> 1024 bytes [-Werror=frame-larger-than=]
>>
>> The function was previously shown to be safe despite hitting
>> the 1024 byte warning level. To get rid of the annoying warning,
>> while keeping it readable, this changes it to use strndup_user().
>>
>> Obviously this is not a fast path, so the kmalloc() overhead
>> can be disregarded.
>>
>> Fixes: 35951393bbff ("pktgen: Remove VLA usage")
>> Signed-off-by: Arnd Bergmann 
>
> Applied, thanks.


Re:

2018-03-13 Thread VIC J
-- 
I need your partnership to transfer $17.5 million to you and you will
take 40% of it, details of the business will be disclosed once i
receive your positive reply.


Re: [RESEND] rsi: Remove stack VLA usage

2018-03-13 Thread Kees Cook
On Tue, Mar 13, 2018 at 7:11 PM, Tobin C. Harding  wrote:
> On Tue, Mar 13, 2018 at 11:00:47PM +0200, Andy Shevchenko wrote:
>> On Tue, Mar 13, 2018 at 10:17 PM, tcharding  wrote:
>> > On Mon, Mar 12, 2018 at 09:46:06AM +, Kalle Valo wrote:
>> >> tcharding  wrote:
>>
>> I'm pretty much sure it depends on the original email headers, like
>> above ^^^ — no name.
>> Perhaps git config on your side should be done.
>
> Thanks for the suggestion Andy but the 'tcharding' as the name was
> munged by either Kalle or patchwork.  I'm guessing patchwork.

Something you're sending from is using "tcharding" (see the email Andy
quotes). I see the headers as:

Date: Wed, 14 Mar 2018 07:17:57 +1100
From: tcharding 
...
Message-ID: <20180313201757.GK8631@eros>
X-Mailer: Mutt 1.5.24 (2015-08-30)
User-Agent: Mutt/1.5.24 (2015-08-30)

Your most recently email shows "Tobin C. Harding" though, and also
sent with Mutt...

Do you have multiple Mutt configurations? Is something lacking a
"From" header insertion and your MTA is filling it in for you from
your username?

-Kees

-- 
Kees Cook
Pixel Security


[PATCH net] net/sched: act_simple: don't leak 'index' in the error path

2018-03-13 Thread Davide Caratti
if the kernel fails duplicating 'sdata', creation of a new action fails
with -ENOMEM. However, subsequent attempts to install the same action
using the same value of 'index' will systematically fail with -ENOSPC,
and that value of 'index' will no longer be usable by act_simple, until
rmmod / insmod of act_simple.ko is done:

 # tc actions add action simple sdata hello index 100
 # tc actions list action simple

action order 0: Simple <hello>
 index 100 ref 1 bind 0
 # tc actions flush action simple
 # tc actions add action simple sdata hello index 100
RTNETLINK answers: Cannot allocate memory
We have an error talking to the kernel
 # tc actions flush action simple
 # tc actions add action simple sdata hello index 100
RTNETLINK answers: No space left on device
We have an error talking to the kernel
 # tc actions add action simple sdata hello index 100
RTNETLINK answers: No space left on device
We have an error talking to the kernel
 ...

Similarly to what other TC actions do, we can duplicate 'sdata' before
calling tcf_idr_create(), and avoid calling tcf_idr_cleanup(), so that
leaks of 'index' don't occur anymore.

Signed-off-by: Davide Caratti 
---

Notes:
Hello,

I observed this faulty behavior on act_bpf, in case of negative return
value of tcf_bpf_init_from_ops() and tcf_bpf_init_from_efd(). Then I
tried on act_simple, that parses its parameter in a similar way, and
reproduced the same leakage of 'index'.

So, unless you think we should fix this issue in a different way (i.e.
changing tcf_idr_cleanup() ), I will post a similar fix for act_bpf.

Any feedback is welcome, thank you in advance!

 net/sched/act_simple.c | 22 ++
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 425eac11f6da..b80d4a69a848 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -53,15 +53,6 @@ static void tcf_simp_release(struct tc_action *a)
kfree(d->tcfd_defdata);
 }
 
-static int alloc_defdata(struct tcf_defact *d, char *defdata)
-{
-   d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
-   if (unlikely(!d->tcfd_defdata))
-   return -ENOMEM;
-   strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
-   return 0;
-}
-
 static void reset_policy(struct tcf_defact *d, char *defdata,
 struct tc_defact *p)
 {
@@ -110,20 +101,18 @@ static int tcf_simp_init(struct net *net, struct nlattr 
*nla,
return -EINVAL;
}
 
-   defdata = nla_data(tb[TCA_DEF_DATA]);
-
if (!exists) {
+   defdata = nla_strdup(tb[TCA_DEF_DATA], GFP_KERNEL);
+   if (unlikely(!defdata))
+   return -ENOMEM;
+
ret = tcf_idr_create(tn, parm->index, est, a,
 _simp_ops, bind, false);
if (ret)
return ret;
 
d = to_defact(*a);
-   ret = alloc_defdata(d, defdata);
-   if (ret < 0) {
-   tcf_idr_cleanup(*a, est);
-   return ret;
-   }
+   d->tcfd_defdata = defdata;
d->tcf_action = parm->action;
ret = ACT_P_CREATED;
} else {
@@ -133,6 +122,7 @@ static int tcf_simp_init(struct net *net, struct nlattr 
*nla,
if (!ovr)
return -EEXIST;
 
+   defdata = nla_data(tb[TCA_DEF_DATA]);
reset_policy(d, defdata, parm);
}
 
-- 
2.14.3



Re: [RESEND] rsi: Remove stack VLA usage

2018-03-13 Thread Tobin C. Harding
On Tue, Mar 13, 2018 at 11:00:47PM +0200, Andy Shevchenko wrote:
> On Tue, Mar 13, 2018 at 10:17 PM, tcharding  wrote:
> > On Mon, Mar 12, 2018 at 09:46:06AM +, Kalle Valo wrote:
> >> tcharding  wrote:
> 
> I'm pretty much sure it depends on the original email headers, like
> above ^^^ — no name.
> Perhaps git config on your side should be done.

Thanks for the suggestion Andy but the 'tcharding' as the name was
munged by either Kalle or patchwork.  I'm guessing patchwork.


thanks,
Tobin.


Re: [RFC net-next 2/6] driver: net: bonding: allow registration of tc offload callbacks in bond

2018-03-13 Thread Jakub Kicinski
On Tue, 13 Mar 2018 17:53:39 +0200, Or Gerlitz wrote:
> > Starting with type 2, in our current NIC HW APIs we have to duplicate
> > these rules
> > into two rules set to HW:
> >
> > 2.1 VF rep --> uplink 0
> > 2.2 VF rep --> uplink 1
> >
> > and we do that in the driver (add/del two HW rules, combine the stat
> > results, etc)

Ack, I think our HW API also will require us to duplicate the rules
today, but IMHO we should implement some common helper module in the
core that would work for any block sharing rather than bond specific
solution.

> > 3. ingress rule on VF rep port with shared tunnel device being the
> > egress (encap)
> > and where the routing of the underlay (tunnel) goes through LAG.
> >
> > in our case, this is like 2.1/2.2 above, offload two rules, combine stats
> >
> > 4. ingress rule shared tunnel device being the ingress and VF rep port
> > being the egress (decap)
> >
> > this uses the egdev facility to be offloaded into the our driver, and
> > then in the driver
> > we will treat it like type 1, two rules need to be installed into HW,
> > but now, we can't delegate them
> > from the vxlan device b/c it has no direct connection with the bond.

Let's get rid of the egdev crutch first then :]


Re: [PATCH] drivers: net: wireless: ath: ath9: dfs: remove VLA usage

2018-03-13 Thread Miguel Ojeda
On Sun, Mar 11, 2018 at 12:12 AM, Kees Cook  wrote:
>
> The problem is that it's not a "constant expression", so the compiler
> frontend still yells about it under -Wvla. I would characterize this
> mainly as a fix for "accidental VLA" or "misdetected VLA" or something
> like that. AIUI, there really isn't a functional change here.

C++11's constexpr variables would be nice to have.

Cheers,
Miguel


Re: [Intel-wired-lan] [PATCH 12/15] ice: Add stats and ethtool support

2018-03-13 Thread Toshiaki Makita
On 2018/03/14 6:14, Jesse Brandeburg wrote:
> which iproute2 tool shows the /proc/net/dev stats?

ip -s -s link show

-- 
Toshiaki Makita



Re: [PATCH] drivers: net: wireless: ath: ath9: dfs: remove VLA usage

2018-03-13 Thread Daniel Micay
> I don't think the difference between C and C++ const pointer
> conversions

I mean I don't think the difference between them was intended.


Re: [PATCH] drivers: net: wireless: ath: ath9: dfs: remove VLA usage

2018-03-13 Thread Daniel Micay
No, it's undefined behavior to write to a const variable. The `static`
and `const` on the variable both change the code generation in the
real world as permitted / encouraged by the standard. It's placed in
read-only memory. Trying to write to it will break. It's not
"implemented defined" to write to it, it's "undefined behavior" i.e.
it's considered incorrect. There a clear distinction between those in
the standard.

You're confusing having a real `const` for a variable with having it
applied to a pointer. It's well-defined to cast away const from a
pointer and write to what it points at if it's not actually const. If
it is const, that's broken.

There's nothing implementation defined about either case.

The C standard could have considered `static const` variables to work
as constant expressions just like the C++ standard. They borrowed it
from there but made it less useful than const in what became the C++
standard. They also used stricter rules for the permitted implicit
conversions of const pointers which made those much less usable, i.e.
converting `int **` to `const int *const *` wasn't permitted like C++.
I don't think the difference between C and C++ const pointer
conversions, it's a quirk of them being standardized on different
timelines and ending up with different versions of the same thing. On
the other hand, they might have left out having ever so slightly more
useful constant expressions on purpose since people can use #define.


Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Eric Dumazet



On 03/13/2018 05:35 PM, Eric Dumazet wrote:



On 03/13/2018 05:26 PM, Eric Dumazet wrote:



On 03/13/2018 05:04 PM, Alexei Starovoitov wrote:

On 3/13/18 4:27 PM, Eric Dumazet wrote:



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:


we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.


'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was
not happy with the fix and provided something else.


any link to your old patches and discussion?

I think since mlx4 can do tso on them and the packets came out
correct on the wire, there is nothing fundamentally wrong with
changing gso_size. Just tricky to teach skb_segment.



The world is not mlx4 only. Some NIC will ask skb_segment() fallback 
segmentation for various reasons (like skb->len above a given limit 
like 16KB)


Maybe https://www.spinics.net/lists/netdev/msg255549.html



Herbert patch :

commit 9d8506cc2d7ea1f911c72c100193a3677f6668c3
Author: Herbert Xu 
Date:   Thu Nov 21 11:10:04 2013 -0800

     gso: handle new frag_list of frags GRO packets



I found my initial patch.

https://www.spinics.net/lists/netdev/msg255452.html




Re: [PATCH iproute2-next 2/2] tc: Add JSON output of fq_codel stats

2018-03-13 Thread David Ahern
On 3/8/18 2:31 PM, Toke Høiland-Jørgensen wrote:
> Enable proper JSON output support for fq_codel in `tc -s qdisc` output.
> 
> Signed-off-by: Toke Høiland-Jørgensen 
> ---
>  tc/q_fq_codel.c | 49 -
>  1 file changed, 32 insertions(+), 17 deletions(-)
> 
applied to iproute2-next.




Re: [PATCH iproute2-next 1/2] tc: Add missing documentation for codel and fq_codel parameters

2018-03-13 Thread David Ahern
On 3/8/18 2:31 PM, Toke Høiland-Jørgensen wrote:
> Add missing documentation of the memory_limit fq_codel parameter and the
> ce_threshold codel and fq_codel parameters.
> 
> Signed-off-by: Toke Høiland-Jørgensen 
> ---
>  man/man8/tc-codel.8| 10 +-
>  man/man8/tc-fq_codel.8 | 18 +-
>  tc/q_fq_codel.c|  1 +
>  3 files changed, 27 insertions(+), 2 deletions(-)

applied to iproute2-next.


Re: [PATCH iproute2/net-next] tc: f_flower: Add support for matching first frag packets

2018-03-13 Thread David Ahern
On 3/9/18 2:07 AM, Simon Horman wrote:
> From: Pieter Jansen van Vuuren 
> 
> Add matching support for distinguishing between first and later fragmented
> packets.
> 
>  # tc filter add dev eth0 protocol ip parent : \
>  flower indev eth0 \
>   ip_flags firstfrag \
> ip_proto udp \
> action mirred egress redirect dev eth1
> 
>  # tc filter add dev eth0 protocol ip parent : \
>  flower indev eth0 \
>   ip_flags nofirstfrag \
> ip_proto udp \
> action mirred egress redirect dev eth1
> 
> Signed-off-by: Pieter Jansen van Vuuren 
> Reviewed-by: Jakub Kicinski 
> Signed-off-by: Simon Horman 
> ---
>  man/man8/tc-flower.8 | 8 ++--
>  tc/f_flower.c| 1 +
>  2 files changed, 7 insertions(+), 2 deletions(-)

applied to iproute2-next.


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-13 Thread Stephen Hemminger
On Tue, 13 Mar 2018 17:36:23 -0700
"Samudrala, Sridhar"  wrote:

> OK. looks like you want to see at least some code shared between netvsc and 
> virtio_net
> even when they are using 2 different netdev models.
> Will try to add a new file under net/core and move some functions that can be 
> shared
> between netvsc and virtio_net in BACKUP mode.

Make it work first (on both) then workout how to share.
That method worked quite well for tunnels and other netdev like objects.


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-13 Thread Michael S. Tsirkin
On Tue, Mar 13, 2018 at 05:28:07PM -0700, Samudrala, Sridhar wrote:
> > I am not sure if it's a good idea to leave the
> > virtio_bypass around if running into failure: the guest is not
> > migratable as the VF doesn't have a backup path,
> 
> Are you talking about a failure when registering backup netdev?  This should 
> not
> happen, but i guess we can improve error handling in such scenario.

A nice way to do this would be to clear the backup feature bit.

> 
> > And perhaps the worse
> > part is that, it now has two interfaces with identical MAC address but
> > one of them is invalid (user cannot use the virtio interface as it has
> > a dampened datapath). IMHO the virtio_bypass and its lower netdev
> > should be destroyed at all when it fails to bind the VF, and
> > technically, there should be some way to propogate the failure status
> > to the hypervisor/backend, indicating that the VM is not migratable
> > because of guest software errors (maybe by clearing out the backup
> > feature from the guest virtio driver so host can see/learn it).
> 
> In BACKUP mode, user can only use the upper virtio_bypass netdev and that will
> always be there. Any failure to enslave VF netdev is not fatal, but i will see
> if we can improve the error handling of failure to enslave backup netdev.
> Also, i don't think the BACKUP feature bit is negotiable with the host.
> 
> Thanks
> Sridhar

All bits are negotiable.  It's up to the host whether to support
a device with this bit clear, or to fail negotiation.

-- 
MST


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-13 Thread Samudrala, Sridhar

On 3/12/2018 2:08 PM, Jiri Pirko wrote:

Mon, Mar 12, 2018 at 09:58:06PM CET, sridhar.samudr...@intel.com wrote:


On 3/12/2018 1:12 PM, Jiri Pirko wrote:

Thu, Mar 01, 2018 at 09:08:43PM CET, sridhar.samudr...@intel.com wrote:

This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. It allows live migration
of a VM with a direct attached VF without the need to setup a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to enable only one datapath at any time so that
packets don't get looped back to the VM over the other datapath. When a VF
is plugged, the virtio datapath link state can be marked as down. The
hypervisor needs to unplug the VF device from the guest on the source host
and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

When BACKUP feature is enabled, an additional netdev(bypass netdev) is
created that acts as a master device and tracks the state of the 2 lower
netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
passthru device with the same MAC is registered as 'active' netdev.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala 
Signed-off-by: Alexander Duyck 
Reviewed-by: Jesse Brandeburg 

[...]



+static int virtnet_bypass_register_child(struct net_device *child_netdev)
+{
+   struct virtnet_bypass_info *vbi;
+   struct net_device *dev;
+   bool backup;
+   int ret;
+
+   if (child_netdev->addr_len != ETH_ALEN)
+   return NOTIFY_DONE;
+
+   /* We will use the MAC address to locate the virtnet_bypass netdev
+* to associate with the child netdev. If we don't find a matching
+* bypass netdev, move on.
+*/
+   dev = get_virtnet_bypass_bymac(dev_net(child_netdev),
+  child_netdev->perm_addr);
+   if (!dev)
+   return NOTIFY_DONE;
+
+   vbi = netdev_priv(dev);
+   backup = (child_netdev->dev.parent == dev->dev.parent);
+   if (backup ? rtnl_dereference(vbi->backup_netdev) :
+   rtnl_dereference(vbi->active_netdev)) {
+   netdev_info(dev,
+   "%s attempting to join bypass dev when %s already 
present\n",
+   child_netdev->name, backup ? "backup" : "active");
+   return NOTIFY_DONE;
+   }
+
+   /* Avoid non pci devices as active netdev */
+   if (!backup && (!child_netdev->dev.parent ||
+   !dev_is_pci(child_netdev->dev.parent)))
+   return NOTIFY_DONE;
+
+   ret = netdev_rx_handler_register(child_netdev,
+virtnet_bypass_handle_frame, dev);
+   if (ret != 0) {
+   netdev_err(child_netdev,
+  "can not register bypass receive handler (err = 
%d)\n",
+  ret);
+   goto rx_handler_failed;
+   }
+
+   ret = netdev_upper_dev_link(child_netdev, dev, NULL);
+   if (ret != 0) {
+   netdev_err(child_netdev,
+  "can not set master device %s (err = %d)\n",
+  dev->name, ret);
+   goto upper_link_failed;
+   }
+
+   child_netdev->flags |= IFF_SLAVE;
+
+   if (netif_running(dev)) {
+   ret = dev_open(child_netdev);
+   if (ret && (ret != -EBUSY)) {
+   netdev_err(dev, "Opening child %s failed ret:%d\n",
+  child_netdev->name, ret);
+   goto err_interface_up;
+   }
+   }

Much of this function is copy of netvsc_vf_join, should be shared with
netvsc.

Yes. This is based on netvsc_register_vf() and netvsc_vf_join(). But modified
to handle registration of 2 child netdevs(backup virtio and active vf).  In case
of netvsc, they only register VF. There is no backup netdev.
I think trying to make this code shareable with netvsc will introduce additional
checks and make it convoluted.

No problem.


Not clear what you meant here?
Want to confirm that you are agreeing that it is OK to not share this function
with netvsc.










+
+   /* Align MTU of child with master */
+   ret = dev_set_mtu(child_netdev, dev->mtu);
+   if (ret) {
+   netdev_err(dev,
+  "unable to change mtu of %s to %u register failed\n",
+  child_netdev->name, dev->mtu);
+   goto err_set_mtu;
+   }
+
+   call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
+
+ 

[PATCH net-next 4/6] ibmvnic: Update TX pool initialization routine

2018-03-13 Thread Thomas Falcon
Introduce function that initializes one TX pool. Use that to
create each pool entry in both the standard TX pool and TSO
pool arrays.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 90 --
 1 file changed, 48 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index e6e934e..cda5bf0 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -635,13 +635,43 @@ static void release_tx_pools(struct ibmvnic_adapter 
*adapter)
adapter->num_active_tx_pools = 0;
 }
 
+static int init_one_tx_pool(struct net_device *netdev,
+   struct ibmvnic_tx_pool *tx_pool,
+   int num_entries, int buf_size)
+{
+   struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+   int i;
+
+   tx_pool->tx_buff = kcalloc(num_entries,
+  sizeof(struct ibmvnic_tx_buff),
+  GFP_KERNEL);
+   if (!tx_pool->tx_buff)
+   return -1;
+
+   if (alloc_long_term_buff(adapter, &tx_pool->long_term_buff,
+num_entries * buf_size))
+   return -1;
+
+   tx_pool->free_map = kcalloc(num_entries, sizeof(int), GFP_KERNEL);
+   if (!tx_pool->free_map)
+   return -1;
+
+   for (i = 0; i < num_entries; i++)
+   tx_pool->free_map[i] = i;
+
+   tx_pool->consumer_index = 0;
+   tx_pool->producer_index = 0;
+   tx_pool->num_buffers = num_entries;
+   tx_pool->buf_size = buf_size;
+
+   return 0;
+}
+
 static int init_tx_pools(struct net_device *netdev)
 {
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-   struct device *dev = >vdev->dev;
-   struct ibmvnic_tx_pool *tx_pool;
int tx_subcrqs;
-   int i, j;
+   int i, rc;
 
tx_subcrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
adapter->tx_pool = kcalloc(tx_subcrqs,
@@ -649,53 +679,29 @@ static int init_tx_pools(struct net_device *netdev)
if (!adapter->tx_pool)
return -1;
 
+   adapter->tso_pool = kcalloc(tx_subcrqs,
+   sizeof(struct ibmvnic_tx_pool), GFP_KERNEL);
+   if (!adapter->tso_pool)
+   return -1;
+
adapter->num_active_tx_pools = tx_subcrqs;
 
for (i = 0; i < tx_subcrqs; i++) {
-   tx_pool = >tx_pool[i];
-
-   netdev_dbg(adapter->netdev,
-  "Initializing tx_pool[%d], %lld buffs\n",
-  i, adapter->req_tx_entries_per_subcrq);
-
-   tx_pool->tx_buff = kcalloc(adapter->req_tx_entries_per_subcrq,
-  sizeof(struct ibmvnic_tx_buff),
-  GFP_KERNEL);
-   if (!tx_pool->tx_buff) {
-   dev_err(dev, "tx pool buffer allocation failed\n");
-   release_tx_pools(adapter);
-   return -1;
-   }
-
-   if (alloc_long_term_buff(adapter, _pool->long_term_buff,
-adapter->req_tx_entries_per_subcrq *
-(adapter->req_mtu + VLAN_HLEN))) {
-   release_tx_pools(adapter);
-   return -1;
-   }
-
-   /* alloc TSO ltb */
-   if (alloc_long_term_buff(adapter, _pool->tso_ltb,
-IBMVNIC_TSO_BUFS *
-IBMVNIC_TSO_BUF_SZ)) {
+   rc = init_one_tx_pool(netdev, >tx_pool[i],
+ adapter->req_tx_entries_per_subcrq,
+ adapter->req_mtu + VLAN_HLEN);
+   if (rc) {
release_tx_pools(adapter);
-   return -1;
+   return rc;
}
 
-   tx_pool->tso_index = 0;
-
-   tx_pool->free_map = kcalloc(adapter->req_tx_entries_per_subcrq,
-   sizeof(int), GFP_KERNEL);
-   if (!tx_pool->free_map) {
+   init_one_tx_pool(netdev, >tso_pool[i],
+IBMVNIC_TSO_BUFS,
+IBMVNIC_TSO_BUF_SZ);
+   if (rc) {
release_tx_pools(adapter);
-   return -1;
+   return rc;
}
-
-   for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++)
-   tx_pool->free_map[j] = j;
-
-   tx_pool->consumer_index = 0;
-   tx_pool->producer_index = 0;
}
 
return 0;
-- 
1.8.3.1



Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Eric Dumazet



On 03/13/2018 05:26 PM, Eric Dumazet wrote:



On 03/13/2018 05:04 PM, Alexei Starovoitov wrote:

On 3/13/18 4:27 PM, Eric Dumazet wrote:



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:


we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.


'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was
not happy with the fix and provided something else.


any link to your old patches and discussion?

I think since mlx4 can do tso on them and the packets came out
correct on the wire, there is nothing fundamentally wrong with
changing gso_size. Just tricky to teach skb_segment.



The world is not mlx4 only. Some NIC will ask skb_segment() fallback 
segmentation for various reasons (like skb->len above a given limit like 
16KB)


Maybe https://www.spinics.net/lists/netdev/msg255549.html



Herbert patch :

commit 9d8506cc2d7ea1f911c72c100193a3677f6668c3
Author: Herbert Xu 
Date:   Thu Nov 21 11:10:04 2013 -0800

gso: handle new frag_list of frags GRO packets



[PATCH net-next 1/6] ibmvnic: Generalize TX pool structure

2018-03-13 Thread Thomas Falcon
Remove some unused fields in the structure and include values
describing the individual buffer size and number of buffers in
a TX pool. This allows us to use these fields for TX pool buffer
accounting as opposed to using hard coded values. Finally, split
TSO buffers out and provide an additional TX pool array for TSO.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.h | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.h 
b/drivers/net/ethernet/ibm/ibmvnic.h
index 099c89d..a2e21b3 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -917,11 +917,9 @@ struct ibmvnic_tx_pool {
int *free_map;
int consumer_index;
int producer_index;
-   wait_queue_head_t ibmvnic_tx_comp_q;
-   struct task_struct *work_thread;
struct ibmvnic_long_term_buff long_term_buff;
-   struct ibmvnic_long_term_buff tso_ltb;
-   int tso_index;
+   int num_buffers;
+   int buf_size;
 };
 
 struct ibmvnic_rx_buff {
@@ -1044,6 +1042,7 @@ struct ibmvnic_adapter {
u64 promisc;
 
struct ibmvnic_tx_pool *tx_pool;
+   struct ibmvnic_tx_pool *tso_pool;
struct completion init_done;
int init_done_rc;
 
-- 
1.8.3.1



[PATCH net-next 5/6] ibmvnic: Update TX and TX completion routines

2018-03-13 Thread Thomas Falcon
Update TX and TX completion routines to account for TX pool
restructuring. TX routine first chooses the pool depending
on whether a packet is GSO or not, then uses it accordingly.

For the completion routine to know which pool it needs to use,
set the most significant bit of the correlator index to one
if the packet uses the TSO pool. On completion, unset the bit
and use the correlator index to release the buffer pool entry.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 55 +++---
 drivers/net/ethernet/ibm/ibmvnic.h |  1 +
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index cda5bf0..3b752e3 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1414,8 +1414,11 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
ret = NETDEV_TX_OK;
goto out;
}
+   if (skb_is_gso(skb))
+   tx_pool = >tso_pool[queue_num];
+   else
+   tx_pool = >tx_pool[queue_num];
 
-   tx_pool = >tx_pool[queue_num];
tx_scrq = adapter->tx_scrq[queue_num];
txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
@@ -1423,20 +1426,10 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
 
index = tx_pool->free_map[tx_pool->consumer_index];
 
-   if (skb_is_gso(skb)) {
-   offset = tx_pool->tso_index * IBMVNIC_TSO_BUF_SZ;
-   dst = tx_pool->tso_ltb.buff + offset;
-   memset(dst, 0, IBMVNIC_TSO_BUF_SZ);
-   data_dma_addr = tx_pool->tso_ltb.addr + offset;
-   tx_pool->tso_index++;
-   if (tx_pool->tso_index == IBMVNIC_TSO_BUFS)
-   tx_pool->tso_index = 0;
-   } else {
-   offset = index * (adapter->req_mtu + VLAN_HLEN);
-   dst = tx_pool->long_term_buff.buff + offset;
-   memset(dst, 0, adapter->req_mtu + VLAN_HLEN);
-   data_dma_addr = tx_pool->long_term_buff.addr + offset;
-   }
+   offset = index * tx_pool->buf_size;
+   dst = tx_pool->long_term_buff.buff + offset;
+   memset(dst, 0, tx_pool->buf_size);
+   data_dma_addr = tx_pool->long_term_buff.addr + offset;
 
if (skb_shinfo(skb)->nr_frags) {
int cur, i;
@@ -1459,8 +1452,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
}
 
tx_pool->consumer_index =
-   (tx_pool->consumer_index + 1) %
-   adapter->req_tx_entries_per_subcrq;
+   (tx_pool->consumer_index + 1) % tx_pool->num_buffers;
 
tx_buff = _pool->tx_buff[index];
tx_buff->skb = skb;
@@ -1476,11 +1468,13 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_crq.v1.n_crq_elem = 1;
tx_crq.v1.n_sge = 1;
tx_crq.v1.flags1 = IBMVNIC_TX_COMP_NEEDED;
-   tx_crq.v1.correlator = cpu_to_be32(index);
+
if (skb_is_gso(skb))
-   tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->tso_ltb.map_id);
+   tx_crq.v1.correlator =
+   cpu_to_be32(index | IBMVNIC_TSO_POOL_MASK);
else
-   tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->long_term_buff.map_id);
+   tx_crq.v1.correlator = cpu_to_be32(index);
+   tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->long_term_buff.map_id);
tx_crq.v1.sge_len = cpu_to_be32(skb->len);
tx_crq.v1.ioba = cpu_to_be64(data_dma_addr);
 
@@ -1543,7 +1537,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
 
if (tx_pool->consumer_index == 0)
tx_pool->consumer_index =
-   adapter->req_tx_entries_per_subcrq - 1;
+   tx_pool->num_buffers - 1;
else
tx_pool->consumer_index--;
 
@@ -2545,6 +2539,7 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter 
*adapter,
   struct ibmvnic_sub_crq_queue *scrq)
 {
struct device *dev = >vdev->dev;
+   struct ibmvnic_tx_pool *tx_pool;
struct ibmvnic_tx_buff *txbuff;
union sub_crq *next;
int index;
@@ -2564,7 +2559,14 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter 
*adapter,
continue;
}
index = be32_to_cpu(next->tx_comp.correlators[i]);
-   txbuff = >tx_pool[pool].tx_buff[index];
+   if (index & IBMVNIC_TSO_POOL_MASK) {
+   tx_pool = >tso_pool[pool];
+   index &= ~IBMVNIC_TSO_POOL_MASK;
+   } else {
+   tx_pool = 

[PATCH net-next 6/6] ibmvnic: Improve TX buffer accounting

2018-03-13 Thread Thomas Falcon
Improve TX pool buffer accounting to prevent the producer
index from overrunning the consumer. First, set the next free
index to an invalid value if it is in use. If the next buffer
to be consumed is in use, drop the packet.

Finally, if the transmit fails for some other reason, roll
back the consumer index and set the free map entry to its original
value. This should also be done if the DMA map fails.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 30 +-
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 3b752e3..f599dad 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1426,6 +1426,16 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
 
index = tx_pool->free_map[tx_pool->consumer_index];
 
+   if (index == IBMVNIC_INVALID_MAP) {
+   dev_kfree_skb_any(skb);
+   tx_send_failed++;
+   tx_dropped++;
+   ret = NETDEV_TX_OK;
+   goto out;
+   }
+
+   tx_pool->free_map[tx_pool->consumer_index] = IBMVNIC_INVALID_MAP;
+
offset = index * tx_pool->buf_size;
dst = tx_pool->long_term_buff.buff + offset;
memset(dst, 0, tx_pool->buf_size);
@@ -1522,7 +1532,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_map_failed++;
tx_dropped++;
ret = NETDEV_TX_OK;
-   goto out;
+   goto tx_err_out;
}
lpar_rc = send_subcrq_indirect(adapter, handle_array[queue_num],
   (u64)tx_buff->indir_dma,
@@ -1534,13 +1544,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
}
if (lpar_rc != H_SUCCESS) {
dev_err(dev, "tx failed with code %ld\n", lpar_rc);
-
-   if (tx_pool->consumer_index == 0)
-   tx_pool->consumer_index =
-   tx_pool->num_buffers - 1;
-   else
-   tx_pool->consumer_index--;
-
dev_kfree_skb_any(skb);
tx_buff->skb = NULL;
 
@@ -1556,7 +1559,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_send_failed++;
tx_dropped++;
ret = NETDEV_TX_OK;
-   goto out;
+   goto tx_err_out;
}
 
if (atomic_add_return(num_entries, _scrq->used)
@@ -1569,7 +1572,16 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_bytes += skb->len;
txq->trans_start = jiffies;
ret = NETDEV_TX_OK;
+   goto out;
 
+tx_err_out:
+   /* roll back consumer index and map array*/
+   if (tx_pool->consumer_index == 0)
+   tx_pool->consumer_index =
+   tx_pool->num_buffers - 1;
+   else
+   tx_pool->consumer_index--;
+   tx_pool->free_map[tx_pool->consumer_index] = index;
 out:
netdev->stats.tx_dropped += tx_dropped;
netdev->stats.tx_bytes += tx_bytes;
-- 
1.8.3.1



[PATCH net-next 3/6] ibmvnic: Update release TX pool routine

2018-03-13 Thread Thomas Falcon
Introduce function that frees one TX pool.  Use that to release
each pool in both the standard TX pool and TSO pool arrays.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 81a4cd9..e6e934e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -608,25 +608,30 @@ static void release_vpd_data(struct ibmvnic_adapter 
*adapter)
adapter->vpd = NULL;
 }
 
+static void release_one_tx_pool(struct ibmvnic_adapter *adapter,
+   struct ibmvnic_tx_pool *tx_pool)
+{
+   kfree(tx_pool->tx_buff);
+   kfree(tx_pool->free_map);
+   free_long_term_buff(adapter, _pool->long_term_buff);
+}
+
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
 {
-   struct ibmvnic_tx_pool *tx_pool;
int i;
 
if (!adapter->tx_pool)
return;
 
for (i = 0; i < adapter->num_active_tx_pools; i++) {
-   netdev_dbg(adapter->netdev, "Releasing tx_pool[%d]\n", i);
-   tx_pool = >tx_pool[i];
-   kfree(tx_pool->tx_buff);
-   free_long_term_buff(adapter, _pool->long_term_buff);
-   free_long_term_buff(adapter, _pool->tso_ltb);
-   kfree(tx_pool->free_map);
+   release_one_tx_pool(adapter, >tx_pool[i]);
+   release_one_tx_pool(adapter, >tso_pool[i]);
}
 
kfree(adapter->tx_pool);
adapter->tx_pool = NULL;
+   kfree(adapter->tso_pool);
+   adapter->tso_pool = NULL;
adapter->num_active_tx_pools = 0;
 }
 
-- 
1.8.3.1



[PATCH net-next 2/6] ibmvnic: Update and clean up reset TX pool routine

2018-03-13 Thread Thomas Falcon
Update TX pool reset routine to accommodate new TSO pool array. Introduce
a function that resets one TX pool, and use that function to initialize
each pool in both pool arrays.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 45 +-
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 6ff43d7..81a4cd9 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -557,36 +557,41 @@ static int init_rx_pools(struct net_device *netdev)
return 0;
 }
 
+static int reset_one_tx_pool(struct ibmvnic_adapter *adapter,
+struct ibmvnic_tx_pool *tx_pool)
+{
+   int rc, i;
+
+   rc = reset_long_term_buff(adapter, _pool->long_term_buff);
+   if (rc)
+   return rc;
+
+   memset(tx_pool->tx_buff, 0,
+  tx_pool->num_buffers *
+  sizeof(struct ibmvnic_tx_buff));
+
+   for (i = 0; i < tx_pool->num_buffers; i++)
+   tx_pool->free_map[i] = i;
+
+   tx_pool->consumer_index = 0;
+   tx_pool->producer_index = 0;
+
+   return 0;
+}
+
 static int reset_tx_pools(struct ibmvnic_adapter *adapter)
 {
-   struct ibmvnic_tx_pool *tx_pool;
int tx_scrqs;
-   int i, j, rc;
+   int i, rc;
 
tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
for (i = 0; i < tx_scrqs; i++) {
-   netdev_dbg(adapter->netdev, "Re-setting tx_pool[%d]\n", i);
-
-   tx_pool = >tx_pool[i];
-
-   rc = reset_long_term_buff(adapter, _pool->long_term_buff);
+   rc = reset_one_tx_pool(adapter, >tso_pool[i]);
if (rc)
return rc;
-
-   rc = reset_long_term_buff(adapter, _pool->tso_ltb);
+   rc = reset_one_tx_pool(adapter, >tx_pool[i]);
if (rc)
return rc;
-
-   memset(tx_pool->tx_buff, 0,
-  adapter->req_tx_entries_per_subcrq *
-  sizeof(struct ibmvnic_tx_buff));
-
-   for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++)
-   tx_pool->free_map[j] = j;
-
-   tx_pool->consumer_index = 0;
-   tx_pool->producer_index = 0;
-   tx_pool->tso_index = 0;
}
 
return 0;
-- 
1.8.3.1



[PATCH net-next 0/6] ibmvnic: Update TX pool and TX routines

2018-03-13 Thread Thomas Falcon
This patch restructures the TX pool data structure and provides a
separate TX pool array for TSO transmissions. This is already used
in some way due to our unique DMA situation, namely that we cannot
use single DMA mappings for packet data. Previously, both buffer
arrays used the same pool entry. This restructuring allows for
some additional cleanup in the driver code, especially in some
places in the device transmit routine.

In addition, it allows us to more easily track the consumer
and producer indexes of a particular pool. This has been
further improved by better tracking of in-use buffers to
prevent possible data corruption in case an invalid buffer
entry is used.

Thomas Falcon (6):
  ibmvnic: Generalize TX pool structure
  ibmvnic: Update and clean up reset TX pool routine
  ibmvnic: Update release TX pool routine
  ibmvnic: Update TX pool initialization routine
  ibmvnic: Update TX and TX completion routines
  ibmvnic: Improve TX buffer accounting

 drivers/net/ethernet/ibm/ibmvnic.c | 235 +
 drivers/net/ethernet/ibm/ibmvnic.h |   8 +-
 2 files changed, 136 insertions(+), 107 deletions(-)

-- 
1.8.3.1



Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-13 Thread Samudrala, Sridhar

On 3/12/2018 3:44 PM, Siwei Liu wrote:

Apologies, still some comments going. Please see inline.

On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
 wrote:

This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. It allows live migration
of a VM with a direct attached VF without the need to setup a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to enable only one datapath at any time so that
packets don't get looped back to the VM over the other datapath. When a VF
is plugged, the virtio datapath link state can be marked as down. The
hypervisor needs to unplug the VF device from the guest on the source host
and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

When BACKUP feature is enabled, an additional netdev(bypass netdev) is
created that acts as a master device and tracks the state of the 2 lower
netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
passthru device with the same MAC is registered as 'active' netdev.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization=151189725224231=2

Signed-off-by: Sridhar Samudrala 
Signed-off-by: Alexander Duyck 
Reviewed-by: Jesse Brandeburg 
---
  drivers/net/virtio_net.c | 683 ++-
  1 file changed, 682 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bcd13fe906ca..f2860d86c952 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,6 +30,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  #include 
  #include 

@@ -206,6 +208,9 @@ struct virtnet_info {
 u32 speed;

 unsigned long guest_offloads;
+
+   /* upper netdev created when BACKUP feature enabled */
+   struct net_device *bypass_netdev;
  };

  struct padded_vnet_hdr {
@@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct 
netdev_bpf *xdp)
 }
  }

+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
+ size_t len)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+   int ret;
+
+   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
+   return -EOPNOTSUPP;
+
+   ret = snprintf(buf, len, "_bkup");
+   if (ret >= len)
+   return -EOPNOTSUPP;
+
+   return 0;
+}
+
  static const struct net_device_ops virtnet_netdev = {
 .ndo_open= virtnet_open,
 .ndo_stop= virtnet_close,
@@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev = {
 .ndo_xdp_xmit   = virtnet_xdp_xmit,
 .ndo_xdp_flush  = virtnet_xdp_flush,
 .ndo_features_check = passthru_features_check,
+   .ndo_get_phys_port_name = virtnet_get_phys_port_name,
  };

  static void virtnet_config_changed_work(struct work_struct *work)
@@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device *vdev)
 return 0;
  }

+/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
+ * When BACKUP feature is enabled, an additional netdev(bypass netdev)
+ * is created that acts as a master device and tracks the state of the
+ * 2 lower netdevs. The original virtio_net netdev is registered as
+ * 'backup' netdev and a passthru device with the same MAC is registered
+ * as 'active' netdev.
+ */
+
+/* bypass state maintained when BACKUP feature is enabled */
+struct virtnet_bypass_info {
+   /* passthru netdev with same MAC */
+   struct net_device __rcu *active_netdev;
+
+   /* virtio_net netdev */
+   struct net_device __rcu *backup_netdev;
+
+   /* active netdev stats */
+   struct rtnl_link_stats64 active_stats;
+
+   /* backup netdev stats */
+   struct rtnl_link_stats64 backup_stats;
+
+   /* aggregated stats */
+   struct rtnl_link_stats64 bypass_stats;
+
+   /* spinlock while updating stats */
+   spinlock_t stats_lock;
+};
+
+static void virtnet_bypass_child_open(struct net_device *dev,
+ struct net_device *child_netdev)
+{
+   int err = dev_open(child_netdev);
+
+   if (err)
+   netdev_warn(dev, "unable to open slave: %s: %d\n",
+   child_netdev->name, err);
+}

Why ignoring the error?? i.e. virtnet_bypass_child_open should have
return value. I don't believe the caller is in a safe context to
guarantee the dev_open always succeeds.


OK.  Will handle this in the next revision.





+
+static int virtnet_bypass_open(struct 

Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Eric Dumazet



On 03/13/2018 05:04 PM, Alexei Starovoitov wrote:

On 3/13/18 4:27 PM, Eric Dumazet wrote:



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:


we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.


'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was
not happy with the fix and provided something else.


any link to your old patches and discussion?

I think since mlx4 can do tso on them and the packets came out
correct on the wire, there is nothing fundamentally wrong with
changing gso_size. Just tricky to teach skb_segment.



The world is not mlx4 only. Some NIC will ask skb_segment() fallback 
segmentation for various reasons (like skb->len above a given limit like 
16KB)


Maybe https://www.spinics.net/lists/netdev/msg255549.html



Re: [PATCH] pktgen: use dynamic allocation for debug print buffer

2018-03-13 Thread David Miller
From: Arnd Bergmann 
Date: Tue, 13 Mar 2018 21:58:39 +0100

> After the removal of the VLA, we get a harmless warning about a large
> stack frame:
> 
> net/core/pktgen.c: In function 'pktgen_if_write':
> net/core/pktgen.c:1710:1: error: the frame size of 1076 bytes is larger than 
> 1024 bytes [-Werror=frame-larger-than=]
> 
> The function was previously shown to be safe despite hitting
> the 1024 bye warning level. To get rid of the annoyging warning,
> while keeping it readable, this changes it to use strndup_user().
> 
> Obviously this is not a fast path, so the kmalloc() overhead
> can be disregarded.
> 
> Fixes: 35951393bbff ("pktgen: Remove VLA usage")
> Signed-off-by: Arnd Bergmann 

Applied, thanks.


Re: [PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018

2018-03-13 Thread Joe Perches
On Tue, 2018-03-13 at 18:15 -0600, Subash Abhinov Kasiviswanathan wrote:
> > Did any work actually occur on all these files in 2018?
> > $ git log --name-only --since=01-01-2018
> > drivers/net/ethernet/qualcomm/rmnet/ | \
> >   grep "^drivers" | sort | uniq
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
> > drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
> 
> Hi
> 
> There were changes to all these files as you pointed out
> except for -
> drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
> drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h

Yup.  That's why I sent that list.

Files without modifications should generally not
have updated copyrights.

cheers, Joe


Re: [PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018

2018-03-13 Thread Subash Abhinov Kasiviswanathan

Did any work actually occur on all these files in 2018?
$ git log --name-only --since=01-01-2018
drivers/net/ethernet/qualcomm/rmnet/ | \
  grep "^drivers" | sort | uniq
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c


Hi

There were changes to all these files as you pointed out
except for -
drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project


Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Alexei Starovoitov

On 3/13/18 4:27 PM, Eric Dumazet wrote:



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:


we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.


'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was
not happy with the fix and provided something else.


any link to your old patches and discussion?

I think since mlx4 can do tso on them and the packets came out
correct on the wire, there is nothing fundamentally wrong with
changing gso_size. Just tricky to teach skb_segment.


GSO_DODGY has nothing to do with the problem really.

Changing gso_size is breaking GRO since it ends up changing the number
of segments on the wire. TCP is not going to be happy, so you'll also
have to fix TCP eventually.






Re: [PATCH] bpf: whitelist syscalls for error injection

2018-03-13 Thread Howard McLauchlan
On 03/13/2018 04:49 PM, Yonghong Song wrote:
> 
> 
> On 3/13/18 4:45 PM, Omar Sandoval wrote:
>> On Tue, Mar 13, 2018 at 04:16:27PM -0700, Howard McLauchlan wrote:
>>> Error injection is a useful mechanism to fail arbitrary kernel
>>> functions. However, it is often hard to guarantee an error propagates
>>> appropriately to user space programs. By injecting into syscalls, we can
>>> return arbitrary values to user space directly; this increases
>>> flexibility and robustness in testing, allowing us to test user space
>>> error paths effectively.
>>>
>>> The following script, for example, fails calls to sys_open() from a
>>> given pid:
>>>
>>> from bcc import BPF
>>> from sys import argv
>>>
>>> pid = argv[1]
>>>
>>> prog = r"""
>>>
>>> int kprobe__SyS_open(struct pt_regs *ctx, const char *pathname, int flags)
>>> {
>>>  u32 pid = bpf_get_current_pid_tgid();
>>>  if (pid == %s)
>>>  bpf_override_return(ctx, -ENOENT);
>>>  return 0;
>>> }
>>> """ % pid
>>>
>>> b = BPF(text = prog)
>>> while 1:
>>>  b.perf_buffer_poll()
>>>
>>> This patch whitelists all syscalls defined with SYSCALL_DEFINE for error
>>> injection.
>>>
>>> Signed-off-by: Howard McLauchlan 
>>> ---
>>> based on 4.16-rc5
>>>   include/linux/syscalls.h | 3 +++
>>>   1 file changed, 3 insertions(+)
>>>
>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>> index a78186d826d7..e8c6d63ace78 100644
>>> --- a/include/linux/syscalls.h
>>> +++ b/include/linux/syscalls.h
>>> @@ -191,6 +191,8 @@ static inline int is_syscall_trace_event(struct 
>>> trace_event_call *tp_event)
>>>     #define SYSCALL_DEFINE0(sname)    \
>>>   SYSCALL_METADATA(_##sname, 0);    \
>>> +    asmlinkage long sys_##sname(void);    \
>>> +    ALLOW_ERROR_INJECTION(sys_##sname, ERRNO);    \
>>>   asmlinkage long sys_##sname(void)
> 
> duplication of asmlinkage in the above?
> 
The pre-declaration is necessary to ensure ALLOW_ERROR_INJECTION works 
appropriately. There can be syscalls
that are not pre-declared elsewhere which will fail compilation if not declared 
in this block.
>>>     #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, 
>>> __VA_ARGS__)
>>> @@ -210,6 +212,7 @@ static inline int is_syscall_trace_event(struct 
>>> trace_event_call *tp_event)
>>>   #define __SYSCALL_DEFINEx(x, name, ...)    \
>>>   asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))    \
>>>   __attribute__((alias(__stringify(SyS##name;    \
>>> +    ALLOW_ERROR_INJECTION(sys##name, ERRNO);    \
>>>   static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));    \
>>>   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));    \
>>>   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))    \
>>> -- 
>>> 2.14.1
>>>
>>
>> Adding a few more people to Cc
>>


Re: [PATCH] bpf: whitelist syscalls for error injection

2018-03-13 Thread Yonghong Song



On 3/13/18 4:45 PM, Omar Sandoval wrote:

On Tue, Mar 13, 2018 at 04:16:27PM -0700, Howard McLauchlan wrote:

Error injection is a useful mechanism to fail arbitrary kernel
functions. However, it is often hard to guarantee an error propagates
appropriately to user space programs. By injecting into syscalls, we can
return arbitrary values to user space directly; this increases
flexibility and robustness in testing, allowing us to test user space
error paths effectively.

The following script, for example, fails calls to sys_open() from a
given pid:

from bcc import BPF
from sys import argv

pid = argv[1]

prog = r"""

int kprobe__SyS_open(struct pt_regs *ctx, const char *pathname, int flags)
{
 u32 pid = bpf_get_current_pid_tgid();
 if (pid == %s)
 bpf_override_return(ctx, -ENOENT);
 return 0;
}
""" % pid

b = BPF(text = prog)
while 1:
 b.perf_buffer_poll()

This patch whitelists all syscalls defined with SYSCALL_DEFINE for error
injection.

Signed-off-by: Howard McLauchlan 
---
based on 4.16-rc5
  include/linux/syscalls.h | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a78186d826d7..e8c6d63ace78 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,8 @@ static inline int is_syscall_trace_event(struct 
trace_event_call *tp_event)
  
  #define SYSCALL_DEFINE0(sname)	\

SYSCALL_METADATA(_##sname, 0);  \
+   asmlinkage long sys_##sname(void);  \
+   ALLOW_ERROR_INJECTION(sys_##sname, ERRNO);  \
asmlinkage long sys_##sname(void)


duplication of asmlinkage in the above?

  
  #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)

@@ -210,6 +212,7 @@ static inline int is_syscall_trace_event(struct 
trace_event_call *tp_event)
  #define __SYSCALL_DEFINEx(x, name, ...)   
\
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))   \
__attribute__((alias(__stringify(SyS##name; \
+   ALLOW_ERROR_INJECTION(sys##name, ERRNO);\
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));  \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))   \
--
2.14.1



Adding a few more people to Cc



Re: [PATCH] bpf: whitelist syscalls for error injection

2018-03-13 Thread Omar Sandoval
On Tue, Mar 13, 2018 at 04:16:27PM -0700, Howard McLauchlan wrote:
> Error injection is a useful mechanism to fail arbitrary kernel
> functions. However, it is often hard to guarantee an error propagates
> appropriately to user space programs. By injecting into syscalls, we can
> return arbitrary values to user space directly; this increases
> flexibility and robustness in testing, allowing us to test user space
> error paths effectively.
> 
> The following script, for example, fails calls to sys_open() from a
> given pid:
> 
> from bcc import BPF
> from sys import argv
> 
> pid = argv[1]
> 
> prog = r"""
> 
> int kprobe__SyS_open(struct pt_regs *ctx, const char *pathname, int flags)
> {
> u32 pid = bpf_get_current_pid_tgid();
> if (pid == %s)
> bpf_override_return(ctx, -ENOENT);
> return 0;
> }
> """ % pid
> 
> b = BPF(text = prog)
> while 1:
> b.perf_buffer_poll()
> 
> This patch whitelists all syscalls defined with SYSCALL_DEFINE for error
> injection.
> 
> Signed-off-by: Howard McLauchlan 
> ---
> based on 4.16-rc5
>  include/linux/syscalls.h | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a78186d826d7..e8c6d63ace78 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,8 @@ static inline int is_syscall_trace_event(struct 
> trace_event_call *tp_event)
>  
>  #define SYSCALL_DEFINE0(sname)   \
>   SYSCALL_METADATA(_##sname, 0);  \
> + asmlinkage long sys_##sname(void);  \
> + ALLOW_ERROR_INJECTION(sys_##sname, ERRNO);  \
>   asmlinkage long sys_##sname(void)
>  
>  #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
> @@ -210,6 +212,7 @@ static inline int is_syscall_trace_event(struct 
> trace_event_call *tp_event)
>  #define __SYSCALL_DEFINEx(x, name, ...)  
> \
>   asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))   \
> 	__attribute__((alias(__stringify(SyS##name)))); \
> + ALLOW_ERROR_INJECTION(sys##name, ERRNO);\
>   static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
>   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));  \
>   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))   \
> -- 
> 2.14.1
> 

Adding a few more people to Cc


[PATCH 2/2] hns: Clean up string operations

2018-03-13 Thread Ben Hutchings
The driver-internal string operations are only ever used for
statistics, so remove the stringset parameters and rename them
accordingly.

Signed-off-by: Ben Hutchings 
---
 drivers/net/ethernet/hisilicon/hns/hnae.h  | 13 
 drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c  | 37 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c | 16 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c  |  9 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h  | 10 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 28 ++--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h |  6 ++--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c  | 11 +++
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h  |  4 +--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c  | 20 
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h  |  4 +--
 .../net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c| 24 +-
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c   |  9 +++---
 13 files changed, 78 insertions(+), 113 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h 
b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 3e62692af011..480fa2b49be7 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -457,10 +457,10 @@ enum hnae_media_type {
  *   update Old network device statistics
  * get_ethtool_stats()
  *   get ethtool network device statistics
- * get_strings()
- *   get a set of strings that describe the requested objects
- * get_sset_count()
- *   get number of strings that @get_strings will write
+ * get_stats_strings()
+ *   get a set of strings that describe the statistics
+ * get_stats_count()
+ *   get number of strings that @get_stats_strings will write
  * update_led_status()
  *   update the led status
  * set_led_id()
@@ -522,9 +522,8 @@ struct hnae_ae_ops {
void (*update_stats)(struct hnae_handle *handle,
 struct net_device_stats *net_stats);
void (*get_stats)(struct hnae_handle *handle, u64 *data);
-   void (*get_strings)(struct hnae_handle *handle,
-   u32 stringset, u8 *data);
-   int (*get_sset_count)(struct hnae_handle *handle, int stringset);
+   void (*get_stats_strings)(struct hnae_handle *handle, u8 *data);
+   int (*get_stats_count)(struct hnae_handle *handle);
void (*update_led_status)(struct hnae_handle *handle);
int (*set_led_id)(struct hnae_handle *handle,
  enum hnae_led_state status);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c 
b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
index bd68379d2bea..638476aa6311 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
@@ -679,21 +679,20 @@ void hns_ae_get_stats(struct hnae_handle *handle, u64 
*data)
 
for (idx = 0; idx < handle->q_num; idx++) {
hns_rcb_get_stats(handle->qs[idx], p);
-   p += hns_rcb_get_ring_sset_count((int)ETH_SS_STATS);
+   p += hns_rcb_get_ring_stats_count();
}
 
hns_ppe_get_stats(ppe_cb, p);
-   p += hns_ppe_get_sset_count((int)ETH_SS_STATS);
+   p += hns_ppe_get_stats_count();
 
hns_mac_get_stats(mac_cb, p);
-   p += hns_mac_get_sset_count(mac_cb, (int)ETH_SS_STATS);
+   p += hns_mac_get_stats_count(mac_cb);
 
if (mac_cb->mac_type == HNAE_PORT_SERVICE)
hns_dsaf_get_stats(vf_cb->dsaf_dev, p, vf_cb->port_index);
 }
 
-void hns_ae_get_strings(struct hnae_handle *handle,
-   u32 stringset, u8 *data)
+void hns_ae_get_stats_strings(struct hnae_handle *handle, u8 *data)
 {
int port;
int idx;
@@ -711,21 +710,21 @@ void hns_ae_get_strings(struct hnae_handle *handle,
ppe_cb = hns_get_ppe_cb(handle);
 
for (idx = 0; idx < handle->q_num; idx++) {
-   hns_rcb_get_strings(stringset, p, idx);
-   p += ETH_GSTRING_LEN * hns_rcb_get_ring_sset_count(stringset);
+   hns_rcb_get_stats_strings(p, idx);
+   p += ETH_GSTRING_LEN * hns_rcb_get_ring_stats_count();
}
 
-   hns_ppe_get_strings(ppe_cb, stringset, p);
-   p += ETH_GSTRING_LEN * hns_ppe_get_sset_count(stringset);
+   hns_ppe_get_stats_strings(ppe_cb, p);
+   p += ETH_GSTRING_LEN * hns_ppe_get_stats_count();
 
-   hns_mac_get_strings(mac_cb, stringset, p);
-   p += ETH_GSTRING_LEN * hns_mac_get_sset_count(mac_cb, stringset);
+   hns_mac_get_stats_strings(mac_cb, p);
+   p += ETH_GSTRING_LEN * hns_mac_get_stats_count(mac_cb);
 
if (mac_cb->mac_type == HNAE_PORT_SERVICE)
-   hns_dsaf_get_strings(stringset, p, port, dsaf_dev);
+   hns_dsaf_get_stats_strings(p, port, dsaf_dev);
 }
 
-int hns_ae_get_sset_count(struct hnae_handle *handle, int 

[PATCH net 1/2] hns: Fix string set validation in ethtool string operations

2018-03-13 Thread Ben Hutchings
The hns driver used to assume that it would only ever be asked
about ETH_SS_TEST or ETH_SS_STATS, and for any other stringset
it would claim a count of 0 but if asked for names would copy
over all the statistic names.  This resulted in a potential
heap buffer overflow.

This was "fixed" some time ago by making it report the number of
statistics when asked about the ETH_SS_PRIV_FLAGS stringset.  This
doesn't make any sense, and it left the bug still exploitable with
other stringset numbers.

As a minimal but complete fix, stop calling the driver-internal
string operations for any stringset other than ETH_SS_STATS.

Fixes: 511e6bc071db ("net: add Hisilicon Network Subsystem DSAF support")
Fixes: 412b65d15a7f ("net: hns: fix ethtool_get_strings overflow ...")
Signed-off-by: Ben Hutchings 
---
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index 7ea7f8a4aa2a..b836742f7ab6 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -889,11 +889,6 @@ void hns_get_strings(struct net_device *netdev, u32 
stringset, u8 *data)
struct hnae_handle *h = priv->ae_handle;
char *buff = (char *)data;
 
-   if (!h->dev->ops->get_strings) {
-   netdev_err(netdev, "h->dev->ops->get_strings is null!\n");
-   return;
-   }
-
if (stringset == ETH_SS_TEST) {
if (priv->ae_handle->phy_if != PHY_INTERFACE_MODE_XGMII) {
memcpy(buff, hns_nic_test_strs[MAC_INTERNALLOOP_MAC],
@@ -907,7 +902,7 @@ void hns_get_strings(struct net_device *netdev, u32 
stringset, u8 *data)
memcpy(buff, hns_nic_test_strs[MAC_INTERNALLOOP_PHY],
   ETH_GSTRING_LEN);
 
-   } else {
+   } else if (stringset == ETH_SS_STATS && h->dev->ops->get_strings) {
snprintf(buff, ETH_GSTRING_LEN, "rx_packets");
buff = buff + ETH_GSTRING_LEN;
snprintf(buff, ETH_GSTRING_LEN, "tx_packets");
@@ -969,7 +964,7 @@ void hns_get_strings(struct net_device *netdev, u32 
stringset, u8 *data)
 /**
  * nic_get_sset_count - get string set count witch returned by nic_get_strings.
  * @dev: net device
- * @stringset: string set index, 0: self test string; 1: statistics string.
+ * @stringset: string set index
  *
  * Return string set count.
  */
@@ -979,10 +974,6 @@ int hns_get_sset_count(struct net_device *netdev, int 
stringset)
struct hnae_handle *h = priv->ae_handle;
struct hnae_ae_ops *ops = h->dev->ops;
 
-   if (!ops->get_sset_count) {
-   netdev_err(netdev, "get_sset_count is null!\n");
-   return -EOPNOTSUPP;
-   }
if (stringset == ETH_SS_TEST) {
u32 cnt = (sizeof(hns_nic_test_strs) / ETH_GSTRING_LEN);
 
@@ -993,8 +984,10 @@ int hns_get_sset_count(struct net_device *netdev, int 
stringset)
cnt--;
 
return cnt;
+   } else if (stringset == ETH_SS_STATS && ops->get_sset_count) {
+   return HNS_NET_STATS_CNT + ops->get_sset_count(h, stringset);
} else {
-   return (HNS_NET_STATS_CNT + ops->get_sset_count(h, stringset));
+   return -EOPNOTSUPP;
}
 }
 



signature.asc
Description: Digital signature


Re: [PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018

2018-03-13 Thread Joe Perches
On Tue, 2018-03-13 at 16:50 -0600, Subash Abhinov Kasiviswanathan wrote:
> Signed-off-by: Subash Abhinov Kasiviswanathan 
> ---
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c  | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h  | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c| 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h| 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c| 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h | 2 +-

Did any work actually occur on all these files in 2018?
$ git log --name-only --since=01-01-2018 drivers/net/ethernet/qualcomm/rmnet/ | 
\
  grep "^drivers" | sort | uniq
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c



Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Eric Dumazet



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:


we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.


'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was 
not happy with the fix and provided something else.


GSO_DODGY has nothing to do with the problem really.

Changing gso_size is breaking GRO since it ends up changing the number 
of segments on the wire. TCP is not going to be happy, so you'll also 
have to fix TCP eventually.





Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Daniel Borkmann
On 03/14/2018 12:09 AM, Alexei Starovoitov wrote:
> On 3/13/18 3:47 PM, Eric Dumazet wrote:
>>
>>
>> On 03/13/2018 03:37 PM, Yonghong Song wrote:
>>> Adding additional cc's:
>>>    Saeed Mahameed as this is most likely mlx5 driver related.
>>>    Diptanu Gon Choudhury who initially reported the issue.
>>>
>>>
>>> On 3/13/18 1:44 AM, Steffen Klassert wrote:
 On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:
>
>
> On 03/12/2018 11:08 PM, Yonghong Song wrote:
>>
>>
>> On 3/12/18 11:04 PM, Eric Dumazet wrote:
>>>
>>>
>>> On 03/12/2018 10:45 PM, Yonghong Song wrote:
 ...
 Setup:
 =

 The test will involve three machines:
     M_ipv6 <-> M_nat <-> M_ipv4

 The M_nat will do ipv4<->ipv6 address translation and then
 forward packet
 to proper destination. The control plane will configure M_nat
 properly
 will understand virtual ipv4 address for machine M_ipv6, and
 virtual ipv6 address for machine M_ipv4.

 M_nat runs a bpf program, which is attached to clsact (ingress)
 qdisc.
 The program uses bpf_skb_change_proto to do protocol conversion.
 bpf_skb_change_proto will adjust skb header_len and len properly
 based on protocol change.
 After the conversion, the program will make proper change on
 ethhdr and ip4/6 header, recalculate checksum, and send the
 packet out
 through bpf_redirect.

 Experiment:
 ===

 MTU: 1500B for all three machines.

 The tso/lro/gro are enabled on the M_nat box.

 ping works on both ways of M_ipv6 <-> M_ipv4.
It works for transferring a small file (4KB) between M_ipv6 and
M_ipv4 (both ways).
Transferring a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
 failed with the above BUG_ON, really fast.
 Did not really test from M_ipv4 to M_ipv6 with large file.

 The error path likely to be (also from the above call stack):
     nic -> lro/gro -> bpf_program -> gso (BUG_ON)

 Just out of curiosity, are these packets created with LRO or GRO?
 Usually LRO is disabled if forwarding is enabled on a machine,
because segmented LRO packets are likely corrupt.
>>>
>>> In our experiments, LRO is disabled.
>>> On mlx5, when GRO is on, the BUG_ON will happen, and
>>> when GRO is off, the BUG_ON will not happen.
>>>

 These packets take an alternative redirect path, so not sure what
 happens here.


 In one of experiments, I explicitly printed the skb->len and
 skb->data_len. The values are below:
     skb_segment: len 2856, data_len 2686
 They should be equal to avoid BUG.

 In another experiment, I got:
     skb_segment: len 1428, data_len 1258

 In both cases, the difference is 170 bytes. Not sure whether
 this is just a coincidence or not.

 Workaround:
 ===

 A workaround to avoid BUG_ON is to disable lro/gro. This way,
 kernel will not receive big packets and hence gso is not really
 called.

 I am not familiar with gso code. Does anybody hit this BUG_ON
 before?
 Any suggestion on how to debug this?

>>>
>>> skb_segment() works if incoming GRO packet is not modified in its
>>> geometry.
>>>
>>> In your case it seems you had to adjust gso_size (calling
>>> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
>>> skb_segment() badly, because geometry changes, unless you had
>>> specific MTU/MSS restrictions.
>>>
>>> You will have to make skb_segment() more generic if you really
>>> want this.
>>
>> In net/core/filter.c function bpf_skb_change_proto, which is called
>> in the bpf program, does some GSO adjustment. Could you help check
>> whether it satisfies my above use case or not? Thanks!
>
> As I said this  helper ends up modifying gso_size by +/- 20
> (sizeof(ipv6
> header) - sizeof(ipv4 header))
>
> So it wont work if skb_segment() is called after this change.

 Even HW TSO uses gso_size to segment the packets. Wouldn't this
 result in broken packets too, if gso_size is modified on a
 forwarding path?

>
> Not clear why the GRO packet is not sent as is (as a TSO packet) since
> mlx4/mlx5 NICs certainly support TSO.
>>>
>>> This is a very good observation.
>>> We did the same experiment on mlx4, the same kernel, the same
>>> userspace apps, the same bpf program. The only difference is mlx4 vs.
>>> mlx5.
>>> The mlx4 works fine with LRO=off and GRO=on, while
>>> mlx5 failed with the same LRO/GRO configuration.
>>>
>>> On mlx4 box, we are able to see TSO packets are 

Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Alexei Starovoitov

On 3/13/18 3:47 PM, Eric Dumazet wrote:



On 03/13/2018 03:37 PM, Yonghong Song wrote:

Adding additional cc's:
   Saeed Mahameed as this is most likely mlx5 driver related.
   Diptanu Gon Choudhury who initially reported the issue.


On 3/13/18 1:44 AM, Steffen Klassert wrote:

On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:



On 03/12/2018 11:08 PM, Yonghong Song wrote:



On 3/12/18 11:04 PM, Eric Dumazet wrote:



On 03/12/2018 10:45 PM, Yonghong Song wrote:

...
Setup:
=

The test will involve three machines:
M_ipv6 <-> M_nat <-> M_ipv4

The M_nat will do ipv4<->ipv6 address translation and then
forward packet
to proper destination. The control plane will configure M_nat
properly so that it
will understand the virtual ipv4 address for machine M_ipv6, and
virtual ipv6 address for machine M_ipv4.

M_nat runs a bpf program, which is attached to clsact (ingress)
qdisc.
The program uses bpf_skb_change_proto to do protocol conversion.
bpf_skb_change_proto will adjust skb header_len and len properly
based on protocol change.
After the conversion, the program will make proper change on
ethhdr and ip4/6 header, recalculate checksum, and send the
packet out
through bpf_redirect.

Experiment:
===

MTU: 1500B for all three machines.

The tso/lro/gro are enabled on the M_nat box.

ping works on both ways of M_ipv6 <-> M_ipv4.
It works for transferring a small file (4KB) between M_ipv6 and
M_ipv4 (both ways).
Transferring a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
failed with the above BUG_ON, really fast.
Did not really test from M_ipv4 to M_ipv6 with large file.

The error path likely to be (also from the above call stack):
nic -> lro/gro -> bpf_program -> gso (BUG_ON)


Just out of curiosity, are these packets created with LRO or GRO?
Usually LRO is disabled if forwarding is enabled on a machine,
because segmented LRO packets are likely corrupt.


In our experiments, LRO is disabled.
On mlx5, when GRO is on, the BUG_ON will happen, and
when GRO is off, the BUG_ON will not happen.



These packets take an alternative redirect path, so not sure what
happens here.



In one of experiments, I explicitly printed the skb->len and
skb->data_len. The values are below:
skb_segment: len 2856, data_len 2686
They should be equal to avoid BUG.

In another experiment, I got:
skb_segment: len 1428, data_len 1258

In both cases, the difference is 170 bytes. Not sure whether
this is just a coincidence or not.

Workaround:
===

A workaround to avoid BUG_ON is to disable lro/gro. This way,
kernel will not receive big packets and hence gso is not really
called.

I am not familiar with gso code. Does anybody hit this BUG_ON
before?
Any suggestion on how to debug this?



skb_segment() works if incoming GRO packet is not modified in its
geometry.

In your case it seems you had to adjust gso_size (calling
skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
skb_segment() badly, because geometry changes, unless you had
specific MTU/MSS restrictions.

You will have to make skb_segment() more generic if you really
want this.


In net/core/filter.c function bpf_skb_change_proto, which is called
in the bpf program, does some GSO adjustment. Could you help check
whether it satisfies my above use case or not? Thanks!


As I said this  helper ends up modifying gso_size by +/- 20
(sizeof(ipv6
header) - sizeof(ipv4 header))

So it wont work if skb_segment() is called after this change.


Even HW TSO uses gso_size to segment the packets. Wouldn't this
result in broken packets too, if gso_size is modified on a
forwarding path?



Not clear why the GRO packet is not sent as is (as a TSO packet) since
mlx4/mlx5 NICs certainly support TSO.


This is a very good observation.
We did the same experiment on mlx4, the same kernel, the same
userspace apps, the same bpf program. The only difference is mlx4 vs.
mlx5.
The mlx4 works fine with LRO=off and GRO=on, while
mlx5 failed with the same LRO/GRO configuration.

On mlx4 box, we are able to see TSO packets are increasing as the
large file scp is in progress.
# ethtool -S eth0 | grep tso
  tso_packets: 45495
# ethtool -S eth0 | grep tso
  tso_packets: 45865
# ethtool -S eth0 | grep tso
  tso_packets: 46337
# ethtool -S eth0 | grep tso
  tso_packets: 46724

And use bcc tool to track to func call count for skb_segment
and find it is called 0 times. Clearly, mlx4 is able to take
the packet as TSO and hence the packet will not go to
the stack.

# funccount.py -i 3 'skb_segment'
Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.

FUNCCOUNT

FUNCCOUNT
...

CC Saeed Mahameed who may help debug and provide some insights
what is the problem.


There are many reasons why a GRO packet might need to be segmented by
software (skb_segment())

This is the step where you have crashes, so really mlx4/mlx5 difference
do not matter.

gso_size should not be dynamically 

Re: [PATCH net-next 1/4] net: qualcomm: rmnet: Fix casting issues

2018-03-13 Thread Subash Abhinov Kasiviswanathan
Ummm, if you change pkt_len to be a proper __be16, then you don't need 
these

casts when passing it to ntohs().


Hi David

I have fixed this now in v2.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project


[PATCH net-next v2 3/4] net: qualcomm: rmnet: Remove unnecessary device assignment

2018-03-13 Thread Subash Abhinov Kasiviswanathan
Device of the de-aggregated skb is correctly assigned after inspecting
the mux_id, so remove the assignment in rmnet_map_deaggregate().

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 49e420e..e8f6c79 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -323,7 +323,6 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
if (!skbn)
return NULL;
 
-   skbn->dev = skb->dev;
skb_reserve(skbn, RMNET_MAP_DEAGGR_HEADROOM);
skb_put(skbn, packet_len);
memcpy(skbn->data, skb->data, packet_len);
-- 
1.9.1



[PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018

2018-03-13 Thread Subash Abhinov Kasiviswanathan
Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c  | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h  | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c| 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h| 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c| 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index c494918..096301a 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 00e4634..0b5b5da 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights 
reserved.
+/* Copyright (c) 2013-2014, 2016-2018 The Linux Foundation. All rights 
reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 601edec..c758248 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
index 3537e4c..1bc6828 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 4f362df..884f1f5 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index b0dbca0..afa2b86 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index c74a6c5..49e420e 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
index de0143e..98365ef 100644
--- 

[PATCH net-next v2 4/4] net: qualcomm: rmnet: Implement fill_info

2018-03-13 Thread Subash Abhinov Kasiviswanathan
This is needed to query the mux_id and flags of a rmnet device.

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 096301a..d0f3e0f 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -331,6 +331,35 @@ static size_t rmnet_get_size(const struct net_device *dev)
   nla_total_size(sizeof(struct ifla_vlan_flags)); /* 
IFLA_VLAN_FLAGS */
 }
 
+static int rmnet_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+   struct rmnet_priv *priv = netdev_priv(dev);
+   struct net_device *real_dev;
+   struct ifla_vlan_flags f;
+   struct rmnet_port *port;
+
+   real_dev = priv->real_dev;
+
+   if (!rmnet_is_real_dev_registered(real_dev))
+   return -ENODEV;
+
+   if (nla_put_u16(skb, IFLA_VLAN_ID, priv->mux_id))
+   goto nla_put_failure;
+
+   port = rmnet_get_port_rtnl(real_dev);
+
+   f.flags = port->data_format;
+   f.mask  = ~0;
+
+   if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f))
+   goto nla_put_failure;
+
+   return 0;
+
+nla_put_failure:
+   return -EMSGSIZE;
+}
+
 struct rtnl_link_ops rmnet_link_ops __read_mostly = {
.kind   = "rmnet",
.maxtype= __IFLA_VLAN_MAX,
@@ -341,6 +370,7 @@ struct rtnl_link_ops rmnet_link_ops __read_mostly = {
.dellink= rmnet_dellink,
.get_size   = rmnet_get_size,
.changelink = rmnet_changelink,
+   .fill_info  = rmnet_fill_info,
 };
 
 /* Needs either rcu_read_lock() or rtnl lock */
-- 
1.9.1



[PATCH net-next v2 1/4] net: qualcomm: rmnet: Fix casting issues

2018-03-13 Thread Subash Abhinov Kasiviswanathan
Fix warnings which were reported when running with sparse
(make C=1 CF=-D__CHECK_ENDIAN__)

drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c:81:15:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:271:37:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] 
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:287:29:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] 
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:310:22:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:319:13:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:49:18:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:50:18:
warning: cast to restricted __be32
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:74:21:
warning: cast to restricted __be16

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 6ce31e2..4f362df 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -23,8 +23,8 @@ struct rmnet_map_control_command {
struct {
u16 ip_family:2;
u16 reserved:14;
-   u16 flow_control_seq_num;
-   u32 qos_id;
+   __be16 flow_control_seq_num;
+   __be32 qos_id;
} flow_control;
u8 data[0];
};
@@ -44,7 +44,7 @@ struct rmnet_map_header {
u8  reserved_bit:1;
u8  cd_bit:1;
u8  mux_id;
-   u16 pkt_len;
+   __be16 pkt_len;
 }  __aligned(1);
 
 struct rmnet_map_dl_csum_trailer {
-- 
1.9.1



[PATCH net-next v2 0/4] net: qualcomm: rmnet: Updates 2018-03-12

2018-03-13 Thread Subash Abhinov Kasiviswanathan
This series contains some minor updates for rmnet driver.

Patch 1 contains fixes for sparse warnings.
Patch 2 updates the copyright date to 2018.
Patch 3 is a cleanup in receive path.
Patch 4 has the implementation of the fill_info operation.

v1->v2: Remove the force casts since the data type is changed to __be types
as mentioned by David.

Subash Abhinov Kasiviswanathan (4):
  net: qualcomm: rmnet: Fix casting issues
  net: qualcomm: rmnet: Update copyright year to 2018
  net: qualcomm: rmnet: Remove unnecessary device assignment
  net: qualcomm: rmnet: Implement fill_info

 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 32 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h   |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h|  8 +++---
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c|  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   |  3 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_private.h|  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c|  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h|  2 +-
 10 files changed, 43 insertions(+), 14 deletions(-)

-- 
1.9.1



Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Eric Dumazet



On 03/13/2018 03:37 PM, Yonghong Song wrote:

Adding additional cc's:
   Saeed Mahameed as this is most likely mlx5 driver related.
   Diptanu Gon Choudhury who initially reported the issue.


On 3/13/18 1:44 AM, Steffen Klassert wrote:

On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:



On 03/12/2018 11:08 PM, Yonghong Song wrote:



On 3/12/18 11:04 PM, Eric Dumazet wrote:



On 03/12/2018 10:45 PM, Yonghong Song wrote:

...
Setup:
=

The test will involve three machines:
    M_ipv6 <-> M_nat <-> M_ipv4

The M_nat will do ipv4<->ipv6 address translation and then
forward packet
to the proper destination. The control plane will configure M_nat 
properly
so that it
will understand the virtual ipv4 address for machine M_ipv6, and
the virtual ipv6 address for machine M_ipv4.

M_nat runs a bpf program, which is attached to clsact (ingress) 
qdisc.

The program uses bpf_skb_change_proto to do protocol conversion.
bpf_skb_change_proto will adjust skb header_len and len properly
based on protocol change.
After the conversion, the program will make proper change on
ethhdr and ip4/6 header, recalculate checksum, and send the packet 
out

through bpf_redirect.

Experiment:
===

MTU: 1500B for all three machines.

The tso/lro/gro are enabled on the M_nat box.

ping works on both ways of M_ipv6 <-> M_ipv4.
It works for transferring a small file (4KB) between M_ipv6 and
M_ipv4 (both ways).
Transferring a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
failed with the above BUG_ON, really fast.
Did not really test from M_ipv4 to M_ipv6 with large file.

The error path likely to be (also from the above call stack):
    nic -> lro/gro -> bpf_program -> gso (BUG_ON)


Just out of curiosity, are these packets created with LRO or GRO?
Usually LRO is disabled if forwarding is enabled on a machine,
because segmented LRO packets are likely corrupt.


In our experiments, LRO is disabled.
On mlx5, when GRO is on, the BUG_ON will happen, and
when GRO is off, the BUG_ON will not happen.



These packets take an alternative redirect path, so not sure what
happens here.



In one of experiments, I explicitly printed the skb->len and
skb->data_len. The values are below:
    skb_segment: len 2856, data_len 2686
They should be equal to avoid BUG.

In another experiment, I got:
    skb_segment: len 1428, data_len 1258

In both cases, the difference is 170 bytes. Not sure whether
this is just a coincidence or not.

Workaround:
===

A workaround to avoid BUG_ON is to disable lro/gro. This way,
kernel will not receive big packets and hence gso is not really 
called.


I am not familiar with gso code. Does anybody hit this BUG_ON before?
Any suggestion on how to debug this?



skb_segment() works if incoming GRO packet is not modified in its
geometry.

In your case it seems you had to adjust gso_size (calling
skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
skb_segment() badly, because geometry changes, unless you had
specific MTU/MSS restrictions.

You will have to make skb_segment() more generic if you really want 
this.


In net/core/filter.c function bpf_skb_change_proto, which is called
in the bpf program, does some GSO adjustment. Could you help check
whether it satisfies my above use case or not? Thanks!


As I said this  helper ends up modifying gso_size by +/- 20 (sizeof(ipv6
header) - sizeof(ipv4 header))

So it wont work if skb_segment() is called after this change.


Even HW TSO uses gso_size to segment the packets. Wouldn't this
result in broken packets too, if gso_size is modified on a
forwarding path?



Not clear why the GRO packet is not sent as is (as a TSO packet) since
mlx4/mlx5 NICs certainly support TSO.


This is a very good observation.
We did the same experiment on mlx4, the same kernel, the same userspace 
apps, the same bpf program. The only difference is mlx4 vs. mlx5.

The mlx4 works fine with LRO=off and GRO=on, while
mlx5 failed with the same LRO/GRO configuration.

On mlx4 box, we are able to see TSO packets are increasing as the
large file scp is in progress.
# ethtool -S eth0 | grep tso
  tso_packets: 45495
# ethtool -S eth0 | grep tso
  tso_packets: 45865
# ethtool -S eth0 | grep tso
  tso_packets: 46337
# ethtool -S eth0 | grep tso
  tso_packets: 46724

And use bcc tool to track to func call count for skb_segment
and find it is called 0 times. Clearly, mlx4 is able to take
the packet as TSO and hence the packet will not go to
the stack.

# funccount.py -i 3 'skb_segment'
Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.

FUNC    COUNT

FUNC    COUNT
...

CC Saeed Mahameed who may help debug and provide some insights
what is the problem.


There are many reasons why a GRO packet might need to be segmented by 
software (skb_segment())


This is the step where you have crashes, so really mlx4/mlx5 difference 
do not matter.


gso_size should not be dynamically changed. This needs to be 

Re: BUG_ON triggered in skb_segment

2018-03-13 Thread Yonghong Song

Adding additional cc's:
  Saeed Mahameed as this is most likely mlx5 driver related.
  Diptanu Gon Choudhury who initially reported the issue.


On 3/13/18 1:44 AM, Steffen Klassert wrote:

On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:



On 03/12/2018 11:08 PM, Yonghong Song wrote:



On 3/12/18 11:04 PM, Eric Dumazet wrote:



On 03/12/2018 10:45 PM, Yonghong Song wrote:

...
Setup:
=

The test will involve three machines:
    M_ipv6 <-> M_nat <-> M_ipv4

The M_nat will do ipv4<->ipv6 address translation and then
forward packet
to the proper destination. The control plane will configure M_nat properly
so that it will understand the virtual ipv4 address for machine M_ipv6, and
virtual ipv6 address for machine M_ipv4.

M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
The program uses bpf_skb_change_proto to do protocol conversion.
bpf_skb_change_proto will adjust skb header_len and len properly
based on protocol change.
After the conversion, the program will make proper change on
ethhdr and ip4/6 header, recalculate checksum, and send the packet out
through bpf_redirect.

Experiment:
===

MTU: 1500B for all three machines.

The tso/lro/gro are enabled on the M_nat box.

ping works on both ways of M_ipv6 <-> M_ipv4.
It works for transferring a small file (4KB) between M_ipv6 and
M_ipv4 (both ways).
Transferring a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
failed with the above BUG_ON, really fast.
Did not really test from M_ipv4 to M_ipv6 with large file.

The error path likely to be (also from the above call stack):
    nic -> lro/gro -> bpf_program -> gso (BUG_ON)


Just out of curiosity, are these packets created with LRO or GRO?
Usually LRO is disabled if forwarding is enabled on a machine,
because segmented LRO packets are likely corrupt.


In our experiments, LRO is disabled.
On mlx5, when GRO is on, the BUG_ON will happen, and
when GRO is off, the BUG_ON will not happen.



These packets take an alternative redirect path, so not sure what
happens here.



In one of experiments, I explicitly printed the skb->len and
skb->data_len. The values are below:
    skb_segment: len 2856, data_len 2686
They should be equal to avoid BUG.

In another experiment, I got:
    skb_segment: len 1428, data_len 1258

In both cases, the difference is 170 bytes. Not sure whether
this is just a coincidence or not.

Workaround:
===

A workaround to avoid BUG_ON is to disable lro/gro. This way,
kernel will not receive big packets and hence gso is not really called.

I am not familiar with gso code. Does anybody hit this BUG_ON before?
Any suggestion on how to debug this?



skb_segment() works if incoming GRO packet is not modified in its
geometry.

In your case it seems you had to adjust gso_size (calling
skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
skb_segment() badly, because geometry changes, unless you had
specific MTU/MSS restrictions.

You will have to make skb_segment() more generic if you really want this.


In net/core/filter.c function bpf_skb_change_proto, which is called
in the bpf program, does some GSO adjustment. Could you help check
whether it satisfies my above use case or not? Thanks!


As I said this  helper ends up modifying gso_size by +/- 20 (sizeof(ipv6
header) - sizeof(ipv4 header))

So it wont work if skb_segment() is called after this change.


Even HW TSO uses gso_size to segment the packets. Wouldn't this
result in broken packets too, if gso_size is modified on a
forwarding path?



Not clear why the GRO packet is not sent as is (as a TSO packet) since
mlx4/mlx5 NICs certainly support TSO.


This is a very good observation.
We did the same experiment on mlx4, the same kernel, the same userspace 
apps, the same bpf program. The only difference is mlx4 vs. mlx5.

The mlx4 works fine with LRO=off and GRO=on, while
mlx5 failed with the same LRO/GRO configuration.

On mlx4 box, we are able to see TSO packets are increasing as the
large file scp is in progress.
# ethtool -S eth0 | grep tso
 tso_packets: 45495
# ethtool -S eth0 | grep tso
 tso_packets: 45865
# ethtool -S eth0 | grep tso
 tso_packets: 46337
# ethtool -S eth0 | grep tso
 tso_packets: 46724

And use bcc tool to track to func call count for skb_segment
and find it is called 0 times. Clearly, mlx4 is able to take
the packet as TSO and hence the packet will not go to
the stack.

# funccount.py -i 3 'skb_segment'
Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.

FUNCCOUNT

FUNCCOUNT
...

CC Saeed Mahameed who may help debug and provide some insights
what is the problem.



If the packets are generated with GRO, there could be data chained
at the frag_list pointer. Most NICs can't offload such skbs, so if
skb_segment() can't split at the frag_list pointer, it will just
segment the packets based on gso_size.



你好!你好吗?

2018-03-13 Thread Hannah Justin



Re: [PATCH v3] kernel.h: Skip single-eval logic on literals in min()/max()

2018-03-13 Thread Kees Cook
On Tue, Mar 13, 2018 at 2:02 PM, Andrew Morton
 wrote:
> On Mon, 12 Mar 2018 21:28:57 -0700 Kees Cook  wrote:
>
>> On Mon, Mar 12, 2018 at 4:57 PM, Linus Torvalds
>>  wrote:
>> > On Mon, Mar 12, 2018 at 3:55 PM, Andrew Morton
>> >  wrote:
>> >>
>> >> Replacing the __builtin_choose_expr() with ?: works of course.
>> >
>> > Hmm. That sounds like the right thing to do. We were so myopically
>> > staring at the __builtin_choose_expr() problem that we overlooked the
>> > obvious solution.
>> >
>> > Using __builtin_constant_p() together with a ?: is in fact our common
>> > pattern, so that should be fine. The only real reason to use
>> > __builtin_choose_expr() is if you want to get the *type* to vary
>> > depending on which side you choose, but that's not an issue for
>> > min/max.
>>
>> This doesn't solve it for -Wvla, unfortunately. That was the point of
>> Josh's original suggestion of __builtin_choose_expr().
>>
>> Try building with KCFLAGS=-Wval and checking net/ipv6/proc.c:
>>
>> net/ipv6/proc.c: In function ‘snmp6_seq_show_item’:
>> net/ipv6/proc.c:198:2: warning: ISO C90 forbids array ‘buff’ whose
>> size can’t be evaluated [-Wvla]
>>   unsigned long buff[SNMP_MIB_MAX];
>>   ^~~~
>
> PITA.  Didn't we once have a different way of detecting VLAs?  Some
> post-compilation asm parser, iirc.
>
> I suppose the world wouldn't end if we had a gcc version ifdef in
> kernel.h.  We'll get to remove it in, oh, ten years.

For fixing only 6 VLAs, we don't need all this effort. When it looked
like we could get away with just a "better" max(), sure. ;)

I'll send a "const_max()" which will refuse to work on
non-constant-values (so it doesn't get accidentally used on variables
that could be exposed to double-evaluation), and will work for stack
array declarations (to avoid the overly-sensitive -Wvla checks).

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH] net: dsa: drop some VLAs in switch.c

2018-03-13 Thread Salvatore Mesoraca
2018-03-13 20:58 GMT+01:00 Vivien Didelot :
> Hi Salvatore,

Hi Vivien,

> Salvatore Mesoraca  writes:
>
>> dsa_switch's num_ports is currently fixed to DSA_MAX_PORTS. So we avoid
>> 2 VLAs[1] by using DSA_MAX_PORTS instead of ds->num_ports.
>>
>> [1] https://lkml.org/lkml/2018/3/7/621
>>
>> Signed-off-by: Salvatore Mesoraca 
>
> NAK.
>
> We are in the process to remove hardcoded limits such as DSA_MAX_PORTS
> and DSA_MAX_SWITCHES, so we have to stick with ds->num_ports.

I can rewrite the patch using kmalloc.
Although, if ds->num_ports will always be less than or equal to 12,
it would be better to
just use DSA_MAX_PORTS.

Thank you,

Salvatore


[PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-13 Thread Song Liu
Currently, bpf stackmap store address for each entry in the call trace.
To map these addresses to user space files, it is necessary to maintain
the mapping from these virtual address to symbols in the binary. Usually,
the user space profiler (such as perf) has to scan /proc/pid/maps at the
beginning of profiling, and monitor mmap2() calls afterwards. Given the
cost of maintaining the address map, this solution is not practical for
system wide profiling that is always on.

This patch tries to solve this problem with a variation of stackmap. This
variation is enabled by flag BPF_F_STACK_BUILD_ID, and only works for
user stack. Instead of storing addresses, the variation stores ELF file
build_id + offset.

Build ID is a 20-byte unique identifier for ELF files. The following
command shows the Build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
Build ID: XX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse Build ID
for each entry in the call trace, and translate it into the following
struct:

  struct bpf_stack_build_id_offset {
  __s32   status;
  unsigned char   build_id[BPF_BUILD_ID_SIZE];
  union {
  __u64   offset;
  __u64   ip;
  };
  };

The search of build_id is limited to the first page of the file, and this
page should be in page cache. Otherwise, we fallback to store ip for this
entry (ip field in struct bpf_stack_build_id_offset). This requires the
build_id to be stored in the first page. A quick survey of binary and
dynamic library files in a few different systems shows that almost all
binary and dynamic library files have build_id in the first page.

User space can access struct bpf_stack_build_id_offset with bpf
syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to
maintain mapping from build id to binary files. This mostly static
mapping is much easier to maintain than per process address maps.

Note: Stackmap with build_id only works in non-nmi context at this time.
This is because we need to take mm->mmap_sem for find_vma(). If this
changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu 
---
 include/uapi/linux/bpf.h |  22 
 kernel/bpf/stackmap.c| 257 +++
 2 files changed, 257 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a66769..1e15d17 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY   (1U << 3)
 #define BPF_F_WRONLY   (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+enum bpf_stack_build_id_status {
+   /* user space need an empty entry to identify end of a trace */
+   BPF_STACK_BUILD_ID_EMPTY = 0,
+   /* with valid build_id and offset */
+   BPF_STACK_BUILD_ID_VALID = 1,
+   /* couldn't get build_id, fallback to ip */
+   BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+   __s32   status;
+   unsigned char   build_id[BPF_BUILD_ID_SIZE];
+   union {
+   __u64   offset;
+   __u64   ip;
+   };
+};
+
 union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32   map_type;   /* one of enum bpf_map_type */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43..7b134e9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-   (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+   (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |\
+BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
u32 nr;
-   u64 ip[];
+   u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+   return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+   return stack_map_use_build_id(map) ?
+   sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
-   value_size < 8 || value_size % 8 ||
-   value_size / 8 > 

[PATCH bpf-next v5 0/2] bpf stackmap with build_id+offset

2018-03-13 Thread Song Liu
Changes v4 -> v5:

1. Only allow build_id lookup in non-nmi context. Added comment and
   commit message to highlight this limitation.
2. Minor fix reported by kbuild test robot.

Changes v3 -> v4:

1. Add fallback when build_id lookup failed. In this case, status is set
   to BPF_STACK_BUILD_ID_IP, and ip of this entry is saved.
2. Handle cases where vma is only part of the file (vma->vm_pgoff != 0).
   Thanks to Teng for helping me identify this issue!
3. Address feedbacks for previous versions.

Song Liu (2):
  bpf: extend stackmap to save binary_build_id+offset instead of address
  bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID

 include/uapi/linux/bpf.h   |  22 ++
 kernel/bpf/stackmap.c  | 257 +++--
 tools/include/uapi/linux/bpf.h |  22 ++
 tools/testing/selftests/bpf/Makefile   |  12 +-
 tools/testing/selftests/bpf/test_progs.c   | 164 -
 .../selftests/bpf/test_stacktrace_build_id.c   |  60 +
 tools/testing/selftests/bpf/urandom_read.c |  22 ++
 7 files changed, 535 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_build_id.c
 create mode 100644 tools/testing/selftests/bpf/urandom_read.c

--
2.9.5


[PATCH bpf-next v5 2/2] bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID

2018-03-13 Thread Song Liu
test_stacktrace_build_id() is added. It accesses tracepoint urandom_read
with "dd" and "urandom_read" and gathers stack traces. Then it reads the
stack traces from the stackmap.

urandom_read is a statically link binary that reads from /dev/urandom.
test_stacktrace_build_id() calls readelf to read build ID of urandom_read
and compares it with build ID from the stackmap.

Signed-off-by: Song Liu 
---
 tools/include/uapi/linux/bpf.h |  22 +++
 tools/testing/selftests/bpf/Makefile   |  12 +-
 tools/testing/selftests/bpf/test_progs.c   | 164 -
 .../selftests/bpf/test_stacktrace_build_id.c   |  60 
 tools/testing/selftests/bpf/urandom_read.c |  22 +++
 5 files changed, 278 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_build_id.c
 create mode 100644 tools/testing/selftests/bpf/urandom_read.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db6bdc3..1944d0a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY   (1U << 3)
 #define BPF_F_WRONLY   (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+enum bpf_stack_build_id_status {
+   /* user space need an empty entry to identify end of a trace */
+   BPF_STACK_BUILD_ID_EMPTY = 0,
+   /* with valid build_id and offset */
+   BPF_STACK_BUILD_ID_VALID = 1,
+   /* couldn't get build_id, fallback to ip */
+   BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+   __s32   status;
+   unsigned char   build_id[BPF_BUILD_ID_SIZE];
+   union {
+   __u64   offset;
+   __u64   ip;
+   };
+};
+
 union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32   map_type;   /* one of enum bpf_map_type */
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 8567a858..b0d29fd 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,6 +13,14 @@ endif
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
-I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
+TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
+all: $(TEST_CUSTOM_PROGS)
+
+$(TEST_CUSTOM_PROGS): urandom_read
+
+urandom_read: urandom_read.c
+   $(CC) -o $(TEST_CUSTOM_PROGS) -static $<
+
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs \
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
@@ -21,7 +29,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-   sample_map_ret0.o test_tcpbpf_kern.o
+   sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -74,3 +82,5 @@ $(OUTPUT)/%.o: %.c
$(CLANG) $(CLANG_FLAGS) \
 -O2 -target bpf -emit-llvm -c $< -o - |  \
$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+
+EXTRA_CLEAN := $(TEST_CUSTOM_PROGS)
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 27ad540..e9df48b 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -841,7 +841,8 @@ static void test_tp_attach_query(void)
 static int compare_map_keys(int map1_fd, int map2_fd)
 {
__u32 key, next_key;
-   char val_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)];
+   char val_buf[PERF_MAX_STACK_DEPTH *
+sizeof(struct bpf_stack_build_id)];
int err;
 
err = bpf_map_get_next_key(map1_fd, NULL, );
@@ -964,6 +965,166 @@ static void test_stacktrace_map()
return;
 }
 
+static int extract_build_id(char *build_id, size_t size)
+{
+   FILE *fp;
+   char *line = NULL;
+   size_t len = 0;
+
+   fp = popen("readelf -n ./urandom_read | grep 'Build ID'", "r");
+   if (fp == NULL)
+   return -1;
+
+   if (getline(, , fp) == -1)
+   goto err;
+   fclose(fp);
+
+   if (len > size)
+   len = size;
+   memcpy(build_id, line, len);
+   build_id[len] = '\0';
+   return 0;
+err:
+   fclose(fp);
+   return -1;
+}
+
+static void test_stacktrace_build_id(void)
+{
+   int control_map_fd, stackid_hmap_fd, stackmap_fd;
+   const char *file = "./test_stacktrace_build_id.o";
+   

[PATCH net] net: systemport: Rewrite __bcm_sysport_tx_reclaim()

2018-03-13 Thread Florian Fainelli
There is no need for complex checking between the last consumed index
and current consumed index, a simple subtraction will do.

This also eliminates the possibility of a permanent transmit queue stall
under the following conditions:

- one CPU bursts ring->size worth of traffic (up to 256 buffers), to the
  point where we run out of free descriptors, so we stop the transmit
  queue at the end of bcm_sysport_xmit()

- because of our locking, we have the transmit process disable
  interrupts which means we can be blocking the TX reclamation process

- when TX reclamation finally runs, we will be computing the difference
  between ring->c_index (last consumed index by SW) and what the HW
  reports through its register

- this register is masked with (ring->size - 1) = 0xff, which will lead
  to stripping the upper bits of the index (register is 16-bits wide)

- we will be computing last_tx_cn as 0, which means there is no work to
  be done, and we never wake-up the transmit queue, leaving it
  permanently disabled

A practical example is e.g: ring->c_index aka last_c_index = 12, we
pushed 256 entries, HW consumer index = 268, we mask it with 0xff = 12,
so last_tx_cn == 0, nothing happens.

Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC 
driver")
Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 33 ++
 drivers/net/ethernet/broadcom/bcmsysport.h |  2 +-
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index f15a8fc6dfc9..3fc549b88c43 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -855,10 +855,12 @@ static void bcm_sysport_tx_reclaim_one(struct 
bcm_sysport_tx_ring *ring,
 static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
 struct bcm_sysport_tx_ring *ring)
 {
-   unsigned int c_index, last_c_index, last_tx_cn, num_tx_cbs;
unsigned int pkts_compl = 0, bytes_compl = 0;
struct net_device *ndev = priv->netdev;
+   unsigned int txbds_processed = 0;
struct bcm_sysport_cb *cb;
+   unsigned int txbds_ready;
+   unsigned int c_index;
u32 hw_ind;
 
/* Clear status before servicing to reduce spurious interrupts */
@@ -871,29 +873,23 @@ static unsigned int __bcm_sysport_tx_reclaim(struct 
bcm_sysport_priv *priv,
/* Compute how many descriptors have been processed since last call */
hw_ind = tdma_readl(priv, TDMA_DESC_RING_PROD_CONS_INDEX(ring->index));
c_index = (hw_ind >> RING_CONS_INDEX_SHIFT) & RING_CONS_INDEX_MASK;
-   ring->p_index = (hw_ind & RING_PROD_INDEX_MASK);
-
-   last_c_index = ring->c_index;
-   num_tx_cbs = ring->size;
-
-   c_index &= (num_tx_cbs - 1);
-
-   if (c_index >= last_c_index)
-   last_tx_cn = c_index - last_c_index;
-   else
-   last_tx_cn = num_tx_cbs - last_c_index + c_index;
+   txbds_ready = (c_index - ring->c_index) & RING_CONS_INDEX_MASK;
 
netif_dbg(priv, tx_done, ndev,
- "ring=%d c_index=%d last_tx_cn=%d last_c_index=%d\n",
- ring->index, c_index, last_tx_cn, last_c_index);
+ "ring=%d old_c_index=%u c_index=%u txbds_ready=%u\n",
+ ring->index, ring->c_index, c_index, txbds_ready);
 
-   while (last_tx_cn-- > 0) {
-   cb = ring->cbs + last_c_index;
+   while (txbds_processed < txbds_ready) {
+   cb = >cbs[ring->clean_index];
bcm_sysport_tx_reclaim_one(ring, cb, _compl, _compl);
 
ring->desc_count++;
-   last_c_index++;
-   last_c_index &= (num_tx_cbs - 1);
+   txbds_processed++;
+
+   if (likely(ring->clean_index < ring->size - 1))
+   ring->clean_index++;
+   else
+   ring->clean_index = 0;
}
 
u64_stats_update_begin(>syncp);
@@ -1394,6 +1390,7 @@ static int bcm_sysport_init_tx_ring(struct 
bcm_sysport_priv *priv,
netif_tx_napi_add(priv->netdev, >napi, bcm_sysport_tx_poll, 64);
ring->index = index;
ring->size = size;
+   ring->clean_index = 0;
ring->alloc_size = ring->size;
ring->desc_cpu = p;
ring->desc_count = ring->size;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h 
b/drivers/net/ethernet/broadcom/bcmsysport.h
index f5a984c1c986..19c91c76e327 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -706,7 +706,7 @@ struct bcm_sysport_tx_ring {
unsigned intdesc_count; /* Number of descriptors */
unsigned intcurr_desc;  /* Current descriptor */
unsigned intc_index;/* Last consumer index */
-   

Re: Possible tcp regression in 4.14.13..26

2018-03-13 Thread Eric Dumazet



On 03/13/2018 02:41 PM, Timofey Titovets wrote:

Hi list,
We currently upgrade our servers from 4.14.13 to 4.14.26
And get some problems with random kernel oops by:
[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

Currently we assume that this caused by:
net: tcp: close sock if net namespace is exiting
[ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]

If that can helps we heavy use docker and ceph rbd.

I will write if we get more info or fix that.

Thanks.



Can you provide a full stack trace ?



Re: [PATCH iproute2] treat "default" and "all"/"any" parameters differenty

2018-03-13 Thread Luca Boccassi
On Tue, 2018-03-13 at 21:19 +0100, Alexander Zubkov wrote:
> Debian maintainer found that basic command:
>   # ip route flush all
> No longer worked as expected which breaks user scripts and
> expectations. It no longer flushed all IPv4 routes.
> 
> Recently behaviour of "default" prefix parameter was corrected. But
> at
> the same time behaviour of "all"/"any" was altered too, because they
> were the same branch of the code. As those parameters mean different,
> they need to be treated differently in code too. This patch reflects
> the difference.
> 
> Reported-by: Luca Boccassi 
> Signed-off-by: Alexander Zubkov 
> ---
>   lib/utils.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/utils.c b/lib/utils.c
> index 9fa5220..b289d9c 100644
> --- a/lib/utils.c
> +++ b/lib/utils.c
> @@ -658,7 +658,8 @@ int get_prefix_1(inet_prefix *dst, char *arg,
> int 
> family)
>   dst->family = family;
>   dst->bytelen = 0;
>   dst->bitlen = 0;
> - dst->flags |= PREFIXLEN_SPECIFIED;
> + if (strcmp(arg, "default") == 0)
> + dst->flags |= PREFIXLEN_SPECIFIED;
>   return 0;
>   }
> 
> --
> 1.9.1

Tested-by: Luca Boccassi 

Yep solves the problem, ls all/any now work as before. Thanks!

-- 
Kind regards,
Luca Boccassi

signature.asc
Description: This is a digitally signed message part


Re: [Intel-wired-lan] [PATCH 12/15] ice: Add stats and ethtool support

2018-03-13 Thread Eric Dumazet



On 03/13/2018 02:14 PM, Jesse Brandeburg wrote:

On Tue, 13 Mar 2018 12:17:10 -0700
Eric Dumazet  wrote:


Yes, this is a recurring mistake

See commit
bf909456f6a89654cb65c01fe83a4f9b133bf978 Revert "net: hns3: Add packet
statistics of netdev"


Thanks for the pointer, that was a useful thread to review.  I
understand the point that was made about not having the netdev stats
shown in ethtool -S.  We definitely do provide per-queue stats as well
as these regular stats in ethtool -S.

I do remember from the past discussions that it *is* useless for the
driver to keep internally any stats that were already stored via the
get_stats NDO, and we missed it in the internal review that this driver
was doing that, so that will be fixed.

Maybe it's just that I've been doing this too long, but I regularly
(and many other customers/users do as well) depend on the ethtool stats
being atomically updated w.r.t. each other.  This means that if I'm
getting the overall rx_packets, as well as the per-queue rx_packets, and I
read them all at once from the driver with ethtool, then I can check
that things are working as expected.

If I have to gather the netdev stats from /proc/net/dev (which iproute2
tool shows the /proc/net/dev stats?) and somehow atomically gather the
ethtool -S stats.  What's the user to do in this brave new world where
ethtool doesn't at least have rx/tx bytes and packets?


I do not believe ethtool -S provides an atomic snapshot of counters anyway.

Maybe some drivers/NIC implement such a feature, but it is not documented.

In any case, that could be implemented generically (by core networking 
stack), instead of being copied/pasted in many drivers, if this would be 
really needed.


core networking stack would append (or insert) ndo_stats.
(The atomicity rule if really needed would need to pass an extra struct 
rtnl_link_stats64 pointer to get_ethtool_stats() method)





Possible tcp regression in 4.14.13..26

2018-03-13 Thread Timofey Titovets
Hi list,
We currently upgrade our servers from 4.14.13 to 4.14.26
And get some problems with random kernel oops by:
[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

Currently we assume that this caused by:
net: tcp: close sock if net namespace is exiting
[ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]

If that can helps we heavy use docker and ceph rbd.

I will write if we get more info or fix that.

Thanks.

-- 
Have a nice day,
Timofey.


[PATCH v5 net-next 0/4] net: macb: Introduce phy-handle DT functionality

2018-03-13 Thread Brad Mouring

Consider the situation where a macb netdev is connected through
a phydev that sits on a mii bus other than the one provided to
this particular netdev. This situation is what this patchset aims
to accomplish through the existing phy-handle optional binding.

This optional binding (as described in the ethernet DT bindings doc)
directs the netdev to the phydev to use. This is precisely the
situation this patchset aims to solve, so it makes sense to introduce
the functionality to this driver (where the physical layout discussed
was encountered).

The devicetree snippet would look something like this:

...
   ethernet@feedf00d {
   ...
   phy-handle = <> // the first netdev is physically wired to phy0
   ...
   phy0: phy@0 {
   ...
   reg = <0x0> // MDIO address 0
   ...
   }
   phy1: phy@1 {
   ...
   reg = <0x1> // MDIO address 1
   ...
   }
   ...
   }

   ethernet@deadbeef {
   ...
   phy-handle = <> // tells the driver to use phy1 on the
// first mac's mdio bus (it's wired thusly)
   ...
   }
...

The work done to add the phy_node in the first place (dacdbb4dfc1a1:
"net: macb: add fixed-link node support") will consume the
device_node (if found).

v2: Reorganization of mii probe/init functions, suggested by Andrew Lunn
v3: Moved some of the bus init code back into init (erroneously moved to probe)
some style issues, and an unintialized variable warning addressed.
v4: Add Reviewed-by: tags
Skip fallback code if phy-handle phandle is found
v5: Cleanup formatting issues
Fix compile failure introduced in 1/4 "net: macb: Reorganize macb_mii
bringup"
Fix typo in "Documentation: macb: Document phy-handle binding"


[PATCH v5 net-next 4/4] Documentation: macb: Document phy-handle binding

2018-03-13 Thread Brad Mouring
Document the existence of the optional binding, directing to the
general ethernet document that describes this binding.

Signed-off-by: Brad Mouring 
Reviewed-by: Florian Fainelli 
---
 Documentation/devicetree/bindings/net/macb.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/macb.txt 
b/Documentation/devicetree/bindings/net/macb.txt
index 27966ae741e0..457d5ae16f23 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -29,6 +29,7 @@ Optional properties for PHY child node:
 - reset-gpios : Should specify the gpio for phy reset
 - magic-packet : If present, indicates that the hardware supports waking
   up via magic packet.
+- phy-handle : see ethernet.txt file in the same directory
 
 Examples:
 
-- 
2.16.2



[PATCH v5 net-next 1/4] net: macb: Reorganize macb_mii bringup

2018-03-13 Thread Brad Mouring
The macb mii setup (mii_probe() and mii_init()) previously was
somewhat interspersed, likely a result of organic growth and hacking.

This change moves mii bus registration into mii_init and probing the
bus for devices into mii_probe.

Signed-off-by: Brad Mouring 
Suggested-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/cadence/macb_main.c | 79 +---
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index e84afcf1ecb5..4b514c705e57 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -472,8 +472,42 @@ static int macb_mii_probe(struct net_device *dev)
struct macb *bp = netdev_priv(dev);
struct macb_platform_data *pdata;
struct phy_device *phydev;
-   int phy_irq;
-   int ret;
+   struct device_node *np;
+   int phy_irq, ret, i;
+
+   pdata = dev_get_platdata(&bp->pdev->dev);
+   np = bp->pdev->dev.of_node;
+   ret = 0;
+
+   if (np) {
+   if (of_phy_is_fixed_link(np)) {
+   if (of_phy_register_fixed_link(np) < 0) {
+   dev_err(&bp->pdev->dev,
+   "broken fixed-link specification\n");
+   return -ENODEV;
+   }
+   bp->phy_node = of_node_get(np);
+   } else {
+   /* fallback to standard phy registration if no phy were
+* found during dt phy registration
+*/
+   if (!phy_find_first(bp->mii_bus)) {
+   for (i = 0; i < PHY_MAX_ADDR; i++) {
+   struct phy_device *phydev;
+
+   phydev = mdiobus_scan(bp->mii_bus, i);
+   if (IS_ERR(phydev) &&
+   PTR_ERR(phydev) != -ENODEV) {
+   ret = PTR_ERR(phydev);
+   break;
+   }
+   }
+
+   if (ret)
+   return -ENODEV;
+   }
+   }
+   }
 
if (bp->phy_node) {
phydev = of_phy_connect(dev, bp->phy_node,
@@ -488,7 +522,6 @@ static int macb_mii_probe(struct net_device *dev)
return -ENXIO;
}
 
-   pdata = dev_get_platdata(>pdev->dev);
if (pdata) {
if (gpio_is_valid(pdata->phy_irq_pin)) {
ret = devm_gpio_request(>pdev->dev,
@@ -533,7 +566,7 @@ static int macb_mii_init(struct macb *bp)
 {
struct macb_platform_data *pdata;
struct device_node *np;
-   int err = -ENXIO, i;
+   int err, i;
 
/* Enable management port */
macb_writel(bp, NCR, MACB_BIT(MPE));
@@ -556,39 +589,9 @@ static int macb_mii_init(struct macb *bp)
dev_set_drvdata(>dev->dev, bp->mii_bus);
 
np = bp->pdev->dev.of_node;
-   if (np) {
-   if (of_phy_is_fixed_link(np)) {
-   if (of_phy_register_fixed_link(np) < 0) {
-   dev_err(>pdev->dev,
-   "broken fixed-link specification\n");
-   goto err_out_unregister_bus;
-   }
-   bp->phy_node = of_node_get(np);
-
-   err = mdiobus_register(bp->mii_bus);
-   } else {
-   /* try dt phy registration */
-   err = of_mdiobus_register(bp->mii_bus, np);
-
-   /* fallback to standard phy registration if no phy were
-* found during dt phy registration
-*/
-   if (!err && !phy_find_first(bp->mii_bus)) {
-   for (i = 0; i < PHY_MAX_ADDR; i++) {
-   struct phy_device *phydev;
 
-   phydev = mdiobus_scan(bp->mii_bus, i);
-   if (IS_ERR(phydev) &&
-   PTR_ERR(phydev) != -ENODEV) {
-   err = PTR_ERR(phydev);
-   break;
-   }
-   }
-
-   if (err)
-   goto err_out_unregister_bus;
-   }
-   }
+   if (np) {
+   err = of_mdiobus_register(bp->mii_bus, np);

[PATCH v5 net-next 2/4] net: macb: Remove redundant poll irq assignment

2018-03-13 Thread Brad Mouring
In phy_device's general probe, this device will already be set for
phy register polling, rendering this code redundant.

Signed-off-by: Brad Mouring 
Suggested-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/cadence/macb_main.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 4b514c705e57..db1dc301bed3 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -566,7 +566,7 @@ static int macb_mii_init(struct macb *bp)
 {
struct macb_platform_data *pdata;
struct device_node *np;
-   int err, i;
+   int err;
 
/* Enable management port */
macb_writel(bp, NCR, MACB_BIT(MPE));
@@ -593,9 +593,6 @@ static int macb_mii_init(struct macb *bp)
if (np) {
err = of_mdiobus_register(bp->mii_bus, np);
} else {
-   for (i = 0; i < PHY_MAX_ADDR; i++)
-   bp->mii_bus->irq[i] = PHY_POLL;
-
if (pdata)
bp->mii_bus->phy_mask = pdata->phy_mask;
 
-- 
2.16.2



[PATCH v5 net-next 3/4] net: macb: Add phy-handle DT support

2018-03-13 Thread Brad Mouring
This optional binding (as described in the ethernet DT bindings doc)
directs the netdev to the phydev to use. This is useful for a phy
chip that has >1 phy in it, and two netdevs are using the same phy
chip (i.e. the second mac's phy lives on the first mac's MDIO bus)

The devicetree snippet would look something like this:

ethernet@feedf00d {
...
phy-handle = <&phy0> // the first netdev is physically wired to phy0
...
phy0: phy@0 {
...
reg = <0x0> // MDIO address 0
...
}
phy1: phy@1 {
...
reg = <0x1> // MDIO address 1
...
}
...
}

ethernet@deadbeef {
...
phy-handle = <&phy1> // tells the driver to use phy1 on the
 // first mac's mdio bus (it's 
wired thusly)
...
}

The work done to add the phy_node in the first place (dacdbb4dfc1a1:
"net: macb: add fixed-link node support") will consume the
device_node (if found).

Signed-off-by: Brad Mouring 
---
 drivers/net/ethernet/cadence/macb_main.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index db1dc301bed3..d09bd43680b3 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -488,10 +488,12 @@ static int macb_mii_probe(struct net_device *dev)
}
bp->phy_node = of_node_get(np);
} else {
-   /* fallback to standard phy registration if no phy were
-* found during dt phy registration
+   bp->phy_node = of_parse_phandle(np, "phy-handle", 0);
+   /* fallback to standard phy registration if no
+* phy-handle was found nor any phy found during
+* dt phy registration
 */
-   if (!phy_find_first(bp->mii_bus)) {
+   if (!bp->phy_node && !phy_find_first(bp->mii_bus)) {
for (i = 0; i < PHY_MAX_ADDR; i++) {
struct phy_device *phydev;
 
-- 
2.16.2



Re: [PATCH bpf-next v4 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-13 Thread Song Liu


> On Mar 12, 2018, at 3:47 PM, Song Liu  wrote:
> 
> 
> 
>> On Mar 12, 2018, at 2:31 PM, Alexei Starovoitov  wrote:
>> 
>> On 3/12/18 2:12 PM, Song Liu wrote:
>>> 
 On Mar 12, 2018, at 2:00 PM, Alexei Starovoitov  wrote:
 
 On 3/12/18 1:39 PM, Song Liu wrote:
> + page = find_get_page(vma->vm_file->f_mapping, 0);
 
 did you test it with config_debug_atomic_sleep ?
 it should have complained...
>>> 
>>> Yeah, I have CONFIG_DEBUG_ATOMIC_SLEEP=y.
>>> 
>>> I think find_get_page() will not sleep. The variation find_get_page_flags()
>>> may sleep with flag FGP_CREAT.
>> 
>> I see. gfp_mask == 0 and no locks. should work indeed.
>> curious how perf report looks like for heavy bpf_get_stackid() usage?
> 
> I modified samples/bpf/sampleip to only call bpf_get_stackid(). The following 
> is captured with bpf_get_stackid() called at 10k Hz. stressapptest is running
> with 16 threads on a system with 56 cores. 
> 
> 
> Samples: 1M of event 'cycles:pp', Event count (approx.): 628092326243
>  Overhead  Command  Shared Object Symbol
> +   51.61%  stressappteststressapptest [.] AdlerMemcpyC   
>   
> -   20.82%  stressapptest[kernel.vmlinux]  [k] 
> queued_spin_lock_slowpath
>   - queued_spin_lock_slowpath 
>  
>  - 20.80% pcpu_freelist_pop   
>  
>   bpf_get_stackid 
>  
>   bpf_get_stackid_tp  
>  
> - 0x590c  
>  
>  16.12% AdlerMemcpyC  
>  
>  4.50% OsLayer::CpuStressWorkload 
>  
> +   14.36%  stressappteststressapptest [.] 
> OsLayer::CpuStressWorkload
> -8.74%  stressapptest[kernel.vmlinux]  [k] _raw_spin_lock 
>
>   - _raw_spin_lock
>   
>  - 8.73% bpf_get_stackid  
>   
>   bpf_get_stackid_tp  
>   
> + 0x590c  
>   
> -0.67%  stressapptest[kernel.vmlinux]  [k] 
> pcpu_freelist_pop 
>   - pcpu_freelist_pop 
>   
>  - 0.67% bpf_get_stackid  
>   
>   bpf_get_stackid_tp  
>   
> + 0x590c
> 
> Seems lock contention is the dominating overhead here. This should be the same
> for original stackmap. 
> 
> Song


Samples: 172K of event 'cycles:pp', Event count (approx.): 102311012653
Overhead  CommandShared Object Symbol   
 
  78.84%  stressapptest  stressapptest [.] AdlerMemcpyC 
   
   8.78%  stressapptest  stressapptest [.] OsLayer::CpuStressWorkload   
   
   3.14%  stressapptest  [kernel.vmlinux]  [k] _raw_spin_lock   
   
   2.56%  stressapptest  stressapptest [.] WorkerThread::FillPage   
   
   0.45%  stressapptest  [kernel.vmlinux]  [k] perf_callchain_user  
   
   0.37%  stressapptest  [kernel.vmlinux]  [.] native_irq_return_iret   
   
   0.31%  stressapptest  [kernel.vmlinux]  [k] clear_page_erms  
   
   0.29%  stressapptest  [kernel.vmlinux]  [k] pcpu_freelist_pop
   
   0.27%  stressapptest  stressapptest [.] CalculateAdlerChecksum   
   
   0.25%  stressapptest  [kernel.vmlinux]  [k] bpf_get_stackid  
   
   0.22%  swapper[kernel.vmlinux]  [k] poll_idle

This perf output is taken with stressapptest running on 4 cores. 
bpf_get_stackid() and pcps_free_list_pop combined about 0.54% of CPU. 

Song





[pci PATCH v6 5/5] pci-pf-stub: Add PF driver stub for PFs that function only to enable VFs

2018-03-13 Thread Alexander Duyck
From: Alexander Duyck 

Add a new driver called "pci-pf-stub" to act as a "white-list" for PF
devices that provide no other functionality other then acting as a means of
allocating a set of VFs. For now I only have one example ID provided by
Amazon in terms of devices that require this functionality. The general
idea is that in the future we will see other devices added as vendors come
up with devices where the PF is more or less just a lightweight shim used
to allocate VFs.

Signed-off-by: Alexander Duyck 
---

v6: New driver to address concerns about Amazon devices left unsupported

 drivers/pci/Kconfig   |   12 +
 drivers/pci/Makefile  |2 ++
 drivers/pci/pci-pf-stub.c |   60 +
 include/linux/pci_ids.h   |2 ++
 4 files changed, 76 insertions(+)
 create mode 100644 drivers/pci/pci-pf-stub.c

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 34b56a8f8480..cdef2a2a9bc5 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -71,6 +71,18 @@ config PCI_STUB
 
  When in doubt, say N.
 
+config PCI_PF_STUB
+   tristate "PCI PF Stub driver"
+   depends on PCI
+   depends on PCI_IOV
+   help
+ Say Y or M here if you want to enable support for devices that
+ require SR-IOV support, while at the same time the PF itself is
+ not providing any actual services on the host itself such as
+ storage or networking.
+
+ When in doubt, say N.
+
 config XEN_PCIDEV_FRONTEND
 tristate "Xen PCI Frontend"
 depends on PCI && X86 && XEN
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 941970936840..4e133d3df403 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -43,6 +43,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
 
 obj-$(CONFIG_PCI_STUB) += pci-stub.o
 
+obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o
+
 obj-$(CONFIG_PCI_ECAM) += ecam.o
 
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
diff --git a/drivers/pci/pci-pf-stub.c b/drivers/pci/pci-pf-stub.c
new file mode 100644
index ..d218924d9bdb
--- /dev/null
+++ b/drivers/pci/pci-pf-stub.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/* pci-pf-stub - simple stub driver for PCI SR-IOV PF device
+ *
+ * This driver is meant to act as a "white-list" for devices that provde
+ * SR-IOV functionality while at the same time not actually needing a
+ * driver of their own.
+ */
+
+#include 
+#include 
+
+/**
+ * pci_pf_stub_white_list - White list of devices to bind pci-pf-stub onto
+ *
+ * This table provides the list of IDs this driver is supposed to bind
+ * onto. You could think of this as a list of "quirked" devices where we
+ * are adding support for SR-IOV here since there are no other drivers
+ * that they would be running under.
+ *
+ * Layout of the table below is as follows:
+ * { Vendor ID, Device ID,
+ *   SubVendor ID, SubDevice ID,
+ *   Class, Class Mask,
+ *   private data (not used) }
+ */
+static const struct pci_device_id pci_pf_stub_white_list[] = {
+   { PCI_VDEVICE(AMAZON, 0x0053) },
+   /* required last entry */
+   { 0 }
+};
+MODULE_DEVICE_TABLE(pci, pci_pf_stub_white_list);
+
+static int pci_pf_stub_probe(struct pci_dev *dev,
+const struct pci_device_id *id)
+{
+   pci_info(dev, "claimed by pci-pf-stub\n");
+   return 0;
+}
+
+static struct pci_driver pf_stub_driver = {
+   .name   = "pci-pf-stub",
+   .id_table   = pci_pf_stub_white_list,
+   .probe  = pci_pf_stub_probe,
+   .sriov_configure= pci_sriov_configure_simple,
+};
+
+static int __init pci_pf_stub_init(void)
+{
+   return pci_register_driver(&pf_stub_driver);
+}
+
+static void __exit pci_pf_stub_exit(void)
+{
+   pci_unregister_driver(&pf_stub_driver);
+}
+
+module_init(pci_pf_stub_init);
+module_exit(pci_pf_stub_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a6b30667a331..b10621896017 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2548,6 +2548,8 @@
 #define PCI_VENDOR_ID_CIRCUITCO0x1cc8
 #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD 0x0001
 
+#define PCI_VENDOR_ID_AMAZON   0x1d0f
+
 #define PCI_VENDOR_ID_TEKRAM   0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29
 



[pci PATCH v6 4/5] nvme: Migrate over to unmanaged SR-IOV support

2018-03-13 Thread Alexander Duyck
From: Alexander Duyck 

Instead of implementing our own version of a SR-IOV configuration stub in
the nvme driver we can just reuse the existing
pci_sriov_configure_simple function.

Signed-off-by: Alexander Duyck 
---

v5: Replaced call to pci_sriov_configure_unmanaged with
pci_sriov_configure_simple
v6: Dropped "#ifdef" checks for IOV wrapping sriov_configure definition

 drivers/nvme/host/pci.c |   20 +---
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5933a5c732e8..5e963058882a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2580,24 +2580,6 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_put_ctrl(>ctrl);
 }
 
-static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs)
-{
-   int ret = 0;
-
-   if (numvfs == 0) {
-   if (pci_vfs_assigned(pdev)) {
-   dev_warn(&pdev->dev,
-   "Cannot disable SR-IOV VFs while assigned\n");
-   return -EPERM;
-   }
-   pci_disable_sriov(pdev);
-   return 0;
-   }
-
-   ret = pci_enable_sriov(pdev, numvfs);
-   return ret ? ret : numvfs;
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int nvme_suspend(struct device *dev)
 {
@@ -2716,7 +2698,7 @@ static void nvme_error_resume(struct pci_dev *pdev)
.driver = {
.pm = &nvme_dev_pm_ops,
},
-   .sriov_configure = nvme_pci_sriov_configure,
+   .sriov_configure = pci_sriov_configure_simple,
.err_handler= &nvme_err_handler,
 };
 



[pci PATCH v6 3/5] ena: Migrate over to unmanaged SR-IOV support

2018-03-13 Thread Alexander Duyck
From: Alexander Duyck 

Instead of implementing our own version of a SR-IOV configuration stub in
the ena driver we can just reuse the existing
pci_sriov_configure_simple function.

Signed-off-by: Alexander Duyck 
---

v5: Replaced call to pci_sriov_configure_unmanaged with
pci_sriov_configure_simple
v6: Dropped "#ifdef" checks for IOV wrapping sriov_configure definition

 drivers/net/ethernet/amazon/ena/ena_netdev.c |   28 +-
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 6975150d144e..6054deb1e6aa 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -3385,32 +3385,6 @@ static int ena_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 }
 
 /*/
-static int ena_sriov_configure(struct pci_dev *dev, int numvfs)
-{
-   int rc;
-
-   if (numvfs > 0) {
-   rc = pci_enable_sriov(dev, numvfs);
-   if (rc != 0) {
-   dev_err(&dev->dev,
-   "pci_enable_sriov failed to enable: %d vfs with 
the error: %d\n",
-   numvfs, rc);
-   return rc;
-   }
-
-   return numvfs;
-   }
-
-   if (numvfs == 0) {
-   pci_disable_sriov(dev);
-   return 0;
-   }
-
-   return -EINVAL;
-}
-
-/*/
-/*/
 
 /* ena_remove - Device Removal Routine
  * @pdev: PCI device information struct
@@ -3525,7 +3499,7 @@ static int ena_resume(struct pci_dev *pdev)
.suspend= ena_suspend,
.resume = ena_resume,
 #endif
-   .sriov_configure = ena_sriov_configure,
+   .sriov_configure = pci_sriov_configure_simple,
 };
 
 static int __init ena_init(void)



[pci PATCH v6 2/5] virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices

2018-03-13 Thread Alexander Duyck
From: Alexander Duyck 

Hardware-realized virtio_pci devices can implement SR-IOV, so this
patch enables its use. The device in question is an upcoming Intel
NIC that implements both a virtio_net PF and virtio_net VFs. These
are hardware realizations of what has been up to now been a software
interface.

The device in question has the following 4-part PCI IDs:

PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe
VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe

The patch currently needs no check for device ID, because the callback
will never be made for devices that do not assert the capability or
when run on a platform incapable of SR-IOV.

One reason for this patch is because the hardware requires the
vendor ID of a VF to be the same as the vendor ID of the PF that
created it. So it seemed logical to simply have a fully-functioning
virtio_net PF create the VFs. This patch makes that possible.

Signed-off-by: Mark Rustad 
Signed-off-by: Alexander Duyck 
---

v4: Dropped call to pci_disable_sriov in virtio_pci_remove function
v5: Replaced call to pci_sriov_configure_unmanaged with
pci_sriov_configure_simple
v6: Dropped "#ifdef" checks for IOV wrapping sriov_configure definition

 drivers/virtio/virtio_pci_common.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 48d4d1cf1cb6..67a227fd7aa0 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -596,6 +596,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 #ifdef CONFIG_PM_SLEEP
.driver.pm  = &virtio_pci_pm_ops,
 #endif
+   .sriov_configure = pci_sriov_configure_simple,
 };
 
 module_pci_driver(virtio_pci_driver);



[pci PATCH v6 1/5] pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources

2018-03-13 Thread Alexander Duyck
From: Alexander Duyck 

This patch adds a common configuration function called
pci_sriov_configure_simple that will allow for managing VFs on devices
where the PF is not capable of managing VF resources.

Signed-off-by: Alexander Duyck 
---

v5: New patch replacing pci_sriov_configure_unmanaged with
  pci_sriov_configure_simple
Dropped bits related to autoprobe changes
v6: Defined pci_sriov_configure_simple as NULL if IOV is disabled

 drivers/pci/iov.c   |   32 
 include/linux/pci.h |3 +++
 2 files changed, 35 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924ae0350..bd7021491fdb 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -807,3 +807,35 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
return dev->sriov->total_VFs;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
+
+/**
+ * pci_sriov_configure_simple - helper to configure unmanaged SR-IOV
+ * @dev: the PCI device
+ * @nr_virtfn: number of virtual functions to enable, 0 to disable
+ *
+ * Used to provide generic enable/disable SR-IOV option for devices
+ * that do not manage the VFs generated by their driver
+ */
+int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
+{
+   int err = -EINVAL;
+
+   might_sleep();
+
+   if (!dev->is_physfn)
+   return -ENODEV;
+
+   if (pci_vfs_assigned(dev)) {
+   pci_warn(dev,
+"Cannot modify SR-IOV while VFs are assigned\n");
+   err = -EPERM;
+   } else if (!nr_virtfn) {
+   sriov_disable(dev);
+   err = 0;
+   } else if (!dev->sriov->num_VFs) {
+   err = sriov_enable(dev, nr_virtfn);
+   }
+
+   return err ? err : nr_virtfn;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 024a1beda008..f3099e940cda 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1953,6 +1953,7 @@ static inline void pci_mmcfg_late_init(void) { }
 int pci_vfs_assigned(struct pci_dev *dev);
 int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
 int pci_sriov_get_totalvfs(struct pci_dev *dev);
+int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
 resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
 void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
 #else
@@ -1980,6 +1981,8 @@ static inline int pci_sriov_set_totalvfs(struct pci_dev 
*dev, u16 numvfs)
 { return 0; }
 static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
 { return 0; }
+/* since this expected to be used as a function pointer just define as NULL */
+#define pci_sriov_configure_simple NULL
 static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int 
resno)
 { return 0; }
 static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { 
}



[pci PATCH v6 0/5] Add support for unmanaged SR-IOV

2018-03-13 Thread Alexander Duyck
This series is meant to add support for SR-IOV on devices when the VFs are
not managed by the kernel. Examples of recent patches attempting to do this
include:
virto - https://patchwork.kernel.org/patch/10241225/
pci-stub - https://patchwork.kernel.org/patch/10109935/
vfio - https://patchwork.kernel.org/patch/10103353/
uio - https://patchwork.kernel.org/patch/9974031/

Since this is quickly blowing up into a multi-driver problem it is probably
best to implement this solution as generically as possible.

This series is an attempt to do that. What we do with this patch set is
provide a generic framework to enable SR-IOV in the case that the PF driver
doesn't support managing the VFs itself.

I based my patch set originally on the patch by Mark Rustad but there isn't
much left after going through and cleaning out the bits that were no longer
needed, and after incorporating the feedback from David Miller. At this point
the only items to be fully reused was his patch description which is now
present in patch 3 of the set.

This solution is limited in scope to just adding support for devices that
provide no functionality for SR-IOV other than allocating the VFs by
calling pci_enable_sriov. Previous sets had included patches for VFIO, but
for now I am dropping that as the scope of that work is larger then I
think I can take on at this time.

v2: Reduced scope back to just virtio_pci and vfio-pci
Broke into 3 patch set from single patch
Changed autoprobe behavior to always set when num_vfs is set non-zero
v3: Updated Documentation to clarify when sriov_unmanaged_autoprobe is used
Wrapped vfio_pci_sriov_configure to fix build errors w/o SR-IOV in kernel
v4: Dropped vfio-pci patch
Added ena and nvme to drivers now using pci_sriov_configure_unmanaged
Dropped pci_disable_sriov call in virtio_pci to be consistent with ena
v5: Dropped sriov_unmanaged_autoprobe and pci_sriov_conifgure_unmanaged
Added new patch that enables pci_sriov_configure_simple
Updated drivers to use pci_sriov_configure_simple
v6: Defined pci_sriov_configure_simple as NULL when SR-IOV is not enabled
Updated drivers to drop "#ifdef" checks for IOV
Added pci-pf-stub as place for PF-only drivers to add support

Cc: Mark Rustad 
Cc: Maximilian Heyne 
Cc: Liang-Min Wang 
Cc: David Woodhouse 

---

Alexander Duyck (5):
  pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources
  virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices
  ena: Migrate over to unmanaged SR-IOV support
  nvme: Migrate over to unmanaged SR-IOV support
  pci-pf-stub: Add PF driver stub for PFs that function only to enable VFs


 drivers/net/ethernet/amazon/ena/ena_netdev.c |   28 
 drivers/nvme/host/pci.c  |   20 -
 drivers/pci/Kconfig  |   12 +
 drivers/pci/Makefile |2 +
 drivers/pci/iov.c|   32 ++
 drivers/pci/pci-pf-stub.c|   60 ++
 drivers/virtio/virtio_pci_common.c   |1 
 include/linux/pci.h  |3 +
 include/linux/pci_ids.h  |2 +
 9 files changed, 114 insertions(+), 46 deletions(-)
 create mode 100644 drivers/pci/pci-pf-stub.c

--


[PATCH iproute2 1/1] tc: use get_u32() in psample action to match types

2018-03-13 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 tc/m_sample.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tc/m_sample.c b/tc/m_sample.c
index ff5ee6bd1ef6..dff986f5 100644
--- a/tc/m_sample.c
+++ b/tc/m_sample.c
@@ -65,7 +65,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
char ***argv_p,
while (argc > 0) {
if (matches(*argv, "rate") == 0) {
NEXT_ARG();
-   if (get_unsigned(&rate, *argv, 10) != 0) {
+   if (get_u32(&rate, *argv, 10) != 0) {
fprintf(stderr, "Illegal rate %s\n", *argv);
usage();
return -1;
@@ -73,7 +73,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
char ***argv_p,
rate_set = true;
} else if (matches(*argv, "group") == 0) {
NEXT_ARG();
-   if (get_unsigned(&group, *argv, 10) != 0) {
+   if (get_u32(&group, *argv, 10) != 0) {
fprintf(stderr, "Illegal group num %s\n",
*argv);
usage();
@@ -82,7 +82,7 @@ static int parse_sample(struct action_util *a, int *argc_p, 
char ***argv_p,
group_set = true;
} else if (matches(*argv, "trunc") == 0) {
NEXT_ARG();
-   if (get_unsigned(&trunc, *argv, 10) != 0) {
+   if (get_u32(&trunc, *argv, 10) != 0) {
fprintf(stderr, "Illegal truncation size %s\n",
*argv);
usage();
-- 
2.7.4



  1   2   3   >