[ovs-dev] [PATCH v18] Improved Packet Drop Statistics in OVS

2019-12-17 Thread Anju Thomas via dev
Currently OVS maintains explicit packet drop/error counters only on port
level.  Packets that are dropped as part of normal OpenFlow processing
are counted in flow stats of “drop” flows or as table misses in table
stats. These can only be interpreted by controllers that know the
semantics of the configured OpenFlow pipeline.  Without that knowledge,
it is impossible for an OVS user to obtain e.g. the total number of
packets dropped due to OpenFlow rules.

Furthermore, there are numerous other reasons for which packets can be
dropped by OVS slow path that are not related to the OpenFlow pipeline.
The generated datapath flow entries include a drop action to avoid
further expensive upcalls to the slow path, but subsequent packets
dropped by the datapath are not accounted anywhere.

Finally, the datapath itself drops packets in certain error situations.
Also, these drops are today not accounted for.This makes it difficult
for OVS users to monitor packet drop in an OVS instance and to alert a
management system in case of a unexpected increase of such drops.
AlsoOVS trouble-shooters face difficulties in analysing packet drops.

With this patch we implement following changes to address the issues
mentioned above.

1. Identify and account all the silent packet drop scenarios

2. Display these drops in ovs-appctl coverage/show

Co-authored-by: Rohith Basavaraja 
Co-authored-by: Keshav Gupta 
Signed-off-by: Anju Thomas 
Signed-off-by: Rohith Basavaraja 
Signed-off-by: Keshav Gupta 
Acked-by: Eelco Chaudron size
 3. Add version history

v12(Ben's comments)
 1. new dp action in kernel block in openvswitch.h
 2. change xlate_error enum to be used as u32
 3. resetting xlate error in case of congestion drop
and forwarding disabled

Signed-off-by: Anju Thomas 
---
 datapath/linux/compat/include/linux/openvswitch.h |  24 +++
 lib/dpif-netdev.c |  44 -
 lib/dpif.c|   7 +
 lib/dpif.h|   1 +
 lib/odp-execute.c |  78 +
 lib/odp-util.c|   5 +
 ofproto/ofproto-dpif-ipfix.c  |   1 +
 ofproto/ofproto-dpif-sflow.c  |   1 +
 ofproto/ofproto-dpif-xlate.c  |  33 +++-
 ofproto/ofproto-dpif-xlate.h  |  13 --
 ofproto/ofproto-dpif.c|  10 ++
 ofproto/ofproto-dpif.h|   8 +-
 tests/automake.mk |   3 +-
 tests/dpif-netdev.at  |   8 +
 tests/drop-stats.at   | 190 ++
 tests/ofproto-dpif.at |   2 +-
 tests/testsuite.at|   1 +
 tests/tunnel-push-pop.at  |  28 +++-
 tests/tunnel.at   |  16 +-
 vswitchd/vswitch.xml  |   5 +
 20 files changed, 454 insertions(+), 24 deletions(-)
 create mode 100644 tests/drop-stats.at

diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
b/datapath/linux/compat/include/linux/openvswitch.h
index 778827f..fef1bc8 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -407,6 +407,28 @@ enum ovs_tunnel_key_attr {
 #define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1)
 
 /**
+ * enum xlate_error -  Different types of error during translation
+ */
+
+#ifndef __KERNEL__
+enum xlate_error {
+   XLATE_OK = 0,
+   XLATE_BRIDGE_NOT_FOUND,
+   XLATE_RECURSION_TOO_DEEP,
+   XLATE_TOO_MANY_RESUBMITS,
+   XLATE_STACK_TOO_DEEP,
+   XLATE_NO_RECIRCULATION_CONTEXT,
+   XLATE_RECIRCULATION_CONFLICT,
+   XLATE_TOO_MANY_MPLS_LABELS,
+   XLATE_INVALID_TUNNEL_METADATA,
+   XLATE_UNSUPPORTED_PACKET_TYPE,
+   XLATE_CONGESTION_DROP,
+   XLATE_FORWARDING_DISABLED,
+   XLATE_MAX,
+};
+#endif
+
+/**
  * enum ovs_frag_type - IPv4 and IPv6 fragment type
  * @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
  * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0.
@@ -962,6 +984,7 @@ struct check_pkt_len_arg {
  * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set
  * of actions if greater than the specified packet length, else execute
  * another set of actions.
+ * @OVS_ACTION_ATTR_DROP: Explicit drop action.
  */
 
 enum ovs_action_attr {
@@ -994,6 +1017,7 @@ enum ovs_action_attr {
 #ifndef __KERNEL__
OVS_ACTION_ATTR_TUNNEL_PUSH,   /* struct ovs_action_push_tnl*/
OVS_ACTION_ATTR_TUNNEL_POP,/* u32 port number. */
+   OVS_ACTION_ATTR_DROP,  /* u32 xlate_error. */
 #endif
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3f21211..f7f1c43 1006

Re: [ovs-dev] [PATCH 3/3] system-afxdp.at: Add test for infinite re-addition of failed ports.

2019-12-17 Thread William Tu
On Tue, Dec 17, 2019 at 4:34 PM Ilya Maximets  wrote:
>
> On 09.12.2019 21:42, William Tu wrote:
> > On Sat, Dec 07, 2019 at 03:46:18PM +0100, Ilya Maximets wrote:
> >> New file created for AF_XDP specific tests.
> >>
> >> Signed-off-by: Ilya Maximets 
> > tested and LGTM, one minor comment.
> >
> > Acked-by: William Tu 
> >> ---
> >>  tests/automake.mk   |  3 ++-
> >>  tests/system-afxdp-testsuite.at |  1 +
> >>  tests/system-afxdp.at   | 24 
> >>  3 files changed, 27 insertions(+), 1 deletion(-)
> >>  create mode 100644 tests/system-afxdp.at
> >>
> >> diff --git a/tests/automake.mk b/tests/automake.mk
> >> index 4bf8f00d5..529eab54e 100644
> >> --- a/tests/automake.mk
> >> +++ b/tests/automake.mk
> >> @@ -156,7 +156,8 @@ SYSTEM_USERSPACE_TESTSUITE_AT = \
> >>  SYSTEM_AFXDP_TESTSUITE_AT = \
> >>  tests/system-userspace-macros.at \
> >>  tests/system-afxdp-testsuite.at \
> >> -tests/system-afxdp-macros.at
> >> +tests/system-afxdp-macros.at \
> >> +tests/system-afxdp.at
> >>
> >>  SYSTEM_TESTSUITE_AT = \
> >>  tests/system-common-macros.at \
> >> diff --git a/tests/system-afxdp-testsuite.at 
> >> b/tests/system-afxdp-testsuite.at
> >> index 9b7a29066..22fecf063 100644
> >> --- a/tests/system-afxdp-testsuite.at
> >> +++ b/tests/system-afxdp-testsuite.at
> >> @@ -24,3 +24,4 @@ m4_include([tests/system-userspace-macros.at])
> >>  m4_include([tests/system-afxdp-macros.at])
> >>
> >>  m4_include([tests/system-traffic.at])
> >> +m4_include([tests/system-afxdp.at])
> > should we move system-afxdp.at before system-traffic?
> > So these more basic test cases get triggered first.
>
> This might make sense.  I thought about this before sending the patch.
> Will change before applying.

OK thanks!
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH net-next v2] openvswitch: add TTL decrement action

2019-12-17 Thread Pravin Shelar
On Tue, Dec 17, 2019 at 7:51 AM Matteo Croce  wrote:
>
> New action to decrement TTL instead of setting it to a fixed value.
> This action will decrement the TTL and, in case of expired TTL, drop it
> or execute an action passed via a nested attribute.
> The default TTL expired action is to drop the packet.
>
> Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
>
> Tested with a corresponding change in the userspace:
>
> # ovs-dpctl dump-flows
> in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
> actions:dec_ttl{ttl<=1 action:(drop)},1,1
> in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
> actions:dec_ttl{ttl<=1 action:(drop)},1,2
> in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, 
> actions:2
> in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, 
> actions:1
>
> # ping -c1 192.168.0.2 -t 42
> IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), 
> length 84)
> 192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
> # ping -c1 192.168.0.2 -t 120
> IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), 
> length 84)
> 192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
> # ping -c1 192.168.0.2 -t 1
> #
>
> Co-authored-by: Bindiya Kurle 
> Signed-off-by: Bindiya Kurle 
> Signed-off-by: Matteo Croce 
> ---
>  include/uapi/linux/openvswitch.h |  22 +++
>  net/openvswitch/actions.c|  71 +
>  net/openvswitch/flow_netlink.c   | 105 +++
>  3 files changed, 198 insertions(+)
>
> diff --git a/include/uapi/linux/openvswitch.h 
> b/include/uapi/linux/openvswitch.h
> index a87b44cd5590..b6684bc04883 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -927,6 +927,7 @@ enum ovs_action_attr {
> OVS_ACTION_ATTR_METER,/* u32 meter ID. */
> OVS_ACTION_ATTR_CLONE,/* Nested OVS_CLONE_ATTR_*.  */
> OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
> +   OVS_ACTION_ATTR_DEC_TTL,   /* Nested OVS_DEC_TTL_ATTR_*. */
>
> __OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
>* from userspace. */
> @@ -939,6 +940,23 @@ enum ovs_action_attr {
>  };
>
>  #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
> +enum ovs_dec_ttl_attr {
> +   OVS_DEC_TTL_ATTR_UNSPEC,
> +   OVS_DEC_TTL_ATTR_ACTION_TYPE,/* Action Type u32 */
> +   OVS_DEC_TTL_ATTR_ACTION, /* nested action */
> +   __OVS_DEC_TTL_ATTR_MAX,
> +#ifdef __KERNEL__
> +   OVS_DEC_TTL_ATTR_ARG  /* struct sample_arg  */
> +#endif
> +};
> +

I do not see need for type or OVS_DEC_TTL_ACTION_DROP, if there are no
nested action the datapath can drop the packet.

> +#ifdef __KERNEL__
> +struct dec_ttl_arg {
> +   u32 action_type;/* dec_ttl action type.*/
> +};
> +#endif
> +
> +#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1)
>
>  /* Meters. */
>  #define OVS_METER_FAMILY  "ovs_meter"
> @@ -1009,6 +1027,10 @@ enum ovs_ct_limit_attr {
> __OVS_CT_LIMIT_ATTR_MAX
>  };
>
> +enum ovs_dec_ttl_action {/*Actions supported by dec_ttl */
> +   OVS_DEC_TTL_ACTION_DROP,
> +   OVS_DEC_TTL_ACTION_USER_SPACE
> +};
>  #define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
>
>  #define OVS_ZONE_LIMIT_DEFAULT_ZONE -1
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 4c8395462303..5329668732b1 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -960,6 +960,31 @@ static int output_userspace(struct datapath *dp, struct 
> sk_buff *skb,
> return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
>  }
>
> +static int dec_ttl(struct datapath *dp, struct sk_buff *skb,
> +  struct sw_flow_key *fk, const struct nlattr *attr, bool 
> last)
> +{
> +   struct nlattr *actions;
> +   struct nlattr *dec_ttl_arg;
> +   int rem = nla_len(attr);
> +   const struct dec_ttl_arg *arg;
> +
> +   /* The first action is always OVS_DEC_TTL_ATTR_ARG. */
> +   dec_ttl_arg = nla_data(attr);
> +   arg = nla_data(dec_ttl_arg);
> +   actions = nla_next(dec_ttl_arg, &rem);
> +
> +   switch (arg->action_type) {
> +   case OVS_DEC_TTL_ACTION_DROP:
> +   consume_skb(skb);
> +   break;
> +
> +   case OVS_DEC_TTL_ACTION_USER_SPACE:
> +   return clone_execute(dp, skb, fk, 0, actions, rem, last, 
> false);
> +   }
> +
> +   return 0;
> +}
> +
>  /* When 'last' is true, sample() should always consume the 'skb'.
>   * Otherwise, sample() should keep 'skb' intact regardless what
>   * actions are executed within sample().
> @@ -1176,6 +1201,44 @@ static int execute_check_pkt_len(struct datapath *dp, 
> struct sk_buff *skb,
>  

Re: [ovs-dev] [PATCH v2] Use batch process recv for tap and raw socket in netdev datapath

2019-12-17 Thread 0-day Robot
Bleep bloop.  Greetings , I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Unexpected sign-offs from developers who are not authors or co-authors 
or committers: Ben Pfaff 
Lines checked: 302, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] Use batch process recv for tap and raw socket in netdev datapath

2019-12-17 Thread yang_y_yi
Hi, William


I used OVS DPDK to test it, you shouldn't add tap interface to ovs DPDK bridge 
if you use vdev to add tap, virtio_user is just for it, but that won't use this 
receive function to receive packets.

At 2019-12-17 02:55:50, "William Tu"  wrote:
>On Fri, Dec 06, 2019 at 02:09:24AM -0500, yang_y...@163.com wrote:
>> From: Yi Yang 
>> 
>> Current netdev_linux_rxq_recv_tap and netdev_linux_rxq_recv_sock
>> just receive single packet, that is very inefficient, per my test
>> case which adds two tap ports or veth ports into OVS bridge
>> (datapath_type=netdev) and use iperf3 to do performance test
>> between two ports (they are set into different network name space).
>> 
>> The result is as below:
>> 
>>   tap:  295 Mbits/sec
>>   veth: 207 Mbits/sec
>> 
>> After I change netdev_linux_rxq_recv_tap and
>> netdev_linux_rxq_recv_sock to use batch process, the performance
>> is boosted by about 7 times, here is the result:
>> 
>>   tap:  1.96 Gbits/sec
>>   veth: 1.47 Gbits/sec
>> 
>> Undoubtedly this is a huge improvement although it can't match
>> OVS kernel datapath yet.
>> 
>> FYI: here is thr result for OVS kernel datapath:
>> 
>>   tap:  37.2 Gbits/sec
>>   veth: 36.3 Gbits/sec
>> 
>> Note: performance result is highly related with your test machine
>> , you shouldn't expect the same results on your test machine.
>> 
>> Signed-off-by: Yi Yang 
>
>Hi Yi Yang,
>
>Are you testing this using OVS-DPDK?
>If you're using OVS-DPDK, then you should use DPDK's vdev to
>open and attach tap/veth device to OVS. I think you'll see much
>better performance.
>
>The performance issue you pointed out only happens when using
>userspace datapath without DPDK library, where afxdp is used.
>I'm still looking for a better solutions for faster interface
>for veth (af_packet) and tap.
>
>Thanks
>William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] Use batch process recv for tap and raw socket in netdev datapath

2019-12-17 Thread yang_y_yi
From: Yi Yang 

Current netdev_linux_rxq_recv_tap and netdev_linux_rxq_recv_sock
just receive single packet, that is very inefficient, per my test
case which adds two tap ports or veth ports into OVS bridge
(datapath_type=netdev) and use iperf3 to do performance test
between two ports (they are set into different network name space).

The result is as below:

  tap:  295 Mbits/sec
  veth: 207 Mbits/sec

After I change netdev_linux_rxq_recv_tap and
netdev_linux_rxq_recv_sock to use batch process, the performance
is boosted by about 7 times, here is the result:

  tap:  1.96 Gbits/sec
  veth: 1.47 Gbits/sec

Undoubtedly this is a huge improvement although it can't match
OVS kernel datapath yet.

FYI: here is thr result for OVS kernel datapath:

  tap:  37.2 Gbits/sec
  veth: 36.3 Gbits/sec

Note: performance result is highly related with your test machine
, you shouldn't expect the same results on your test machine.

Changes since v1:
  - Add fix from Ben Pfaff

Signed-off-by: Yi Yang 
Signed-off-by: Ben Pfaff 
---
 include/sparse/sys/socket.h |   3 +
 lib/netdev-linux.c  | 167 +---
 2 files changed, 112 insertions(+), 58 deletions(-)

diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h
index 4178f57..d3c3611 100644
--- a/include/sparse/sys/socket.h
+++ b/include/sparse/sys/socket.h
@@ -27,6 +27,7 @@
 
 typedef unsigned short int sa_family_t;
 typedef __socklen_t socklen_t;
+struct timespec;
 
 struct sockaddr {
 sa_family_t sa_family;
@@ -163,6 +164,8 @@ ssize_t recvmsg(int, struct msghdr *, int);
 ssize_t send(int, const void *, size_t, int);
 ssize_t sendmsg(int, const struct msghdr *, int);
 int sendmmsg(int, struct mmsghdr *, unsigned int, unsigned int);
+int recvmmsg(int, struct mmsghdr *, unsigned int,
+ unsigned int, struct timespec *);
 ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *,
socklen_t);
 int setsockopt(int, int, int, const void *, socklen_t);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index f8e59ba..3414a64 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1151,90 +1151,147 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
 }
 
+/*
+ * Receive packets from raw socket in batch process for better performance,
+ * it can receive NETDEV_MAX_BURST packets at most once, the received
+ * packets are added into *batch. The return value is 0 or errno.
+ *
+ * It also used recvmmsg to reduce multiple syscalls overhead;
+ */
 static int
-netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
+netdev_linux_batch_rxq_recv_sock(int fd, int mtu,
+ struct dp_packet_batch *batch)
 {
 size_t size;
 ssize_t retval;
-struct iovec iov;
+struct iovec iovs[NETDEV_MAX_BURST];
 struct cmsghdr *cmsg;
 union {
 struct cmsghdr cmsg;
 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
-} cmsg_buffer;
-struct msghdr msgh;
-
-/* Reserve headroom for a single VLAN tag */
-dp_packet_reserve(buffer, VLAN_HEADER_LEN);
-size = dp_packet_tailroom(buffer);
-
-iov.iov_base = dp_packet_data(buffer);
-iov.iov_len = size;
-msgh.msg_name = NULL;
-msgh.msg_namelen = 0;
-msgh.msg_iov = &iov;
-msgh.msg_iovlen = 1;
-msgh.msg_control = &cmsg_buffer;
-msgh.msg_controllen = sizeof cmsg_buffer;
-msgh.msg_flags = 0;
+} cmsg_buffers[NETDEV_MAX_BURST];
+struct mmsghdr mmsgs[NETDEV_MAX_BURST];
+struct dp_packet *buffers[NETDEV_MAX_BURST];
+int i;
+
+for (i = 0; i < NETDEV_MAX_BURST; i++) {
+ buffers[i] = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
+  DP_NETDEV_HEADROOM);
+ /* Reserve headroom for a single VLAN tag */
+ dp_packet_reserve(buffers[i], VLAN_HEADER_LEN);
+ size = dp_packet_tailroom(buffers[i]);
+ iovs[i].iov_base = dp_packet_data(buffers[i]);
+ iovs[i].iov_len = size;
+ mmsgs[i].msg_hdr.msg_name = NULL;
+ mmsgs[i].msg_hdr.msg_namelen = 0;
+ mmsgs[i].msg_hdr.msg_iov = &iovs[i];
+ mmsgs[i].msg_hdr.msg_iovlen = 1;
+ mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
+ mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
+ mmsgs[i].msg_hdr.msg_flags = 0;
+}
 
 do {
-retval = recvmsg(fd, &msgh, MSG_TRUNC);
+retval = recvmmsg(fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
 } while (retval < 0 && errno == EINTR);
 
 if (retval < 0) {
-return errno;
-} else if (retval > size) {
-return EMSGSIZE;
+/* Save -errno to retval temporarily */
+retval = -errno;
+i = 0;
+goto free_buffers;
 }
 
-dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
-
-for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR

[ovs-dev] 答复: [PATCH] Use batch process recv for tap and raw socket in netdev datapath

2019-12-17 Thread 杨�D
Ben, thank for your review, for recvmmsg, we have to prepare some buffers
for it, but we have no way to know how many packets are there for socket, so
these mallocs are must-have overhead, maybe self-adaptive malloc mechanism
is better, for example, the first receive just mallocs 4 buffers, if it
receives 4 buffers successfully, we can increase it to 8, till it is up to
32, if it can't receive all the buffers, we can decrease it by one half, but
this will make code complicated a bit.

Your fix is right, I should be set to 0 when retval < 0, thank for your
review again, I'll update it with your fix patch and send another version.

-邮件原件-
发件人: Ben Pfaff [mailto:b...@ovn.org] 
发送时间: 2019年12月18日 4:14
收件人: yang_y...@163.com
抄送: ovs-dev@openvswitch.org; ian.sto...@intel.com; Yi Yang (杨�D)-云服务集
团 
主题: Re: [PATCH] Use batch process recv for tap and raw socket in netdev
datapath

On Fri, Dec 06, 2019 at 02:09:24AM -0500, yang_y...@163.com wrote:
> From: Yi Yang 
> 
> Current netdev_linux_rxq_recv_tap and netdev_linux_rxq_recv_sock just 
> receive single packet, that is very inefficient, per my test case 
> which adds two tap ports or veth ports into OVS bridge
> (datapath_type=netdev) and use iperf3 to do performance test between 
> two ports (they are set into different network name space).

Thanks for the patch!  This is an impressive performance improvement!

Each call to netdev_linux_batch_rxq_recv_sock() now calls malloc() 32 times.
This is expensive if only a few packets (or none) are received.
Maybe it doesn't matter, but I wonder whether it affects performance.

I think that no packets are freed on error.  Fix:

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index
9cb45d5c7d29..3414a6495ced 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1198,6 +1198,7 @@ netdev_linux_batch_rxq_recv_sock(int fd, int mtu,
 if (retval < 0) {
 /* Save -errno to retval temporarily */
 retval = -errno;
+i = 0;
 goto free_buffers;
 }
 

To get sparse to work, one must fold in the following:

diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h index
4178f57e2bda..e954ade714b5 100644
--- a/include/sparse/sys/socket.h
+++ b/include/sparse/sys/socket.h
@@ -27,6 +27,7 @@
 
 typedef unsigned short int sa_family_t;  typedef __socklen_t socklen_t;
+struct timespec;
 
 struct sockaddr {
 sa_family_t sa_family;
@@ -171,4 +172,7 @@ int sockatmark(int);  int socket(int, int, int);  int
socketpair(int, int, int, int[2]);
 
+int sendmmsg(int, struct mmsghdr *, unsigned int, int); int 
+recvmmsg(int, struct mmsghdr *, unsigned int, int, struct timespec *);
+
 #endif /*  for sparse */
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 3/3] system-afxdp.at: Add test for infinite re-addition of failed ports.

2019-12-17 Thread Ilya Maximets
On 09.12.2019 21:42, William Tu wrote:
> On Sat, Dec 07, 2019 at 03:46:18PM +0100, Ilya Maximets wrote:
>> New file created for AF_XDP specific tests.
>>
>> Signed-off-by: Ilya Maximets 
> tested and LGTM, one minor comment.
> 
> Acked-by: William Tu 
>> ---
>>  tests/automake.mk   |  3 ++-
>>  tests/system-afxdp-testsuite.at |  1 +
>>  tests/system-afxdp.at   | 24 
>>  3 files changed, 27 insertions(+), 1 deletion(-)
>>  create mode 100644 tests/system-afxdp.at
>>
>> diff --git a/tests/automake.mk b/tests/automake.mk
>> index 4bf8f00d5..529eab54e 100644
>> --- a/tests/automake.mk
>> +++ b/tests/automake.mk
>> @@ -156,7 +156,8 @@ SYSTEM_USERSPACE_TESTSUITE_AT = \
>>  SYSTEM_AFXDP_TESTSUITE_AT = \
>>  tests/system-userspace-macros.at \
>>  tests/system-afxdp-testsuite.at \
>> -tests/system-afxdp-macros.at
>> +tests/system-afxdp-macros.at \
>> +tests/system-afxdp.at
>>  
>>  SYSTEM_TESTSUITE_AT = \
>>  tests/system-common-macros.at \
>> diff --git a/tests/system-afxdp-testsuite.at 
>> b/tests/system-afxdp-testsuite.at
>> index 9b7a29066..22fecf063 100644
>> --- a/tests/system-afxdp-testsuite.at
>> +++ b/tests/system-afxdp-testsuite.at
>> @@ -24,3 +24,4 @@ m4_include([tests/system-userspace-macros.at])
>>  m4_include([tests/system-afxdp-macros.at])
>>  
>>  m4_include([tests/system-traffic.at])
>> +m4_include([tests/system-afxdp.at])
> should we move system-afxdp.at before system-traffic?
> So these more basic test cases get triggered first.

This might make sense.  I thought about this before sending the patch.
Will change before applying.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v6] netdev-dpdk: add support for the RTE_ETH_EVENT_INTR_RESET event

2019-12-17 Thread Ilya Maximets
On 11.11.2019 15:02, Eelco Chaudron wrote:
> Currently, OVS does not register and therefore not handle the
> interface reset event from the DPDK framework. This would cause a
> problem in cases where a VF is used as an interface, and its
> configuration changes.
> 
> As an example in the following scenario the MAC change is not
> detected/acted upon until OVS is restarted without the patch applied:
> 
>   $ echo 1 > /sys/bus/pci/devices/:05:00.1/sriov_numvfs
>   $ ovs-vsctl add-port ovs_pvp_br0 dpdk0 -- \
> set Interface dpdk0 type=dpdk -- \
> set Interface dpdk0 options:dpdk-devargs=:05:0a.0
> 
>   $ ip link set p5p2 vf 0 mac 52:54:00:92:d3:33
> 
> Requires patch "bridge: Allow manual notifications about interfaces' updates."
> 
> Signed-off-by: Eelco Chaudron 

Thanks! Applied to master.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] bridge: Allow manual notifications about interfaces' updates.

2019-12-17 Thread Ilya Maximets
On 17.12.2019 19:39, Ilya Maximets wrote:
> On 17.12.2019 19:25, Ben Pfaff wrote:
>> On Tue, Dec 17, 2019 at 02:45:03PM +0100, Eelco Chaudron wrote:
>>>
>>>
>>> On 6 Nov 2019, at 14:32, Eelco Chaudron wrote:
>>>
 On 5 Nov 2019, at 18:20, Ilya Maximets wrote:

> Sometimes interface updates could happen in a way ifnotifier is not
> able to catch.  For example some heavy operations (device reset) in
> netdev-dpdk could require re-applying of the bridge configuration.
>
> For this purpose new manual notifier introduced. Its function
> 'if_notifier_manual_report()' could be called directly by the code
> that aware about changes.  This new notifier is thread-safe.
>
> Signed-off-by: Ilya Maximets 

 Reviewed and tested this patch in combination with the “netdev-dpdk: add
 support for the RTE_ETH_EVENT_INTR_RESET event”.

 LGTM,

 Acked-by: Eelco Chaudron 
>>>
>>> Is there anything waiting to get this included so my other patch could get
>>> included?
>>
>> Seems reasonable.  The only thing I noticed in it is two function
>> definitions where the function name should be moved to the beginning of
>> a line.
> 
> Thanks for spotting.  Will fix before applying.
> 
>>
>> Ilya, I'm assuming you'll apply it when you're satisfied?
> 
> I'm taking the last look at the dependent patch
> https://patchwork.ozlabs.org/patch/1192944/
> It seems OK so far and I'm going to apply both patches soon.
> 
> Best regards, Ilya Maximets.
> 

Thanks Eelco and Ben.  Fixed and applied.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v7 1/2] netdev-linux: Detect numa node id.

2019-12-17 Thread 0-day Robot
Bleep bloop.  Greetings Yi-Hung Wei, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line lacks whitespace around operator
#134 FILE: lib/netdev-linux.c:1419:
numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

Lines checked: 184, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v7 1/2] netdev-linux: Detect numa node id.

2019-12-17 Thread Yi-Hung Wei
From: William Tu 

The patch detects the numa node id from the name of the netdev,
by reading the '/sys/class/net//device/numa_node'.
If not available, ex: virtual device, or any error happens,
return numa id 0.  Currently only the afxdp netdev type uses it,
other linux netdev types are disabled due to no use case.

Signed-off-by: William Tu 
Acked-by: Eelco Chaudron 
---
v7:
  - Add numa aware memory allocation for afxdp related memory in the following
patch.
  - Travis CI: https://travis-ci.org/YiHungWei/ovs/builds/626403984

v6:
  Feedbacks from Ilya
  - add thread safety check at netdev_linux_get_numa_id__, and
pass netdev_linux
  - preserve numa cache on netlink updates
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/605634673

v5:
  Feedbacks from Ilya
  - reafactor the error handling
  - add mutex lock
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/601947245

v4:
  Feedbacks from Eelco
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/599308893

v3:
  Feedbacks from Ilya and Eelco
  - update doc, afxdp.rst
  - fix coding style
  - fix limit of numa node max, by using ovs_numa_numa_id_is_valid
  - move the function to netdev-linux
  - cache the result of numa_id
  - add security check for netdev name
  - use fscanf instead of read and convert to int
  - revise some error message content

v2:
  address feedback from Eelco
fix memory leak of xaspintf
log using INFO instead of WARN
---
 Documentation/intro/install/afxdp.rst |  1 -
 lib/netdev-afxdp.c| 11 -
 lib/netdev-afxdp.h|  1 -
 lib/netdev-linux-private.h|  2 +
 lib/netdev-linux.c| 67 +--
 5 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/Documentation/intro/install/afxdp.rst 
b/Documentation/intro/install/afxdp.rst
index 15e3c918f942..7b0736c96114 100644
--- a/Documentation/intro/install/afxdp.rst
+++ b/Documentation/intro/install/afxdp.rst
@@ -327,7 +327,6 @@ Below is a script using namespaces and veth peer::
 
 Limitations/Known Issues
 
-#. Device's numa ID is always 0, need a way to find numa id from a netdev.
 #. No QoS support because AF_XDP netdev by-pass the Linux TC layer. A possible
work-around is to use OpenFlow meter action.
 #. Most of the tests are done using i40e single port. Multiple ports and
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index ca2dfd005b9f..4c1f9c68270a 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -695,17 +695,6 @@ out:
 return err;
 }
 
-int
-netdev_afxdp_get_numa_id(const struct netdev *netdev)
-{
-/* FIXME: Get netdev's PCIe device ID, then find
- * its NUMA node id.
- */
-VLOG_INFO("FIXME: Device %s always use numa id 0.",
-  netdev_get_name(netdev));
-return 0;
-}
-
 static void
 xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode)
 {
diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
index 8188bc669526..246a4b62fb57 100644
--- a/lib/netdev-afxdp.h
+++ b/lib/netdev-afxdp.h
@@ -60,7 +60,6 @@ int netdev_afxdp_batch_send(struct netdev *netdev_, int qid,
 int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
 char **errp);
 int netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args);
-int netdev_afxdp_get_numa_id(const struct netdev *netdev);
 int netdev_afxdp_get_stats(const struct netdev *netdev_,
struct netdev_stats *stats);
 int netdev_afxdp_get_custom_stats(const struct netdev *netdev,
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index 8873caa9d412..4d66b0858222 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -96,6 +96,8 @@ struct netdev_linux {
 /* LAG information. */
 bool is_lag_master; /* True if the netdev is a LAG master. */
 
+int numa_id;/* NUMA node id. */
+
 #ifdef HAVE_AF_XDP
 /* AF_XDP information. */
 struct xsk_socket_info **xsks;
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index f8e59bacfb13..e32df970fe4b 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -236,6 +236,7 @@ enum {
 VALID_VPORT_STAT_ERROR  = 1 << 5,
 VALID_DRVINFO   = 1 << 6,
 VALID_FEATURES  = 1 << 7,
+VALID_NUMA_ID   = 1 << 8,
 };
 
 struct linux_lag_slave {
@@ -820,9 +821,9 @@ netdev_linux_update__(struct netdev_linux *dev,
 {
 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
 if (change->nlmsg_type == RTM_NEWLINK) {
-/* Keep drv-info, and ip addresses. */
+/* Keep drv-info, ip addresses, and NUMA id. */
 netdev_linux_changed(dev, change->ifi_flags,
- VALID_DRVINFO | VALID_IN);
+ VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);
 
 /* Update netdev from rtnl-change msg. */
 

[ovs-dev] [PATCH v7 2/2] netdev-afxdp: NUMA-aware memory allocation for XSK related memory

2019-12-17 Thread Yi-Hung Wei
Currently, the AF_XDP socket (XSK) related memory are allocated by main
thread in the main thread's NUMA domain.  With the patch that detects
netdev-linux's NUMA node id, the PMD thread of AF_XDP port will be run on
the AF_XDP netdev's NUMA domain.  If the net device's NUMA domain
is different from the main thread's NUMA domain, we will have two
cross-NUMA memory accesses (netdev <-> memory, memory <-> CPU).

This patch addresses the aforementioned issue by allocating
the memory in the net device's NUMA domain.

Signed-off-by: Yi-Hung Wei 
---
 Documentation/intro/install/afxdp.rst |  2 +-
 acinclude.m4  |  5 -
 include/sparse/automake.mk|  1 +
 include/sparse/numa.h | 27 +++
 lib/netdev-afxdp.c| 21 ++---
 5 files changed, 51 insertions(+), 5 deletions(-)
 create mode 100644 include/sparse/numa.h

diff --git a/Documentation/intro/install/afxdp.rst 
b/Documentation/intro/install/afxdp.rst
index 7b0736c96114..c4685fa7ebac 100644
--- a/Documentation/intro/install/afxdp.rst
+++ b/Documentation/intro/install/afxdp.rst
@@ -164,7 +164,7 @@ If a test case fails, check the log at::
 
 Setup AF_XDP netdev
 ---
-Before running OVS with AF_XDP, make sure the libbpf and libelf are
+Before running OVS with AF_XDP, make sure the libbpf, libelf, and libnuma are
 set-up right::
 
   ldd vswitchd/ovs-vswitchd
diff --git a/acinclude.m4 b/acinclude.m4
index 542637ac8cb8..73ed11d701aa 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -286,9 +286,12 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
 AC_CHECK_FUNCS([pthread_spin_lock], [],
   [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])])
 
+AC_CHECK_LIB(numa, numa_alloc_onnode, [],
+  [AC_MSG_ERROR([unable to find libnuma for AF_XDP support])])
+
 AC_DEFINE([HAVE_AF_XDP], [1],
   [Define to 1 if AF_XDP support is available and enabled.])
-LIBBPF_LDADD=" -lbpf -lelf"
+LIBBPF_LDADD=" -lbpf -lelf -lnuma"
 AC_SUBST([LIBBPF_LDADD])
 
 AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk
index 073631e8c082..974ad3fe55f7 100644
--- a/include/sparse/automake.mk
+++ b/include/sparse/automake.mk
@@ -5,6 +5,7 @@ noinst_HEADERS += \
 include/sparse/bits/floatn.h \
 include/sparse/assert.h \
 include/sparse/math.h \
+include/sparse/numa.h \
 include/sparse/netinet/in.h \
 include/sparse/netinet/ip6.h \
 include/sparse/netpacket/packet.h \
diff --git a/include/sparse/numa.h b/include/sparse/numa.h
new file mode 100644
index ..3691a0eaf729
--- /dev/null
+++ b/include/sparse/numa.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CHECKER__
+#error "Use this header only with sparse.  It is not a correct implementation."
+#endif
+
+/* Avoid sparse warning: non-ANSI function declaration of function" */
+#define numa_get_membind_compat() numa_get_membind_compat(void)
+#define numa_get_interleave_mask_compat() numa_get_interleave_mask_compat(void)
+#define numa_get_run_node_mask_compat() numa_get_run_node_mask_compat(void)
+
+/* Get actual  definitions for us to annotate and build on. */
+#include_next
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index 4c1f9c68270a..2d6f739b4b67 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -469,7 +470,7 @@ xsk_configure_all(struct netdev *netdev)
 {
 struct netdev_linux *dev = netdev_linux_cast(netdev);
 int i, ifindex, n_rxq, n_txq;
-int qid = 0;
+int qid = 0, err = 0;
 
 ifindex = linux_get_ifindex(netdev_get_name(netdev));
 
@@ -477,6 +478,14 @@ xsk_configure_all(struct netdev *netdev)
 ovs_assert(dev->tx_locks == NULL);
 
 n_rxq = netdev_n_rxq(netdev);
+
+/* Allocate all the xsk related memory in the netdev's NUMA domain. */
+struct bitmask *old_bm = numa_get_membind();
+struct bitmask *new_bm = numa_allocate_nodemask();
+netdev_get_numa_id(netdev);
+numa_bitmask_setbit(new_bm, dev->numa_id);
+numa_set_membind(new_bm);
+
 dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);
 
 if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) {
@@ -518,11 +527,17 @@ xsk_configure_all(struct netdev *netdev)

Re: [ovs-dev] [PATCH] socket-util: Introduce emulation and wrapper for recvmmsg().

2019-12-17 Thread Ben Pfaff
On Tue, Dec 17, 2019 at 04:51:35PM -0500, Aaron Conole wrote:
> Ben Pfaff  writes:
> 
> > Not every system will have recvmmsg(), so introduce compatibility code
> > that will allow it to be used blindly from the rest of the tree.
> >
> > This assumes that recvmmsg() and sendmmsg() are either both present or
> > both absent in system libraries and headers.
> >
> > CC: Yi Yang 
> > Signed-off-by: Ben Pfaff 
> > ---
> > I haven't actually tested this!
> 
> I have trouble understanding the motivation for this patch.  Will
> there be an attempt to rewrite netlink-socket or netdev-linux to use
> recvmmsg?  Simply adding this function with no new callers seems like
> introducing dead code.  I'd expect the next version to include some
> in-tree user.

This is relevant to Yi Yang's patch:
http://patchwork.ozlabs.org/patch/1204939/

I should have referred to it or put it in his thread, sorry.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] socket-util: Introduce emulation and wrapper for recvmmsg().

2019-12-17 Thread Aaron Conole
Ben Pfaff  writes:

> Not every system will have recvmmsg(), so introduce compatibility code
> that will allow it to be used blindly from the rest of the tree.
>
> This assumes that recvmmsg() and sendmmsg() are either both present or
> both absent in system libraries and headers.
>
> CC: Yi Yang 
> Signed-off-by: Ben Pfaff 
> ---
> I haven't actually tested this!

I have trouble understanding the motivation for this patch.  Will
there be an attempt to rewrite netlink-socket or netdev-linux to use
recvmmsg?  Simply adding this function with no new callers seems like
introducing dead code.  I'd expect the next version to include some
in-tree user.

>  include/sparse/sys/socket.h |  7 -
>  lib/socket-util.c   | 56 +
>  lib/socket-util.h   | 24 +---
>  3 files changed, 76 insertions(+), 11 deletions(-)
>
> diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h
> index 4178f57e2bda..6ff245ae939b 100644
> --- a/include/sparse/sys/socket.h
> +++ b/include/sparse/sys/socket.h
> @@ -27,6 +27,7 @@
>  
>  typedef unsigned short int sa_family_t;
>  typedef __socklen_t socklen_t;
> +struct timespec;
>  
>  struct sockaddr {
>  sa_family_t sa_family;
> @@ -126,7 +127,8 @@ enum {
>  MSG_PEEK,
>  MSG_TRUNC,
>  MSG_WAITALL,
> -MSG_DONTWAIT
> +MSG_DONTWAIT,
> +MSG_WAITFORONE
>  };
>  
>  enum {
> @@ -171,4 +173,7 @@ int sockatmark(int);
>  int socket(int, int, int);
>  int socketpair(int, int, int, int[2]);
>  
> +int sendmmsg(int, struct mmsghdr *, unsigned int, int);
> +int recvmmsg(int, struct mmsghdr *, unsigned int, int, struct timespec *);
> +
>  #endif /*  for sparse */
> diff --git a/lib/socket-util.c b/lib/socket-util.c
> index 6b7378de934b..f6f6f3b0a33f 100644
> --- a/lib/socket-util.c
> +++ b/lib/socket-util.c
> @@ -1283,3 +1283,59 @@ wrap_sendmmsg(int fd, struct mmsghdr *msgs, unsigned 
> int n, unsigned int flags)
>  }
>  #endif
>  #endif
> +
> +#ifndef _WIN32 /* Avoid using recvmsg on Windows entirely. */
> +static int
> +emulate_recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
> + int flags, struct timespec *timeout OVS_UNUSED)
> +{
> +ovs_assert(!timeout);   /* XXX not emulated */

Why not?  A really simple way of implementing it (albeit not a terribly
resource friendly one) would be to call recvmsg() with
  (flags | MSG_DONTWAIT)

Then decrement time, and if there's still time left in the timeout,
retry.  Alternately, setup an alarm.  There may be other methods to
achieve it.  The point being that we probably can accommodate this
timeout.

Otherwise, it should at least be documented that the behavior isn't the
same.  In that case, I'd prefer calling it 'ovs_recvmmsg' rather than
simply masquerading the function away.  That way, reading the code (and
authoring new code) the difference is obvious (and we could even
just... eliminate timeout as a parameter).

> +bool waitforone = flags & MSG_WAITFORONE;
> +flags &= ~MSG_WAITFORONE;
> +
> +for (unsigned int i = 0; i < n; i++) {
> +ssize_t retval = recvmsg(fd, &msgs[i].msg_hdr, flags);
> +if (retval < 0) {
> +return i ? i : retval;
> +}
> +msgs[i].msg_len = retval;
> +
> +if (waitforone) {
> +flags |= MSG_DONTWAIT;
> +}
> +}
> +return n;
> +}
> +
> +#ifndef HAVE_SENDMMSG
> +int
> +recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
> + int flags, struct timespec *timeout)
> +{
> +return emulate_recvmmsg(fd, msgs, n, flags, timeout);
> +}
> +#else
> +/* recvmmsg was redefined in lib/socket-util.c, should undef recvmmsg here

Shouldn't that read lib/socket-util.h ?  That also applies to the
sendmmsg comment ;)

> + * to avoid recursion */
> +#undef recvmmsg
> +int
> +wrap_recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
> +  int flags, struct timespec *timeout)
> +{
> +ovs_assert(!timeout);   /* XXX not emulated */

This doesn't make sense.  recvmmsg(..., NULL); is perfectly valid, and
this is supposed to be a wrapper?  On systems where it is valid to call,
why can't we use it?

> +static bool recvmmsg_broken = false;
> +if (!recvmmsg_broken) {
> +int save_errno = errno;
> +int retval = recvmmsg(fd, msgs, n, flags, timeout);
> +if (retval >= 0 || errno != ENOSYS) {

It's strange, no matter what retval is, we would return here.  The only
case we really care about is errno, yes?  I'm not asking for any change
at this line, it's just an observation.

> +return retval;
> +}
> +recvmmsg_broken = true;
> +errno = save_errno;
> +}
> +return emulate_recvmmsg(fd, msgs, n, flags, timeout);
> +}
> +#endif
> +#endif
> diff --git a/lib/socket-util.h b/lib/socket-util.h
> index a65433d90738..71bd68926aaa 100644
> --- a/lib/socket-util.h
> +++ b/lib/socket-util.h
> @@ -104,19 +104,20 @@ int make_unix_socket(int style, bool nonblock,

Re: [ovs-dev] [PATCH] tests: introduced test for checking "ovs-vsctl emer-reset"

2019-12-17 Thread Ben Pfaff
On Mon, Dec 09, 2019 at 02:26:43PM +0100, Damijan Skvarc wrote:
> Signed-off-by: Damijan Skvarc 

Thanks!  Applied to master.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V4 08/17] dpif-netdev: Update offloaded flows statistics

2019-12-17 Thread Ilya Maximets
On 17.12.2019 21:56, Eli Britstein wrote:
> 
> On 12/17/2019 9:07 PM, Ilya Maximets wrote:
>> On 17.12.2019 18:38, Eli Britstein wrote:
>>> On 12/16/2019 8:47 PM, Ilya Maximets wrote:
 3. New 'dp_layer' types has to be documented in dpctl man.
  Also, 'in_hw' doesn't look like a datapath layer name.
  Suggesting to use 'dpdk' as netdev-offload-tc uses 'tc'.
  We also could return specific dp_layer for partially offloaded
  flows, i.e. 'ovs-partial'.
>>> Thanks for the patch. I applied and tested. It works well.
>>>
>>> Re-thinking about the dp_layer and offloaded, I think it's wrong.
>>>
>>> The 'dp_layer' should be "ovs-dpdk" to differentiate from kernel module
>>> datapath "ovs", and denote the datapath is by dpdk.
>> I don't think we need to differentiate userspace and kernel datapaths
>> like this just because they are kind of same datapath layer, also,
>> you're dumping flows always from the specific datapath/bridge, i.e. you
>> know what is your datapath type.  Second point is that it definitely
>> should not be called as 'ovs-dpdk' because there might be no DPDK at all.
>> My suggestion is to have 'ovs' layer for all the usual non-offloadded
>> flows, 'tc' for flows offloaded to linux tc via netdev-offload-tc
>> and 'dpdk' for fully offloaded flows via netdev-offload-dpdk.
> 
> I think it's not very clear that the DP is changed because of 
> offloading. If we go ahead with this, so
> 
> there is no point make the offloaded flag with 3 states, as partial 
> offloaded can be noted by "ovs"
> 
> and "offloaded=yes". It can just be formatted this way in dump-flows 
> (offloaded=partial, dp: ovs).

Good point.  This might be easier to implement.

> 
>>
>>> It is true for any kind of offloading.
>>>
>>> This can be done in dp_netdev_flow_to_dpif_flow to handle offloading is
>>> off, and filled in netdev-offload-dpdk (to the same value).
>>>
>>> The "offloaded" flag can be enhanced to 3 states, instead of just
>>> boolean, as below (of course need to adapt throughout the code).
>>>
>>> So, we will be able to show "partial" or "yes" for "offloaded", in
>>> dpctl/dump-flows.
>> Yes, I think there were such idea in early discussion with Roni this year.
>> So, we could implement this.  For partially offloaded flows it might
>> make sense to have dp_layer = "ovs" since most of the processing happens
>> in the main userspace datapath.
>>
>>> Please comment.
>>>
>>>
>>> diff --git a/lib/dpif.h b/lib/dpif.h
>>> index d96f854a3..dba2f3cbf 100644
>>> --- a/lib/dpif.h
>>> +++ b/lib/dpif.h
>>> @@ -508,8 +508,14 @@ struct dpif_flow_detailed_stats {
>>>    uint16_t tcp_flags;
>>>    };
>>>
>>> +enum ol_state {
>>> +    OFFLOADED_STATE_OFF,
>>> +    OFFLOADED_STATE_ON,
>>> +    OFFLOADED_STATE_PARTIAL,
>> How about:
>>
>> enum dpif_flow_ol_state {
>>  DPIF_FLOW_OL_STATE_NOT_OFFLOADED, /* Flow fully handled in software. */
>>  DPIF_FLOW_OL_STATE_OFFLOADED, /* Flow fully handled in hardware. */
>>  DPIF_FLOW_OL_STATE_PARTIAL,   /* Flow matching handled in hardware. 
>> */
>> };
>>
>>> +};
>>> +
>>>    struct dpif_flow_attrs {
>>> -    bool offloaded; /* True if flow is offloaded to HW. */
>>> +    enum ol_state offloaded;
>> Some minimal comment would be nice here.
>>  enum dpif_flow_ol_state ol_state;  /* Status of HW offloading. */
>>
>>>    const char *dp_layer;   /* DP layer the flow is handled in. */
>>>    };
>>>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v8 1/1] Avoid dp_hash recirculation for balance-tcp bond selection mode

2019-12-17 Thread Ben Pfaff
On Tue, Dec 17, 2019 at 10:26:05PM +0100, Ilya Maximets wrote:
> On 17.12.2019 20:23, Ben Pfaff wrote:
> > Is there a reason to allow users to turn this off?  What is the downside
> > of enabling it?  Why is it disabled by default?  In general, OVS should
> > optimize itself to the extent it can rather than relying on a
> > knowledgeable user to tweak it.  If it's necessary to make it
> > configurable, then the documentation (in vswitch.xml) should explain why
> > one would want to turn it on or off and what the default is.
> 
> Hi Ben,
> I just wanted to add a few comments here.  There are few possibly unwanted
> things that appears while enabling the feature:
> 1. Bonding rebalancing triggers reload of PMD threads, that might affect
>performance of highly loaded threads.
> 2. While datapath supports lb_output action, netdev-offload provider might
>not support it.  This could be an issue.
> So, I think it's sane to have a configuration knob.  Not sure about disabling
> by default but it might make a bit of sense keeping this feature as 
> experimental
> for some time.  And yes, I agree  that documentation should be updated.

I didn't understand either of those points.  With those points, it makes
sense for it to be user configurable (and the documentation should be
clear about the tradeoffs).
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v8 1/1] Avoid dp_hash recirculation for balance-tcp bond selection mode

2019-12-17 Thread Ilya Maximets
On 17.12.2019 20:23, Ben Pfaff wrote:
> Thanks for the patch!
> 
> "sparse" reports some type errors:
> 
> ../ofproto/bond.c:357:30: error: incorrect type in assignment (different 
> base types)
> ../ofproto/bond.c:357:30:expected unsigned int
> ../ofproto/bond.c:357:30:got restricted ofp_port_t [usertype] ofp_port
> ../ofproto/ofproto-dpif.c:3454:56: error: incorrect type in argument 2 
> (different base types)
> ../ofproto/ofproto-dpif.c:3454:56:expected restricted ofp_port_t 
> [usertype]
> ../ofproto/ofproto-dpif.c:3454:56:got unsigned int [usertype]
> ../ofproto/ofproto-dpif.c:3453:31: error: incorrect type in assignment 
> (different base types)
> ../ofproto/ofproto-dpif.c:3453:31:expected unsigned int [usertype]
> ../ofproto/ofproto-dpif.c:3453:31:got restricted odp_port_t
> ../ofproto/ofproto-dpif-xlate.c:4208:39: error: incorrect type in 
> argument 3 (different base types)
> ../ofproto/ofproto-dpif-xlate.c:4208:39:expected restricted 
> odp_port_t [usertype] value
> ../ofproto/ofproto-dpif-xlate.c:4208:39:got unsigned int const 
> [usertype] recirc_id
> 
> I took a look.  The underlying issue is that code here mixes integers,
> ofp_port_t, and odp_port_t.  OVS uses "sparse" annotations to keep these
> from being confused, since they are different in important ways.  I
> spent some time working through the types here and appended a patch that
> fixes them up.  I also made a bunch of style updates throughout the
> code, which might obscure the relevant changes a bit; my apologies.
> 
> It seems odd that dpif_bond_*() succeed if the dpif doesn't support
> them.  Shouldn't they return an error by default, instead of 0?
> 
> Is there a reason to allow users to turn this off?  What is the downside
> of enabling it?  Why is it disabled by default?  In general, OVS should
> optimize itself to the extent it can rather than relying on a
> knowledgeable user to tweak it.  If it's necessary to make it
> configurable, then the documentation (in vswitch.xml) should explain why
> one would want to turn it on or off and what the default is.

Hi Ben,
I just wanted to add a few comments here.  There are few possibly unwanted
things that appears while enabling the feature:
1. Bonding rebalancing triggers reload of PMD threads, that might affect
   performance of highly loaded threads.
2. While datapath supports lb_output action, netdev-offload provider might
   not support it.  This could be an issue.
So, I think it's sane to have a configuration knob.  Not sure about disabling
by default but it might make a bit of sense keeping this feature as experimental
for some time.  And yes, I agree  that documentation should be updated.

> 
> This introduces a new capabilities flag, which should be documented with
> the other capabilities in vswitch.xml.
> 
> I'm appending a suggested diff to fold into your patch.
> 
> I did not do a full technical review for correctness.  I'll do that on
> the next revision.
> 
> Thanks,
> 
> Ben.
> 
> -8<--cut here-->8--
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 9a2befdcbc02..fa9ff73a4101 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -604,7 +604,7 @@ struct tx_port {
>  
>  /* Contained by struct tx_bond 'slave_buckets' */
>  struct slave_entry {
> -uint32_t slave_id;
> +odp_port_t slave_id;
>  atomic_ullong n_packets;
>  atomic_ullong n_bytes;
>  };
> @@ -1427,12 +1427,10 @@ dpif_netdev_dp_bond_show(struct unixctl_conn *conn, 
> int argc,
>   const char *argv[], void *aux OVS_UNUSED)
>  {
>  struct ds reply = DS_EMPTY_INITIALIZER;
> -struct dp_netdev *dp = NULL;
> -uint32_t bucket;
> -struct tx_bond *dp_bond_entry = NULL;
>  
>  ovs_mutex_lock(&dp_netdev_mutex);
>  
> +struct dp_netdev *dp = NULL;
>  if (argc == 2) {
>  dp = shash_find_data(&dp_netdevs, argv[1]);
>  } else if (shash_count(&dp_netdevs) == 1) {
> @@ -1446,10 +1444,12 @@ dpif_netdev_dp_bond_show(struct unixctl_conn *conn, 
> int argc,
>  return;
>  }
>  ds_put_cstr(&reply, "\nBonds:\n");
> +
> +struct tx_bond *dp_bond_entry;
>  HMAP_FOR_EACH (dp_bond_entry, node, &dp->bonds) {
>  ds_put_format(&reply, "\tbond-id %u :\n",
>dp_bond_entry->bond_id);
> -for (bucket = 0; bucket < BOND_BUCKETS; bucket++) {
> +for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
>  ds_put_format(&reply, "\t\tbucket %u - slave %u \n",
>bucket,
>dp_bond_entry->slave_buckets[bucket].slave_id);
> @@ -1735,7 +1735,6 @@ dp_netdev_free(struct dp_netdev *dp)
>  OVS_REQUIRES(dp_netdev_mutex)
>  {
>  struct dp_netdev_port *port, *next;
> -struct tx_bond *bond, *next_bond;
>  
>  shash_find_and_delete(&dp_netdevs, dp->name);
>  
> @@ -1746,6 +1745,7 @@ dp_netdev_free

Re: [ovs-dev] [PATCH v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

2019-12-17 Thread Ilya Maximets
On 17.12.2019 10:22, David Marchand wrote:
> On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets  wrote:
>>> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
>>>  return false;
>>>  }
>>>
>>> +static void
>>> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
>>> +struct svec *args)
>>> +{
>>> +const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
>>> +struct svec lcores = SVEC_EMPTY_INITIALIZER;
>>> +struct ovs_numa_info_core *core;
>>> +struct ovs_numa_dump *cores;
>>> +int index = 0;
>>> +
>>> +if (!cmask) {
>>> +return;
>>> +}
>>> +if (args_contains(args, "-c") || args_contains(args, "-l") ||
>>> +args_contains(args, "--lcores")) {
>>> +VLOG_WARN("Ignoring database defined option 
>>> 'dpdk-lcore-mask' "
>>> +  "due to dpdk-extra config");
>>> +return;
>>> +}
>>> +
>>> +cores = ovs_numa_dump_cores_with_cmask(cmask);
>>> +FOR_EACH_CORE_ON_DUMP(core, cores) {
>>> +svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
>>
>> What's the point of re-mapping these cores?
>> Just let DPDK to fail initialization and user will adjust cores.
>> This automatic re-mapping only complicates everything.
> 
> Deal with existing user of this parameter.
> Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432

So, why not just rebuild DPDK with more than 128 cores in config
since you're going to update OVS package for this issue?
I bet kernel is built with NR_CPUS equal to something like 8192.

<"support up to 128 threads" limitation> doesn't sound feasible
as you can't actually enforce that.

'--lcores' option made DPDK really confusing, IMHO.
Looking at the DPDK API and wondering how the rte_lcore_to_cpu_id()
should work and what it returns if lcore to cpu relation is one to
many...  I'm lost.

BTW, looking at the cpu mask in the bug, what are these 8 cores
(4 cores from each numa) from the lcore mask are supposed to do?

> 
> 
>>
>> IMHO, user should not configure DPDK cores at all.  So, it's users' problem
>> if cores configured incorrectly.
> 
> If the user explicitly had set a dpdk option, ok, this is his problem.
> But here, this is an OVS option, handling this internally makes more
> sense to me.

Sounds like another reason to deprecate and remove those configuration knobs:
https://patchwork.ozlabs.org/project/openvswitch/list/?series=89529&state=*

> 
> 
>>> +index++;
>>> +}
>>> +svec_terminate(&lcores);
>>> +ovs_numa_dump_destroy(cores);
>>> +svec_add(args, "--lcores");
>>> +svec_add_nocopy(args, svec_join(&lcores, ",", ""));
>>> +svec_destroy(&lcores);
>>> +}
>>> +
>>>  static void
>>>  construct_dpdk_options(const struct smap *ovs_other_config, struct svec 
>>> *args)
>>>  {
>>> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap 
>>> *ovs_other_config, struct svec *args)
>>>  bool default_enabled;
>>>  const char *default_value;
>>>  } opts[] = {
>>> -{"dpdk-lcore-mask",   "-c", false, NULL},
>>>  {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
>>>  {"dpdk-socket-limit", "--socket-limit", false, NULL},
>>>  };
>>> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap 
>>> *ovs_other_config, struct svec *args)
>>>  svec_parse_words(args, extra_configuration);
>>>  }
>>>
>>> +construct_dpdk_lcore_option(ovs_other_config, args);
>>>  construct_dpdk_options(ovs_other_config, args);
>>>  construct_dpdk_mutex_options(ovs_other_config, args);
>>>  }
>>> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
>>>  .write = dpdk_log_write,
>>>  };
>>>
>>> +static void
>>> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
>>> +{
>>> +struct svec cores = SVEC_EMPTY_INITIALIZER;
>>> +rte_cpuset_t cpuset;
>>> +unsigned cpu;
>>> +
>>> +cpuset = rte_lcore_cpuset(lcore);
>>> +for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
>>> +if (!CPU_ISSET(cpu, &cpuset)) {
>>> +continue;
>>> +}
>>> +svec_add_nocopy(&cores, xasprintf("%u", cpu));
>>> +}> +svec_terminate(&cores);
>>> +ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
>>> +  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
>>> +  svec_join(&cores, ",", ""));
>>> +svec_destroy(&cores);
>>> +}
>>> +
>>> +static void
>>> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
>>> + void *aux OVS_UNUSED)
>>> +{
>>> +struct ds ds = DS_EMPTY_INITIALIZER;
>>> +unsigned lcore;
>>> +
>>> +ovs_mutex_lock(&lcore_bitmap_mutex);
>>> +if (lcore_bitmap == NULL) {
>>> +unixctl_command_reply_error(conn, "DPDK has not been initialised");
>>> +goto out;
>>> +}
>>> +if (argc > 1) {
>>> +if (!str_to_uint(argv[1], 0, &lcore) || lcore >=

Re: [ovs-dev] [PATCH] socket-util: Introduce emulation and wrapper for recvmmsg().

2019-12-17 Thread 0-day Robot
Bleep bloop.  Greetings Ben Pfaff, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Comment with 'xxx' marker
#65 FILE: lib/socket-util.c:1293:
ovs_assert(!timeout);   /* XXX not emulated */

WARNING: Comment with 'xxx' marker
#99 FILE: lib/socket-util.c:1327:
ovs_assert(!timeout);   /* XXX not emulated */

Lines checked: 165, Warnings: 2, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V4 08/17] dpif-netdev: Update offloaded flows statistics

2019-12-17 Thread Eli Britstein

On 12/17/2019 9:07 PM, Ilya Maximets wrote:
> On 17.12.2019 18:38, Eli Britstein wrote:
>> On 12/16/2019 8:47 PM, Ilya Maximets wrote:
>>> 3. New 'dp_layer' types has to be documented in dpctl man.
>>>  Also, 'in_hw' doesn't look like a datapath layer name.
>>>  Suggesting to use 'dpdk' as netdev-offload-tc uses 'tc'.
>>>  We also could return specific dp_layer for partially offloaded
>>>  flows, i.e. 'ovs-partial'.
>> Thanks for the patch. I applied and tested. It works well.
>>
>> Re-thinking about the dp_layer and offloaded, I think it's wrong.
>>
>> The 'dp_layer' should be "ovs-dpdk" to differentiate from kernel module
>> datapath "ovs", and denote the datapath is by dpdk.
> I don't think we need to differentiate userspace and kernel datapaths
> like this just because they are kind of same datapath layer, also,
> you're dumping flows always from the specific datapath/bridge, i.e. you
> know what is your datapath type.  Second point is that it definitely
> should not be called as 'ovs-dpdk' because there might be no DPDK at all.
> My suggestion is to have 'ovs' layer for all the usual non-offloadded
> flows, 'tc' for flows offloaded to linux tc via netdev-offload-tc
> and 'dpdk' for fully offloaded flows via netdev-offload-dpdk.

I think it's not very clear that the DP is changed because of 
offloading. If we go ahead with this, so

there is no point make the offloaded flag with 3 states, as partial 
offloaded can be noted by "ovs"

and "offloaded=yes". It can just be formatted this way in dump-flows 
(offloaded=partial, dp: ovs).

>
>> It is true for any kind of offloading.
>>
>> This can be done in dp_netdev_flow_to_dpif_flow to handle offloading is
>> off, and filled in netdev-offload-dpdk (to the same value).
>>
>> The "offloaded" flag can be enhanced to 3 states, instead of just
>> boolean, as below (of course need to adapt throughout the code).
>>
>> So, we will be able to show "partial" or "yes" for "offloaded", in
>> dpctl/dump-flows.
> Yes, I think there were such idea in early discussion with Roni this year.
> So, we could implement this.  For partially offloaded flows it might
> make sense to have dp_layer = "ovs" since most of the processing happens
> in the main userspace datapath.
>
>> Please comment.
>>
>>
>> diff --git a/lib/dpif.h b/lib/dpif.h
>> index d96f854a3..dba2f3cbf 100644
>> --- a/lib/dpif.h
>> +++ b/lib/dpif.h
>> @@ -508,8 +508,14 @@ struct dpif_flow_detailed_stats {
>>    uint16_t tcp_flags;
>>    };
>>
>> +enum ol_state {
>> +    OFFLOADED_STATE_OFF,
>> +    OFFLOADED_STATE_ON,
>> +    OFFLOADED_STATE_PARTIAL,
> How about:
>
> enum dpif_flow_ol_state {
>  DPIF_FLOW_OL_STATE_NOT_OFFLOADED, /* Flow fully handled in software. */
>  DPIF_FLOW_OL_STATE_OFFLOADED, /* Flow fully handled in hardware. */
>  DPIF_FLOW_OL_STATE_PARTIAL,   /* Flow matching handled in hardware. 
> */
> };
>
>> +};
>> +
>>    struct dpif_flow_attrs {
>> -    bool offloaded; /* True if flow is offloaded to HW. */
>> +    enum ol_state offloaded;
> Some minimal comment would be nice here.
>  enum dpif_flow_ol_state ol_state;  /* Status of HW offloading. */
>
>>    const char *dp_layer;   /* DP layer the flow is handled in. */
>>    };
>>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] socket-util: Introduce emulation and wrapper for recvmmsg().

2019-12-17 Thread Ben Pfaff
Not every system will have recvmmsg(), so introduce compatibility code
that will allow it to be used blindly from the rest of the tree.

This assumes that recvmmsg() and sendmmsg() are either both present or
both absent in system libraries and headers.

CC: Yi Yang 
Signed-off-by: Ben Pfaff 
---
I haven't actually tested this!

 include/sparse/sys/socket.h |  7 -
 lib/socket-util.c   | 56 +
 lib/socket-util.h   | 24 +---
 3 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h
index 4178f57e2bda..6ff245ae939b 100644
--- a/include/sparse/sys/socket.h
+++ b/include/sparse/sys/socket.h
@@ -27,6 +27,7 @@
 
 typedef unsigned short int sa_family_t;
 typedef __socklen_t socklen_t;
+struct timespec;
 
 struct sockaddr {
 sa_family_t sa_family;
@@ -126,7 +127,8 @@ enum {
 MSG_PEEK,
 MSG_TRUNC,
 MSG_WAITALL,
-MSG_DONTWAIT
+MSG_DONTWAIT,
+MSG_WAITFORONE
 };
 
 enum {
@@ -171,4 +173,7 @@ int sockatmark(int);
 int socket(int, int, int);
 int socketpair(int, int, int, int[2]);
 
+int sendmmsg(int, struct mmsghdr *, unsigned int, int);
+int recvmmsg(int, struct mmsghdr *, unsigned int, int, struct timespec *);
+
 #endif /*  for sparse */
diff --git a/lib/socket-util.c b/lib/socket-util.c
index 6b7378de934b..f6f6f3b0a33f 100644
--- a/lib/socket-util.c
+++ b/lib/socket-util.c
@@ -1283,3 +1283,59 @@ wrap_sendmmsg(int fd, struct mmsghdr *msgs, unsigned int 
n, unsigned int flags)
 }
 #endif
 #endif
+
+#ifndef _WIN32 /* Avoid using recvmsg on Windows entirely. */
+static int
+emulate_recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
+ int flags, struct timespec *timeout OVS_UNUSED)
+{
+ovs_assert(!timeout);   /* XXX not emulated */
+
+bool waitforone = flags & MSG_WAITFORONE;
+flags &= ~MSG_WAITFORONE;
+
+for (unsigned int i = 0; i < n; i++) {
+ssize_t retval = recvmsg(fd, &msgs[i].msg_hdr, flags);
+if (retval < 0) {
+return i ? i : retval;
+}
+msgs[i].msg_len = retval;
+
+if (waitforone) {
+flags |= MSG_DONTWAIT;
+}
+}
+return n;
+}
+
+#ifndef HAVE_SENDMMSG
+int
+recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
+ int flags, struct timespec *timeout)
+{
+return emulate_recvmmsg(fd, msgs, n, flags, timeout);
+}
+#else
+/* recvmmsg was redefined in lib/socket-util.c, should undef recvmmsg here
+ * to avoid recursion */
+#undef recvmmsg
+int
+wrap_recvmmsg(int fd, struct mmsghdr *msgs, unsigned int n,
+  int flags, struct timespec *timeout)
+{
+ovs_assert(!timeout);   /* XXX not emulated */
+
+static bool recvmmsg_broken = false;
+if (!recvmmsg_broken) {
+int save_errno = errno;
+int retval = recvmmsg(fd, msgs, n, flags, timeout);
+if (retval >= 0 || errno != ENOSYS) {
+return retval;
+}
+recvmmsg_broken = true;
+errno = save_errno;
+}
+return emulate_recvmmsg(fd, msgs, n, flags, timeout);
+}
+#endif
+#endif
diff --git a/lib/socket-util.h b/lib/socket-util.h
index a65433d90738..71bd68926aaa 100644
--- a/lib/socket-util.h
+++ b/lib/socket-util.h
@@ -104,19 +104,20 @@ int make_unix_socket(int style, bool nonblock,
  const char *bind_path, const char *connect_path);
 int get_unix_name_len(const struct sockaddr_un *sun, socklen_t sun_len);
 
-/* Universal sendmmsg support.
+/* Universal sendmmsg and recvmmsg support.
  *
- * Some platforms, such as new enough Linux and FreeBSD, support sendmmsg, but
- * other platforms (or older ones) do not.  We add the following infrastructure
- * to allow all code to use sendmmsg, regardless of platform support:
+ * Some platforms, such as new enough Linux and FreeBSD, support sendmmsg and
+ * recvmmsg, but other platforms (or older ones) do not.  We add the following
+ * infrastructure to allow all code to use sendmmsg and recvmmsg, regardless of
+ * platform support:
  *
- *   - For platforms that lack sendmmsg entirely, we emulate it.
+ * - For platforms that lack these functions entirely, we emulate them.
  *
- *   - Some platforms have sendmmsg() in the C library but not in the kernel.
- * For example, this is true if a Linux system has a newer glibc with an
- * old kernel.  To compensate, even if sendmmsg() appears to be available,
- * we still wrap it with a handler that uses our emulation if sendmmsg()
- * returns ENOSYS.
+ * - Some platforms have sendmmsg() and recvmmsg() in the C library but not in
+ *   the kernel.  For example, this is true if a Linux system has a newer glibc
+ *   with an old kernel.  To compensate, even if these functions appear to be
+ *   available, we still wrap them with handlers that uses our emulation if the
+ *   underlying function returns ENOSYS.
  */
 #ifndef HAVE_STRUCT_MMSGHDR_MSG_LEN
 struct mmsghdr {
@@ -126,9 +127,12 @@ stru

Re: [ovs-dev] [PATCH] Use batch process recv for tap and raw socket in netdev datapath

2019-12-17 Thread Ben Pfaff
On Fri, Dec 06, 2019 at 02:09:24AM -0500, yang_y...@163.com wrote:
> From: Yi Yang 
> 
> Current netdev_linux_rxq_recv_tap and netdev_linux_rxq_recv_sock
> just receive single packet, that is very inefficient, per my test
> case which adds two tap ports or veth ports into OVS bridge
> (datapath_type=netdev) and use iperf3 to do performance test
> between two ports (they are set into different network name space).

Thanks for the patch!  This is an impressive performance improvement!

Each call to netdev_linux_batch_rxq_recv_sock() now calls malloc() 32
times.  This is expensive if only a few packets (or none) are received.
Maybe it doesn't matter, but I wonder whether it affects performance.

I think that no packets are freed on error.  Fix:

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 9cb45d5c7d29..3414a6495ced 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1198,6 +1198,7 @@ netdev_linux_batch_rxq_recv_sock(int fd, int mtu,
 if (retval < 0) {
 /* Save -errno to retval temporarily */
 retval = -errno;
+i = 0;
 goto free_buffers;
 }
 

To get sparse to work, one must fold in the following:

diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h
index 4178f57e2bda..e954ade714b5 100644
--- a/include/sparse/sys/socket.h
+++ b/include/sparse/sys/socket.h
@@ -27,6 +27,7 @@
 
 typedef unsigned short int sa_family_t;
 typedef __socklen_t socklen_t;
+struct timespec;
 
 struct sockaddr {
 sa_family_t sa_family;
@@ -171,4 +172,7 @@ int sockatmark(int);
 int socket(int, int, int);
 int socketpair(int, int, int, int[2]);
 
+int sendmmsg(int, struct mmsghdr *, unsigned int, int);
+int recvmmsg(int, struct mmsghdr *, unsigned int, int, struct timespec *);
+
 #endif /*  for sparse */
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v8 1/1] Avoid dp_hash recirculation for balance-tcp bond selection mode

2019-12-17 Thread Ben Pfaff
Thanks for the patch!

"sparse" reports some type errors:

../ofproto/bond.c:357:30: error: incorrect type in assignment (different 
base types)
../ofproto/bond.c:357:30:expected unsigned int
../ofproto/bond.c:357:30:got restricted ofp_port_t [usertype] ofp_port
../ofproto/ofproto-dpif.c:3454:56: error: incorrect type in argument 2 
(different base types)
../ofproto/ofproto-dpif.c:3454:56:expected restricted ofp_port_t 
[usertype]
../ofproto/ofproto-dpif.c:3454:56:got unsigned int [usertype]
../ofproto/ofproto-dpif.c:3453:31: error: incorrect type in assignment 
(different base types)
../ofproto/ofproto-dpif.c:3453:31:expected unsigned int [usertype]
../ofproto/ofproto-dpif.c:3453:31:got restricted odp_port_t
../ofproto/ofproto-dpif-xlate.c:4208:39: error: incorrect type in argument 
3 (different base types)
../ofproto/ofproto-dpif-xlate.c:4208:39:expected restricted odp_port_t 
[usertype] value
../ofproto/ofproto-dpif-xlate.c:4208:39:got unsigned int const 
[usertype] recirc_id

I took a look.  The underlying issue is that code here mixes integers,
ofp_port_t, and odp_port_t.  OVS uses "sparse" annotations to keep these
from being confused, since they are different in important ways.  I
spent some time working through the types here and appended a patch that
fixes them up.  I also made a bunch of style updates throughout the
code, which might obscure the relevant changes a bit; my apologies.

It seems odd that dpif_bond_*() succeed if the dpif doesn't support
them.  Shouldn't they return an error by default, instead of 0?

Is there a reason to allow users to turn this off?  What is the downside
of enabling it?  Why is it disabled by default?  In general, OVS should
optimize itself to the extent it can rather than relying on a
knowledgeable user to tweak it.  If it's necessary to make it
configurable, then the documentation (in vswitch.xml) should explain why
one would want to turn it on or off and what the default is.

This introduces a new capabilities flag, which should be documented with
the other capabilities in vswitch.xml.

I'm appending a suggested diff to fold into your patch.

I did not do a full technical review for correctness.  I'll do that on
the next revision.

Thanks,

Ben.

-8<--cut here-->8--

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9a2befdcbc02..fa9ff73a4101 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -604,7 +604,7 @@ struct tx_port {
 
 /* Contained by struct tx_bond 'slave_buckets' */
 struct slave_entry {
-uint32_t slave_id;
+odp_port_t slave_id;
 atomic_ullong n_packets;
 atomic_ullong n_bytes;
 };
@@ -1427,12 +1427,10 @@ dpif_netdev_dp_bond_show(struct unixctl_conn *conn, int 
argc,
  const char *argv[], void *aux OVS_UNUSED)
 {
 struct ds reply = DS_EMPTY_INITIALIZER;
-struct dp_netdev *dp = NULL;
-uint32_t bucket;
-struct tx_bond *dp_bond_entry = NULL;
 
 ovs_mutex_lock(&dp_netdev_mutex);
 
+struct dp_netdev *dp = NULL;
 if (argc == 2) {
 dp = shash_find_data(&dp_netdevs, argv[1]);
 } else if (shash_count(&dp_netdevs) == 1) {
@@ -1446,10 +1444,12 @@ dpif_netdev_dp_bond_show(struct unixctl_conn *conn, int 
argc,
 return;
 }
 ds_put_cstr(&reply, "\nBonds:\n");
+
+struct tx_bond *dp_bond_entry;
 HMAP_FOR_EACH (dp_bond_entry, node, &dp->bonds) {
 ds_put_format(&reply, "\tbond-id %u :\n",
   dp_bond_entry->bond_id);
-for (bucket = 0; bucket < BOND_BUCKETS; bucket++) {
+for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
 ds_put_format(&reply, "\t\tbucket %u - slave %u \n",
   bucket,
   dp_bond_entry->slave_buckets[bucket].slave_id);
@@ -1735,7 +1735,6 @@ dp_netdev_free(struct dp_netdev *dp)
 OVS_REQUIRES(dp_netdev_mutex)
 {
 struct dp_netdev_port *port, *next;
-struct tx_bond *bond, *next_bond;
 
 shash_find_and_delete(&dp_netdevs, dp->name);
 
@@ -1746,6 +1745,7 @@ dp_netdev_free(struct dp_netdev *dp)
 ovs_mutex_unlock(&dp->port_mutex);
 
 ovs_mutex_lock(&dp->bond_mutex);
+struct tx_bond *bond, *next_bond;
 HMAP_FOR_EACH_SAFE (bond, next_bond, node, &dp->bonds) {
 hmap_remove(&dp->bonds, &bond->node);
 free(bond);
@@ -4933,11 +4933,10 @@ pmd_remove_stale_bonds(struct dp_netdev *dp,
 OVS_EXCLUDED(pmd->bond_mutex)
 OVS_EXCLUDED(dp->bond_mutex)
 {
-struct tx_bond *tx, *tx_next;
-
 ovs_mutex_lock(&dp->bond_mutex);
 ovs_mutex_lock(&pmd->bond_mutex);
 
+struct tx_bond *tx, *tx_next;
 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_bonds) {
 if (!tx_bond_lookup(&dp->bonds, tx->bond_id)) {
 dp_netdev_del_bond_tx_from_pmd(pmd, tx);
@@ -4958,7 +4957,6 @@ reconfigure_datapath(struct dp_netdev *dp)
 struct hmapx busy_threads = HMAPX_INITIALI

Re: [ovs-dev] [PATCH V4 08/17] dpif-netdev: Update offloaded flows statistics

2019-12-17 Thread Ilya Maximets
On 17.12.2019 18:38, Eli Britstein wrote:
> 
> On 12/16/2019 8:47 PM, Ilya Maximets wrote:
>> 3. New 'dp_layer' types has to be documented in dpctl man.
>> Also, 'in_hw' doesn't look like a datapath layer name.
>> Suggesting to use 'dpdk' as netdev-offload-tc uses 'tc'.
>> We also could return specific dp_layer for partially offloaded
>> flows, i.e. 'ovs-partial'.
> 
> Thanks for the patch. I applied and tested. It works well.
> 
> Re-thinking about the dp_layer and offloaded, I think it's wrong.
> 
> The 'dp_layer' should be "ovs-dpdk" to differentiate from kernel module 
> datapath "ovs", and denote the datapath is by dpdk.

I don't think we need to differentiate userspace and kernel datapaths
like this just because they are kind of same datapath layer, also,
you're dumping flows always from the specific datapath/bridge, i.e. you
know what is your datapath type.  Second point is that it definitely
should not be called as 'ovs-dpdk' because there might be no DPDK at all.
My suggestion is to have 'ovs' layer for all the usual non-offloadded
flows, 'tc' for flows offloaded to linux tc via netdev-offload-tc
and 'dpdk' for fully offloaded flows via netdev-offload-dpdk.

> 
> It is true for any kind of offloading.
> 
> This can be done in dp_netdev_flow_to_dpif_flow to handle offloading is 
> off, and filled in netdev-offload-dpdk (to the same value).
> 
> The "offloaded" flag can be enhanced to 3 states, instead of just 
> boolean, as below (of course need to adapt throughout the code).
> 
> So, we will be able to show "partial" or "yes" for "offloaded", in 
> dpctl/dump-flows.

Yes, I think there were such idea in early discussion with Roni this year.
So, we could implement this.  For partially offloaded flows it might
make sense to have dp_layer = "ovs" since most of the processing happens
in the main userspace datapath.

> 
> Please comment.
> 
> 
> diff --git a/lib/dpif.h b/lib/dpif.h
> index d96f854a3..dba2f3cbf 100644
> --- a/lib/dpif.h
> +++ b/lib/dpif.h
> @@ -508,8 +508,14 @@ struct dpif_flow_detailed_stats {
>   uint16_t tcp_flags;
>   };
> 
> +enum ol_state {
> +    OFFLOADED_STATE_OFF,
> +    OFFLOADED_STATE_ON,
> +    OFFLOADED_STATE_PARTIAL,

How about:

enum dpif_flow_ol_state {
DPIF_FLOW_OL_STATE_NOT_OFFLOADED, /* Flow fully handled in software. */
DPIF_FLOW_OL_STATE_OFFLOADED, /* Flow fully handled in hardware. */
DPIF_FLOW_OL_STATE_PARTIAL,   /* Flow matching handled in hardware. */
};

> +};
> +
>   struct dpif_flow_attrs {
> -    bool offloaded; /* True if flow is offloaded to HW. */
> +    enum ol_state offloaded;

Some minimal comment would be nice here.
enum dpif_flow_ol_state ol_state;  /* Status of HW offloading. */

>   const char *dp_layer;   /* DP layer the flow is handled in. */
>   };
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] bridge: Allow manual notifications about interfaces' updates.

2019-12-17 Thread Ilya Maximets
On 17.12.2019 19:25, Ben Pfaff wrote:
> On Tue, Dec 17, 2019 at 02:45:03PM +0100, Eelco Chaudron wrote:
>>
>>
>> On 6 Nov 2019, at 14:32, Eelco Chaudron wrote:
>>
>>> On 5 Nov 2019, at 18:20, Ilya Maximets wrote:
>>>
 Sometimes interface updates could happen in a way ifnotifier is not
 able to catch.  For example some heavy operations (device reset) in
 netdev-dpdk could require re-applying of the bridge configuration.

 For this purpose new manual notifier introduced. Its function
 'if_notifier_manual_report()' could be called directly by the code
 that aware about changes.  This new notifier is thread-safe.

 Signed-off-by: Ilya Maximets 
>>>
>>> Reviewed and tested this patch in combination with the “netdev-dpdk: add
>>> support for the RTE_ETH_EVENT_INTR_RESET event”.
>>>
>>> LGTM,
>>>
>>> Acked-by: Eelco Chaudron 
>>
>> Is there anything waiting to get this included so my other patch could get
>> included?
> 
> Seems reasonable.  The only thing I noticed in it is two function
> definitions where the function name should be moved to the beginning of
> a line.

Thanks for spotting.  Will fix before applying.

> 
> Ilya, I'm assuming you'll apply it when you're satisfied?

I'm taking the last look at the dependent patch
https://patchwork.ozlabs.org/patch/1192944/
It seems OK so far and I'm going to apply both patches soon.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] bridge: Allow manual notifications about interfaces' updates.

2019-12-17 Thread Ben Pfaff
On Tue, Dec 17, 2019 at 02:45:03PM +0100, Eelco Chaudron wrote:
> 
> 
> On 6 Nov 2019, at 14:32, Eelco Chaudron wrote:
> 
> > On 5 Nov 2019, at 18:20, Ilya Maximets wrote:
> > 
> > > Sometimes interface updates could happen in a way ifnotifier is not
> > > able to catch.  For example some heavy operations (device reset) in
> > > netdev-dpdk could require re-applying of the bridge configuration.
> > > 
> > > For this purpose new manual notifier introduced. Its function
> > > 'if_notifier_manual_report()' could be called directly by the code
> > > that aware about changes.  This new notifier is thread-safe.
> > > 
> > > Signed-off-by: Ilya Maximets 
> > 
> > Reviewed and tested this patch in combination with the “netdev-dpdk: add
> > support for the RTE_ETH_EVENT_INTR_RESET event”.
> > 
> > LGTM,
> > 
> > Acked-by: Eelco Chaudron 
> 
> Is there anything waiting to get this included so my other patch could get
> included?

Seems reasonable.  The only thing I noticed in it is two function
definitions where the function name should be moved to the beginning of
a line.

Ilya, I'm assuming you'll apply it when you're satisfied?
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V4 08/17] dpif-netdev: Update offloaded flows statistics

2019-12-17 Thread Eli Britstein

On 12/16/2019 8:47 PM, Ilya Maximets wrote:
> 3. New 'dp_layer' types has to be documented in dpctl man.
> Also, 'in_hw' doesn't look like a datapath layer name.
> Suggesting to use 'dpdk' as netdev-offload-tc uses 'tc'.
> We also could return specific dp_layer for partially offloaded
> flows, i.e. 'ovs-partial'.

Thanks for the patch. I applied and tested. It works well.

Re-thinking about the dp_layer and offloaded, I think it's wrong.

The 'dp_layer' should be "ovs-dpdk" to differentiate from kernel module 
datapath "ovs", and denote the datapath is by dpdk.

It is true for any kind of offloading.

This can be done in dp_netdev_flow_to_dpif_flow to handle offloading is 
off, and filled in netdev-offload-dpdk (to the same value).

The "offloaded" flag can be enhanced to 3 states, instead of just 
boolean, as below (of course need to adapt throughout the code).

So, we will be able to show "partial" or "yes" for "offloaded", in 
dpctl/dump-flows.

Please comment.


diff --git a/lib/dpif.h b/lib/dpif.h
index d96f854a3..dba2f3cbf 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -508,8 +508,14 @@ struct dpif_flow_detailed_stats {
  uint16_t tcp_flags;
  };

+enum ol_state {
+    OFFLOADED_STATE_OFF,
+    OFFLOADED_STATE_ON,
+    OFFLOADED_STATE_PARTIAL,
+};
+
  struct dpif_flow_attrs {
-    bool offloaded; /* True if flow is offloaded to HW. */
+    enum ol_state offloaded;
  const char *dp_layer;   /* DP layer the flow is handled in. */
  };

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v2] ovn-northd: ls_*_acl behavior notconsistent for untracked flows

2019-12-17 Thread venugopal iyer via dev
 Would appreciate any thoughts on this.
thanks,
-venu

On Tuesday, November 19, 2019, 12:40:47 PM PST,  
wrote:  
 
 From: venu iyer 

If one creates a port group and a MAC address set, and an
ACL that prevents packets being output to a port in that Port Group from
any MAC address in that address set, the outcome is not consistent.

The outcome depends on whether there is a stateful rule on the switch or not.

Specifically:

Assuming 'l2pg' is a port group with a list of ports and 'macs' is an Address
Set with a list of MAC addresses and the intent is to drop all packets
with source MAC address in 'macs' to any port in 'l2pg' using:

ovn-nbctl acl-add  to-lport 5000 \
    "outport == @l2pg && eth.src == $macs" drop

Without any stateful rule on the logical switch, the corresponding
logical flow looks like:
    table=4 (ls_out_acl        ), priority=6000,\
        match=(outport == @l2pg && eth.src == $macs), \
        action=(/* drop */)

Based on this rule, any packet destined to the ports in 'l2pg' with source
Address in 'macs' will be dropped - as is expected from the ACL above.

While with a Stateful rule on the switch (any stateful rule will do),
the same rule looks like:
    table=4 (ls_out_acl        ), priority=6000, \
        match=((!ct.est || (ct.est && ct_label.blocked == 1)) && \
        (outport == @l2pg && eth.src == $macs)), action=(/* drop */)

With this, however, only packets that are tracked will match the rule
and be dropped, e.g. IP packets will be dropped, but ARP etc., will go
through - this is not expected.

Based on whether there are stateful rules or not on the switch,
untracked packets will see different behavior.

The fix is to make the rule in the stateful case comprehensive, i.e.
instead of just looking for flows that are not established (or not new),
we should also look for flows that are not tracked.

The fix was tested in the above scenario. Additionally, the following
ACL was added to test the change in the "allow" case (i.e. to drop
all the packets based on the above ACL, but have a higher priority
rule that selectively allow ARP).

ovn-nbctl acl-add ls1 to-lport 6000
        "outport == @l2pg && eth.type == 0x806" allow

with and without the stateful rule to make sure the behavior is the
same.  The test suite has been enhanced to add the above test cases
(with different ethertype) for drop and allow.

OVN test cases were run with this fix and no failures were seen.

Signed-off-by: venu iyer 
---
 northd/ovn-northd.c |  13 +--
 tests/ovn.at        | 206 
 2 files changed, 213 insertions(+), 6 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index a5e711e69..69ba85321 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -4885,12 +4885,12 @@ consider_acl(struct hmap *lflows, struct ovn_datapath 
*od,
              * deletion.  There is no need to commit here, so we can just
              * proceed to the next table. We use this to ensure that this
              * connection is still allowed by the currently defined
-            * policy. */
+            * policy. Match untracked packets too. */
            ds_clear(&match);
            ds_clear(&actions);
            ds_put_format(&match,
-                          "!ct.new && ct.est && !ct.rpl"
-                          " && ct_label.blocked == 0 && (%s)",
+                          "(!ct.trk || (!ct.new && ct.est && !ct.rpl"
+                          " && ct_label.blocked == 0)) && (%s)",
                          acl->match);
 
            build_acl_log(&actions, acl);
@@ -4913,10 +4913,11 @@ consider_acl(struct hmap *lflows, struct ovn_datapath 
*od,
          * depending on whether the connection was previously committed
          * to the connection tracker with ct_commit. */
        if (has_stateful) {
-            /* If the packet is not part of an established connection, then
-            * we can simply reject/drop it. */
+            /* If the packet is not tracked or part of an established
+            * connection, then we can simply reject/drop it. */
            ds_put_cstr(&match,
-                        "(!ct.est || (ct.est && ct_label.blocked == 1))");
+                        "(!ct.trk || !ct.est"
+                        " || (ct.est && ct_label.blocked == 1))");
            if (!strcmp(acl->action, "reject")) {
                build_reject_acl_rules(od, lflows, stage, acl, &match,
                                        &actions);
diff --git a/tests/ovn.at b/tests/ovn.at
index 0cab187a3..e280a5ca3 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -12392,6 +12392,212 @@ AT_CHECK([cat 2.packets], [0], [])
 
 AT_CLEANUP
 
+# 3 hypervisors, one logical switch, 3 logical ports per hypervisor
+AT_SETUP([ovn -- L2 Drop and Allow ACL w/ Stateful ACL])
+AT_SKIP_IF([test $HAVE_PYTHON = no])
+ovn_start
+
+# Create hypervisors hv[123].
+# Add vif11 to hv1, vif21 to hv2, vif31 to hv3.
+# Add all of the vifs to a single logical switch lsw0.

Re: [ovs-dev] attaching ebpf program to openvswitch vport driver

2019-12-17 Thread Nicolas Bouliane via dev
>
> In function internal_dev_recv, currently it calls
> netif_rx(skb)
> and this skips the generic xdp code path.
>
> I wonder if it's ok to replace netif_rx with
> netif_receive_skb(skb)
> Then the generic xdp should work.
>
> Ohh, interesting, I'll check that !
cheers,
Nick
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] attaching ebpf program to openvswitch vport driver

2019-12-17 Thread Nicolas Bouliane via dev
>
>
> because internal port is a loopback device, and packet
> does not go through linux tc qdisc. So the attached ebpf
> program through tc does not work.
> Attach XDP program also does not work.
>

Oh I see !

> May I know your use case?
>

I'd like to attach an ebpf program to rate limit packets that are sent to
this interface.
I have an internal service behind that interface that can be reached via
the virtual machines attached to the bridge,
and I would like to rate limit at the interface directly.

Nick
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH net-next v2] openvswitch: add TTL decrement action

2019-12-17 Thread Nikolay Aleksandrov
On 17/12/2019 17:51, Matteo Croce wrote:
> New action to decrement TTL instead of setting it to a fixed value.
> This action will decrement the TTL and, in case of expired TTL, drop it
> or execute an action passed via a nested attribute.
> The default TTL expired action is to drop the packet.
> 
> Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> 
> Tested with a corresponding change in the userspace:
> 
> # ovs-dpctl dump-flows
> in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
> actions:dec_ttl{ttl<=1 action:(drop)},1,1
> in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
> actions:dec_ttl{ttl<=1 action:(drop)},1,2
> in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, 
> actions:2
> in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, 
> actions:1
> 
> # ping -c1 192.168.0.2 -t 42
> IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), 
> length 84)
> 192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
> # ping -c1 192.168.0.2 -t 120
> IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), 
> length 84)
> 192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
> # ping -c1 192.168.0.2 -t 1
> #
> 
> Co-authored-by: Bindiya Kurle 
> Signed-off-by: Bindiya Kurle 
> Signed-off-by: Matteo Croce 
> ---
>  include/uapi/linux/openvswitch.h |  22 +++
>  net/openvswitch/actions.c|  71 +
>  net/openvswitch/flow_netlink.c   | 105 +++
>  3 files changed, 198 insertions(+)
> 

Hi Matteo,

[snip]
> +}
> +
>  /* When 'last' is true, sample() should always consume the 'skb'.
>   * Otherwise, sample() should keep 'skb' intact regardless what
>   * actions are executed within sample().
> @@ -1176,6 +1201,44 @@ static int execute_check_pkt_len(struct datapath *dp, 
> struct sk_buff *skb,
>nla_len(actions), last, clone_flow_key);
>  }
>  
> +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> + int err;
> +
> + if (skb->protocol == htons(ETH_P_IPV6)) {
> + struct ipv6hdr *nh = ipv6_hdr(skb);
> +
> + err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +   sizeof(*nh));

skb_ensure_writable() calls pskb_may_pull() which may reallocate so nh might 
become invalid.
It seems the IPv4 version below is ok as the ptr is reloaded.

One q as I don't know ovs that much - can this action be called only with
skb->protocol ==  ETH_P_IP/IPV6 ? I.e. Are we sure that if it's not v6, then it 
must be v4 ?


Thanks,
 Nik

> + if (unlikely(err))
> + return err;
> +
> + if (nh->hop_limit <= 1)
> + return -EHOSTUNREACH;
> +
> + key->ip.ttl = --nh->hop_limit;
> + } else {
> + struct iphdr *nh = ip_hdr(skb);
> + u8 old_ttl;
> +
> + err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +   sizeof(*nh));
> + if (unlikely(err))
> + return err;
> +
> + nh = ip_hdr(skb);
> + if (nh->ttl <= 1)
> + return -EHOSTUNREACH;
> +
> + old_ttl = nh->ttl--;
> + csum_replace2(&nh->check, htons(old_ttl << 8),
> +   htons(nh->ttl << 8));
> + key->ip.ttl = nh->ttl;
> + }
> +
> + return 0;
> +}
> +
>  /* Execute a list of actions against 'skb'. */
>  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> struct sw_flow_key *key,
> @@ -1347,6 +1410,14 @@ static int do_execute_actions(struct datapath *dp, 
> struct sk_buff *skb,
>  
>   break;
>   }
> +
> + case OVS_ACTION_ATTR_DEC_TTL:
> + err = execute_dec_ttl(skb, key);
> + if (err == -EHOSTUNREACH) {
> + err = dec_ttl(dp, skb, key, a, true);
> + return err;
> + }
> + break;
>   }
>  
>   if (unlikely(err)) {
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 65c2e3458ff5..a9eea2ffb8b0 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -61,6 +61,7 @@ static bool actions_may_change_flow(const struct nlattr 
> *actions)
>   case OVS_ACTION_ATTR_RECIRC:
>   case OVS_ACTION_ATTR_TRUNC:
>   case OVS_ACTION_ATTR_USERSPACE:
> + case OVS_ACTION_ATTR_DEC_TTL:
>   break;
>  
>   case OVS_ACTION_ATTR_CT:
> @@ -2494,6 +2495,59 @@ static int validate_and_copy_sample(struct net *net, 
> const struct nlattr *attr,
>   return 0;
>  }
>

Re: [ovs-dev] attaching ebpf program to openvswitch vport driver

2019-12-17 Thread William Tu
On Tue, Dec 17, 2019 at 07:59:42AM -0800, William Tu wrote:
> On Tue, Dec 17, 2019 at 09:14:00AM -0500, Nicolas Bouliane wrote:
> > >
> > >
> > > type of this port?
> > >
> > Internal
> > 
> > We need to have an IP address set on the interface, which is why we use the
> > internal type.
> > 
> > 
> > > Can you share your "ovs-vsctl show"
> > > If meta0 is "type: internal", then it doesn't work.
> > >
> > 
> > Port "meta0"
> > Interface "meta0"
> > type: internal
> > 
> > >
> > > I think other types, such as system or tunnel port
> > > should work (the xdpgeneric should see packets)
> > >
> > 
> > Is there a fundamental reason why the 'internal' type doesn't work ?
> > Is it something that could be modified at the driver level ?
> > Any pointer to get me started in that direction ?
> > 
> > thanks !
> > Nick
> 
> because internal port is a loopback device, and packet
> does not go through linux tc qdisc. So the attached ebpf
> program through tc does not work.
> Attach XDP program also does not work. 
> 
> see the .send function.
> net/openvswitch/vport-internal_dev.c
> static struct vport_ops ovs_internal_vport_ops = { 
> .type   = OVS_VPORT_TYPE_INTERNAL,
> .create = internal_dev_create,
> .destroy= internal_dev_destroy,
> .send   = internal_dev_recv,
> };
> 
> May I know your use case?
> William

In function internal_dev_recv, currently it calls
netif_rx(skb)
and this skips the generic xdp code path.

I wonder if it's ok to replace netif_rx with
netif_receive_skb(skb)
Then the generic xdp should work.

Regards,
William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v8 1/1] Avoid dp_hash recirculation for balance-tcp bond selection mode

2019-12-17 Thread Matteo Croce
On Tue, Dec 17, 2019 at 11:03 AM Vishal Deep Ajmera
 wrote:
>
> Problem:
> 
> In OVS-DPDK, flows with output over a bond interface of type “balance-tcp”
> (using a hash on TCP/UDP 5-tuple) get translated by the ofproto layer into
> "HASH" and "RECIRC" datapath actions. After recirculation, the packet is
> forwarded to the bond member port based on 8-bits of the datapath hash
> value computed through dp_hash. This causes performance degradation in the
> following ways:
>

ACK.

Just a nitpick: can you remove the non ASCII double quotes from the
commit message? They shows in git log as:

<80><9C>balance-tcp<80><9D>

Regards,
-- 
Matteo Croce
per aspera ad upstream

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] attaching ebpf program to openvswitch vport driver

2019-12-17 Thread William Tu
On Tue, Dec 17, 2019 at 09:14:00AM -0500, Nicolas Bouliane wrote:
> >
> >
> > type of this port?
> >
> Internal
> 
> We need to have an IP address set on the interface, which is why we use the
> internal type.
> 
> 
> > Can you share your "ovs-vsctl show"
> > If meta0 is "type: internal", then it doesn't work.
> >
> 
> Port "meta0"
> Interface "meta0"
> type: internal
> 
> >
> > I think other types, such as system or tunnel port
> > should work (the xdpgeneric should see packets)
> >
> 
> Is there a fundamental reason why the 'internal' type doesn't work ?
> Is it something that could be modified at the driver level ?
> Any pointer to get me started in that direction ?
> 
> thanks !
> Nick

because internal port is a loopback device, and packet
does not go through linux tc qdisc. So the attached ebpf
program through tc does not work.
Attach XDP program also does not work. 

see the .send function.
net/openvswitch/vport-internal_dev.c
static struct vport_ops ovs_internal_vport_ops = { 
.type   = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy= internal_dev_destroy,
.send   = internal_dev_recv,
};

May I know your use case?
William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH net-next v2] openvswitch: add TTL decrement action

2019-12-17 Thread Matteo Croce
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.

Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.

Tested with a corresponding change in the userspace:

# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
actions:dec_ttl{ttl<=1 action:(drop)},1,1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, 
actions:dec_ttl{ttl<=1 action:(drop)},1,2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1

# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 
84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), 
length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#

Co-authored-by: Bindiya Kurle 
Signed-off-by: Bindiya Kurle 
Signed-off-by: Matteo Croce 
---
 include/uapi/linux/openvswitch.h |  22 +++
 net/openvswitch/actions.c|  71 +
 net/openvswitch/flow_netlink.c   | 105 +++
 3 files changed, 198 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a87b44cd5590..b6684bc04883 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -927,6 +927,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_METER,/* u32 meter ID. */
OVS_ACTION_ATTR_CLONE,/* Nested OVS_CLONE_ATTR_*.  */
OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
+   OVS_ACTION_ATTR_DEC_TTL,   /* Nested OVS_DEC_TTL_ATTR_*. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
@@ -939,6 +940,23 @@ enum ovs_action_attr {
 };
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
+enum ovs_dec_ttl_attr {
+   OVS_DEC_TTL_ATTR_UNSPEC,
+   OVS_DEC_TTL_ATTR_ACTION_TYPE,/* Action Type u32 */
+   OVS_DEC_TTL_ATTR_ACTION, /* nested action */
+   __OVS_DEC_TTL_ATTR_MAX,
+#ifdef __KERNEL__
+   OVS_DEC_TTL_ATTR_ARG  /* struct sample_arg  */
+#endif
+};
+
+#ifdef __KERNEL__
+struct dec_ttl_arg {
+   u32 action_type;/* dec_ttl action type.*/
+};
+#endif
+
+#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1)
 
 /* Meters. */
 #define OVS_METER_FAMILY  "ovs_meter"
@@ -1009,6 +1027,10 @@ enum ovs_ct_limit_attr {
__OVS_CT_LIMIT_ATTR_MAX
 };
 
+enum ovs_dec_ttl_action {/*Actions supported by dec_ttl */
+   OVS_DEC_TTL_ACTION_DROP,
+   OVS_DEC_TTL_ACTION_USER_SPACE
+};
 #define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
 
 #define OVS_ZONE_LIMIT_DEFAULT_ZONE -1
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 4c8395462303..5329668732b1 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -960,6 +960,31 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
 }
 
+static int dec_ttl(struct datapath *dp, struct sk_buff *skb,
+  struct sw_flow_key *fk, const struct nlattr *attr, bool last)
+{
+   struct nlattr *actions;
+   struct nlattr *dec_ttl_arg;
+   int rem = nla_len(attr);
+   const struct dec_ttl_arg *arg;
+
+   /* The first action is always OVS_DEC_TTL_ATTR_ARG. */
+   dec_ttl_arg = nla_data(attr);
+   arg = nla_data(dec_ttl_arg);
+   actions = nla_next(dec_ttl_arg, &rem);
+
+   switch (arg->action_type) {
+   case OVS_DEC_TTL_ACTION_DROP:
+   consume_skb(skb);
+   break;
+
+   case OVS_DEC_TTL_ACTION_USER_SPACE:
+   return clone_execute(dp, skb, fk, 0, actions, rem, last, false);
+   }
+
+   return 0;
+}
+
 /* When 'last' is true, sample() should always consume the 'skb'.
  * Otherwise, sample() should keep 'skb' intact regardless what
  * actions are executed within sample().
@@ -1176,6 +1201,44 @@ static int execute_check_pkt_len(struct datapath *dp, 
struct sk_buff *skb,
 nla_len(actions), last, clone_flow_key);
 }
 
+static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
+{
+   int err;
+
+   if (skb->protocol == htons(ETH_P_IPV6)) {
+   struct ipv6hdr *nh = ipv6_hdr(skb);
+
+   err = skb_ensure_writable(skb, skb_network_offset(skb) +
+ sizeof(*nh));
+   if (unlikely(err))
+  

[ovs-dev] [PATCH] netdev-dpdk: Fix sw stats perf drop.

2019-12-17 Thread Kevin Traynor
Accessing the sw stats in the vhost datapath of a PVP test
can incur a performance drop of ~2%.

Most of the time these stats will just be getting zero added
to them. By checking if there is a non-zero update first, we
can avoid accessing them when they won't be updated and avoid
the performance drop.

Fixes: 2f862c712e52 ("netdev-dpdk: Detailed packet drop statistics.")
Signed-off-by: Kevin Traynor 
---
 lib/netdev-dpdk.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 89c73a29b..238f29a2a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2175,5 +2175,4 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk 
*dev,
  int qos_drops)
 {
-struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
 struct netdev_stats *stats = &dev->stats;
 struct dp_packet *packet;
@@ -2206,5 +2205,7 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk 
*dev,
 }
 
-sw_stats->rx_qos_drops += qos_drops;
+if (OVS_UNLIKELY(qos_drops)) {
+dev->sw_stats->rx_qos_drops += qos_drops;
+}
 }
 
@@ -2370,5 +2371,4 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk 
*dev,
  struct netdev_dpdk_sw_stats *sw_stats_add)
 {
-struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
 int dropped = sw_stats_add->tx_mtu_exceeded_drops +
   sw_stats_add->tx_qos_drops +
@@ -2385,8 +2385,12 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk 
*dev,
 }
 
-sw_stats->tx_retries+= sw_stats_add->tx_retries;
-sw_stats->tx_failure_drops  += sw_stats_add->tx_failure_drops;
-sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
-sw_stats->tx_qos_drops  += sw_stats_add->tx_qos_drops;
+if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) {
+struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
+
+sw_stats->tx_retries+= sw_stats_add->tx_retries;
+sw_stats->tx_failure_drops  += sw_stats_add->tx_failure_drops;
+sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
+sw_stats->tx_qos_drops  += sw_stats_add->tx_qos_drops;
+}
 }
 
-- 
2.21.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v3] ovsdb replication: Provide option to configure probe interval.

2019-12-17 Thread Dumitru Ceara
On Fri, Dec 13, 2019 at 9:16 PM  wrote:
>
> From: Numan Siddique 
>
> When ovsdb-server is in backup mode and connects to the active
> ovsdb-server for replication, and if takes more than 5 seconds to
> get the dump of the whole database, it will drop the connection
> soon after as the default probe interval is 5 seconds. This
> results in a snowball effect of reconnections to the active
> ovsdb-server.
>
> This patch handles or mitigates this issue by setting the
> default probe interval value to 60 seconds and provide the option to
> configure this value from the unixctl command.
>
> Other option could be increase the value of 'RECONNECT_DEFAULT_PROBE_INTERVAL'
> to a higher value.
>
> Acked-by: Mark Michelson 
> Signed-off-by: Numan Siddique 

Looks good to me.

Acked-by: Dumitru Ceara 

> ---
>  ovsdb/ovsdb-server.1.in |  6 ++
>  ovsdb/ovsdb-server.c| 46 ++---
>  ovsdb/replication.c |  4 +++-
>  ovsdb/replication.h |  4 +++-
>  4 files changed, 51 insertions(+), 9 deletions(-)
>
> diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in
> index 21f527bc6..daa420909 100644
> --- a/ovsdb/ovsdb-server.1.in
> +++ b/ovsdb/ovsdb-server.1.in
> @@ -288,6 +288,12 @@ Switches the server to an active role.  The server stops 
> synchronizing
>  its databases with an active server and closes all existing client
>  connections, which requires clients to reconnect.
>  .
> +.IP "\fBovsdb\-server/set\-active\-ovsdb\-server-probe-interval \fIprobe 
> interval"
> +Sets  the probe interval (in milli seconds) for the connection to
> +active \fIserver\fR. For the value to take effect, the connection to the
> +active \fIserver\fR should be disconnected and connected again.
> +.
> +.
>  .IP "\fBovsdb\-server/set\-sync\-exclude\-tables 
> \fIdb\fB:\fItable\fR[\fB,\fIdb\fB:\fItable\fR]..."
>  Sets the \fItable\fR within \fIdb\fR that will be excluded from 
> synchronization.
>  This overrides the \fB\-\-sync\-exclude-tables\fR command-line option.
> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
> index 9827320ec..4ffe24ad7 100644
> --- a/ovsdb/ovsdb-server.c
> +++ b/ovsdb/ovsdb-server.c
> @@ -86,6 +86,7 @@ static unixctl_cb_func ovsdb_server_set_active_ovsdb_server;
>  static unixctl_cb_func ovsdb_server_get_active_ovsdb_server;
>  static unixctl_cb_func ovsdb_server_connect_active_ovsdb_server;
>  static unixctl_cb_func ovsdb_server_disconnect_active_ovsdb_server;
> +static unixctl_cb_func ovsdb_server_set_active_ovsdb_server_probe_interval;
>  static unixctl_cb_func ovsdb_server_set_sync_exclude_tables;
>  static unixctl_cb_func ovsdb_server_get_sync_exclude_tables;
>  static unixctl_cb_func ovsdb_server_get_sync_status;
> @@ -97,6 +98,7 @@ struct server_config {
>  char **sync_from;
>  char **sync_exclude;
>  bool *is_backup;
> +int *replication_probe_interval;
>  struct ovsdb_jsonrpc_server *jsonrpc;
>  };
>  static unixctl_cb_func ovsdb_server_add_remote;
> @@ -144,9 +146,10 @@ static void load_config(FILE *config_file, struct sset 
> *remotes,
>
>  static void
>  ovsdb_replication_init(const char *sync_from, const char *exclude,
> -   struct shash *all_dbs, const struct uuid *server_uuid)
> +   struct shash *all_dbs, const struct uuid *server_uuid,
> +   int probe_interval)
>  {
> -replication_init(sync_from, exclude, server_uuid);
> +replication_init(sync_from, exclude, server_uuid, probe_interval);
>  struct shash_node *node;
>  SHASH_FOR_EACH (node, all_dbs) {
>  struct db *db = node->data;
> @@ -304,6 +307,7 @@ main(int argc, char *argv[])
>  struct server_config server_config;
>  struct shash all_dbs;
>  struct shash_node *node, *next;
> +int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL;
>
>  ovs_cmdl_proctitle_init(argc, argv);
>  set_program_name(argv[0]);
> @@ -351,6 +355,7 @@ main(int argc, char *argv[])
>  server_config.sync_from = &sync_from;
>  server_config.sync_exclude = &sync_exclude;
>  server_config.is_backup = &is_backup;
> +server_config.replication_probe_interval = &replication_probe_interval;
>
>  perf_counters_init();
>
> @@ -436,6 +441,9 @@ main(int argc, char *argv[])
>  unixctl_command_register("ovsdb-server/disconnect-active-ovsdb-server", 
> "",
>   0, 0, 
> ovsdb_server_disconnect_active_ovsdb_server,
>   &server_config);
> +unixctl_command_register(
> +"ovsdb-server/set-active-ovsdb-server-probe-interval", "", 1, 1,
> +ovsdb_server_set_active_ovsdb_server_probe_interval, &server_config);
>  unixctl_command_register("ovsdb-server/set-sync-exclude-tables", "",
>   0, 1, ovsdb_server_set_sync_exclude_tables,
>   &server_config);
> @@ -454,7 +462,8 @@ main(int argc, char *argv[])
>  if (is_backup) {
>  const struct u

Re: [ovs-dev] attaching ebpf program to openvswitch vport driver

2019-12-17 Thread Nicolas Bouliane via dev
>
>
> type of this port?
>
Internal

We need to have an IP address set on the interface, which is why we use the
internal type.


> Can you share your "ovs-vsctl show"
> If meta0 is "type: internal", then it doesn't work.
>

Port "meta0"
Interface "meta0"
type: internal

>
> I think other types, such as system or tunnel port
> should work (the xdpgeneric should see packets)
>

Is there a fundamental reason why the 'internal' type doesn't work ?
Is it something that could be modified at the driver level ?
Any pointer to get me started in that direction ?

thanks !
Nick
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v6] netdev-dpdk: add support for the RTE_ETH_EVENT_INTR_RESET event

2019-12-17 Thread Eelco Chaudron
Any comments on this patch? Guess it might be waiting for “[PATCH] 
bridge: Allow manual notifications about interfaces”.


Would like to get this ready for 2.13…

Cheers,

Eelco


On 11 Nov 2019, at 15:02, Eelco Chaudron wrote:


Currently, OVS does not register and therefore not handle the
interface reset event from the DPDK framework. This would cause a
problem in cases where a VF is used as an interface, and its
configuration changes.

As an example in the following scenario the MAC change is not
detected/acted upon until OVS is restarted without the patch applied:

  $ echo 1 > /sys/bus/pci/devices/:05:00.1/sriov_numvfs
  $ ovs-vsctl add-port ovs_pvp_br0 dpdk0 -- \
set Interface dpdk0 type=dpdk -- \
set Interface dpdk0 options:dpdk-devargs=:05:0a.0

  $ ip link set p5p2 vf 0 mac 52:54:00:92:d3:33

Requires patch "bridge: Allow manual notifications about interfaces' 
updates."


Signed-off-by: Eelco Chaudron 
---
v5 -> v6:
  Moved the if_notifier_manual_report() function till after the
  actual reset of the interface to make it more clear that it's
  supposed to happen AFTER the reset.

  Removed empty line in the code

v4 -> v5:
  Using new if_notifier_manual_report() API

v3 -> v4:
  Add support for if-notification to make sure set_config()
  gets called

v2 -> v3:
v1 -> v2:
  Fixed Ilya's comments

 lib/netdev-dpdk.c |   54 
+++--

 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 4805783..cb57b96 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -46,6 +46,7 @@
 #include "dpdk.h"
 #include "dpif-netdev.h"
 #include "fatal-signal.h"
+#include "if-notifier.h"
 #include "netdev-provider.h"
 #include "netdev-vport.h"
 #include "odp-util.h"
@@ -396,6 +397,8 @@ struct netdev_dpdk {
 bool attached;
 /* If true, rte_eth_dev_start() was successfully called */
 bool started;
+bool reset_needed;
+/* 1 pad byte here. */
 struct eth_addr hwaddr;
 int mtu;
 int socket_id;
@@ -1173,6 +1176,8 @@ common_construct(struct netdev *netdev, 
dpdk_port_t port_no,

 ovsrcu_index_init(&dev->vid, -1);
 dev->vhost_reconfigured = false;
 dev->attached = false;
+dev->started = false;
+dev->reset_needed = false;

 ovsrcu_init(&dev->qos_conf, NULL);

@@ -1769,6 +1774,34 @@ netdev_dpdk_process_devargs(struct netdev_dpdk 
*dev,

 return new_port_id;
 }

+static int
+dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type 
type,
+void *param OVS_UNUSED, void *ret_param 
OVS_UNUSED)

+{
+struct netdev_dpdk *dev;
+
+switch ((int) type) {
+case RTE_ETH_EVENT_INTR_RESET:
+ovs_mutex_lock(&dpdk_mutex);
+dev = netdev_dpdk_lookup_by_port_id(port_id);
+if (dev) {
+ovs_mutex_lock(&dev->mutex);
+dev->reset_needed = true;
+netdev_request_reconfigure(&dev->up);
+VLOG_DBG_RL(&rl, "%s: Device reset requested.",
+netdev_get_name(&dev->up));
+ovs_mutex_unlock(&dev->mutex);
+}
+ovs_mutex_unlock(&dpdk_mutex);
+break;
+
+default:
+/* Ignore all other types. */
+break;
+   }
+   return 0;
+}
+
 static void
 dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap 
*args)

 OVS_REQUIRES(dev->mutex)
@@ -3807,6 +3840,8 @@ netdev_dpdk_class_init(void)
 /* This function can be called for different classes.  The 
initialization

  * needs to be done only once */
 if (ovsthread_once_start(&once)) {
+int ret;
+
 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
 unixctl_command_register("netdev-dpdk/set-admin-state",
  "[netdev] up|down", 1, 2,
@@ -3820,6 +3855,14 @@ netdev_dpdk_class_init(void)
  "[netdev]", 0, 1,
  netdev_dpdk_get_mempool_info, 
NULL);


+ret = rte_eth_dev_callback_register(RTE_ETH_ALL,
+
RTE_ETH_EVENT_INTR_RESET,
+dpdk_eth_event_callback, 
NULL);

+if (ret != 0) {
+VLOG_ERR("Ethernet device callback register error: %s",
+ rte_strerror(-ret));
+}
+
 ovsthread_once_done(&once);
 }

@@ -4180,13 +4223,20 @@ netdev_dpdk_reconfigure(struct netdev 
*netdev)

 && dev->rxq_size == dev->requested_rxq_size
 && dev->txq_size == dev->requested_txq_size
 && dev->socket_id == dev->requested_socket_id
-&& dev->started) {
+&& dev->started && !dev->reset_needed) {
 /* Reconfiguration is unnecessary */

 goto out;
 }

-rte_eth_dev_stop(dev->port_id);
+if (dev->reset_needed) {
+rte_eth_dev_reset(dev->port_id);
+if_notifier_manual_report();
+

Re: [ovs-dev] [PATCH] bridge: Allow manual notifications about interfaces' updates.

2019-12-17 Thread Eelco Chaudron



On 6 Nov 2019, at 14:32, Eelco Chaudron wrote:


On 5 Nov 2019, at 18:20, Ilya Maximets wrote:


Sometimes interface updates could happen in a way ifnotifier is not
able to catch.  For example some heavy operations (device reset) in
netdev-dpdk could require re-applying of the bridge configuration.

For this purpose new manual notifier introduced. Its function
'if_notifier_manual_report()' could be called directly by the code
that aware about changes.  This new notifier is thread-safe.

Signed-off-by: Ilya Maximets 


Reviewed and tested this patch in combination with the “netdev-dpdk: 
add support for the RTE_ETH_EVENT_INTR_RESET event”.


LGTM,

Acked-by: Eelco Chaudron 


Is there anything waiting to get this included so my other patch could 
get included?


Cheers,

Eelco

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v8 0/1] Balance-tcp bond mode optimization

2019-12-17 Thread Vishal Deep Ajmera via dev
v7->v8:
 Removed hash action for balance-tcp mode.
 Removed bond-only pmd reload action.
 Rebased to OVS master.

v6->v7:
 Fixed issue reported by Matteo for bond/show.

v5->v6:
 Addressed comments from Ilya Maximets.
 https://mail.openvswitch.org/pipermail/ovs-dev/2019-August/362001.html
 Rebased to OVS master.

v4->v5:
 Support for stats per hash bucket.
 Support for dynamic load balancing.
 Rebased to OVS Master.

v3->v4:
 Addressed Ilya Maximets comments.
 https://mail.openvswitch.org/pipermail/ovs-dev/2019-July/360452.html

v2->v3:
 Rebased to OVS master.
 Fixed git merge issue.

v1->v2:
 Updated datapath action to hash + lb-output.
 Updated throughput test observations.
 Rebased to OVS master.

Vishal Deep Ajmera (1):
  Avoid dp_hash recirculation for balance-tcp bond selection mode

 datapath/linux/compat/include/linux/openvswitch.h |   1 +
 lib/dpif-netdev.c | 503 --
 lib/dpif-netlink.c|   3 +
 lib/dpif-provider.h   |   8 +
 lib/dpif.c|  48 +++
 lib/dpif.h|   7 +
 lib/odp-execute.c |   2 +
 lib/odp-util.c|   4 +
 ofproto/bond.c|  50 ++-
 ofproto/bond.h|   9 +
 ofproto/ofproto-dpif-ipfix.c  |   1 +
 ofproto/ofproto-dpif-sflow.c  |   1 +
 ofproto/ofproto-dpif-xlate.c  |  43 +-
 ofproto/ofproto-dpif.c|  32 ++
 ofproto/ofproto-dpif.h|  10 +-
 tests/lacp.at |   9 +
 vswitchd/bridge.c |   6 +
 vswitchd/vswitch.xml  |  10 +
 18 files changed, 688 insertions(+), 59 deletions(-)

-- 
1.9.1

Signed-off-by: Manohar Krishnappa Chidambaraswamy 
Co-authored-by: Manohar Krishnappa Chidambaraswamy 
Signed-off-by: Vishal Deep Ajmera 

CC: Jan Scheurich 
CC: Venkatesan Pradeep 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v8 1/1] Avoid dp_hash recirculation for balance-tcp bond selection mode

2019-12-17 Thread Vishal Deep Ajmera via dev
Problem:

In OVS-DPDK, flows with output over a bond interface of type “balance-tcp”
(using a hash on TCP/UDP 5-tuple) get translated by the ofproto layer into
"HASH" and "RECIRC" datapath actions. After recirculation, the packet is
forwarded to the bond member port based on 8-bits of the datapath hash
value computed through dp_hash. This causes performance degradation in the
following ways:

1. The recirculation of the packet implies another lookup of the packet’s
flow key in the exact match cache (EMC) and potentially Megaflow classifier
(DPCLS). This is the biggest cost factor.

2. The recirculated packets have a new “RSS” hash and compete with the
original packets for the scarce number of EMC slots. This implies more
EMC misses and potentially EMC thrashing causing costly DPCLS lookups.

3. The 256 extra megaflow entries per bond for dp_hash bond selection put
additional load on the revalidation threads.

Owing to this performance degradation, deployments stick to “balance-slb”
bond mode even though it does not do active-active load balancing for
VXLAN- and GRE-tunnelled traffic because all tunnel packet have the same
source MAC address.

Proposed optimization:
--
This proposal introduces a new load-balancing output action instead of
recirculation.

Maintain one table per-bond (could just be an array of uint16's) and
program it the same way internal flows are created today for each possible
hash value(256 entries) from ofproto layer. Use this table to load-balance
flows as part of output action processing.

Currently xlate_normal() -> output_normal() -> bond_update_post_recirc_rules()
-> bond_may_recirc() and compose_output_action__() generate
“dp_hash(hash_l4(0))” and “recirc()” actions. In this case the
RecircID identifies the bond. For the recirculated packets the ofproto layer
installs megaflow entries that match on RecircID and masked dp_hash and send
them to the corresponding output port.

Instead, we will now generate action as
"lb_output(bond,)"

This combines hash computation (only if needed, else re-use RSS hash) and
inline load-balancing over the bond. This action is used *only* for balance-tcp
bonds in OVS-DPDK datapath (the OVS kernel datapath remains unchanged).

Example:

Current scheme:
---
With 1 IP-UDP flow:

flow-dump from pmd on cpu core: 2
recirc_id(0),in_port(7),packet_type(ns=0,id=0),eth(src=02:00:00:02:14:01,dst=0c:c4:7a:58:f0:2b),eth_type(0x0800),ipv4(frag=no),
 packets:2828969, bytes:181054016, used:0.000s, 
actions:hash(hash_l4(0)),recirc(0x1)

recirc_id(0x1),dp_hash(0x113683bd/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:2828937, bytes:181051968, used:0.000s, actions:2

With 8 IP-UDP flows (with random UDP src port): (all hitting same DPCL):

flow-dump from pmd on cpu core: 2
recirc_id(0),in_port(7),packet_type(ns=0,id=0),eth(src=02:00:00:02:14:01,dst=0c:c4:7a:58:f0:2b),eth_type(0x0800),ipv4(frag=no),
 packets:2674009, bytes:171136576, used:0.000s, 
actions:hash(hash_l4(0)),recirc(0x1)

recirc_id(0x1),dp_hash(0xf8e02b7e/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:377395, bytes:24153280, used:0.000s, actions:2
recirc_id(0x1),dp_hash(0xb236c260/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:333486, bytes:21343104, used:0.000s, actions:1
recirc_id(0x1),dp_hash(0x7d89eb18/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:348461, bytes:22301504, used:0.000s, actions:1
recirc_id(0x1),dp_hash(0xa78d75df/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:633353, bytes:40534592, used:0.000s, actions:2
recirc_id(0x1),dp_hash(0xb58d846f/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:319901, bytes:20473664, used:0.001s, actions:2
recirc_id(0x1),dp_hash(0x24534406/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:334985, bytes:21439040, used:0.001s, actions:1
recirc_id(0x1),dp_hash(0x3cf32550/0xff),in_port(7),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:326404, bytes:20889856, used:0.001s, actions:1

New scheme:
---
We can do with a single flow entry (for any number of new flows):

in_port(7),packet_type(ns=0,id=0),eth(src=02:00:00:02:14:01,dst=0c:c4:7a:58:f0:2b),eth_type(0x0800),ipv4(frag=no),
 packets:2674009, bytes:171136576, used:0.000s, actions:lb_output(bond,1)

A new CLI has been added to dump datapath bond cache as given below.

“sudo ovs-appctl dpif-netdev/dp-bond-show [dp]”

root@ubuntu-190:performance_scripts # sudo ovs-appctl dpif-netdev/dp-bond-show
Bond cache:
bond-id 1 :
bucket 0 - slave 2
bucket 1 - slave 1
bucket 2 - slave 2
bucket 3 - slave 1

Performance improvement:

With a prototype of the proposed idea, the following perf improvement is seen
with Phy-VM-Phy UDP traf

Re: [ovs-dev] [PATCH v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

2019-12-17 Thread David Marchand
On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets  wrote:
> > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
> >  return false;
> >  }
> >
> > +static void
> > +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> > +struct svec *args)
> > +{
> > +const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> > +struct svec lcores = SVEC_EMPTY_INITIALIZER;
> > +struct ovs_numa_info_core *core;
> > +struct ovs_numa_dump *cores;
> > +int index = 0;
> > +
> > +if (!cmask) {
> > +return;
> > +}
> > +if (args_contains(args, "-c") || args_contains(args, "-l") ||
> > +args_contains(args, "--lcores")) {
> > +VLOG_WARN("Ignoring database defined option 
> > 'dpdk-lcore-mask' "
> > +  "due to dpdk-extra config");
> > +return;
> > +}
> > +
> > +cores = ovs_numa_dump_cores_with_cmask(cmask);
> > +FOR_EACH_CORE_ON_DUMP(core, cores) {
> > +svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
>
> What's the point of re-mapping these cores?
> Just let DPDK to fail initialization and user will adjust cores.
> This automatic re-mapping only complicates everything.

Deal with existing user of this parameter.
Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432


>
> IMHO, user should not configure DPDK cores at all.  So, it's users' problem
> if cores configured incorrectly.

If the user explicitly had set a dpdk option, ok, this is his problem.
But here, this is an OVS option, handling this internally makes more
sense to me.


> > +index++;
> > +}
> > +svec_terminate(&lcores);
> > +ovs_numa_dump_destroy(cores);
> > +svec_add(args, "--lcores");
> > +svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> > +svec_destroy(&lcores);
> > +}
> > +
> >  static void
> >  construct_dpdk_options(const struct smap *ovs_other_config, struct svec 
> > *args)
> >  {
> > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap 
> > *ovs_other_config, struct svec *args)
> >  bool default_enabled;
> >  const char *default_value;
> >  } opts[] = {
> > -{"dpdk-lcore-mask",   "-c", false, NULL},
> >  {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
> >  {"dpdk-socket-limit", "--socket-limit", false, NULL},
> >  };
> > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap 
> > *ovs_other_config, struct svec *args)
> >  svec_parse_words(args, extra_configuration);
> >  }
> >
> > +construct_dpdk_lcore_option(ovs_other_config, args);
> >  construct_dpdk_options(ovs_other_config, args);
> >  construct_dpdk_mutex_options(ovs_other_config, args);
> >  }
> > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
> >  .write = dpdk_log_write,
> >  };
> >
> > +static void
> > +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> > +{
> > +struct svec cores = SVEC_EMPTY_INITIALIZER;
> > +rte_cpuset_t cpuset;
> > +unsigned cpu;
> > +
> > +cpuset = rte_lcore_cpuset(lcore);
> > +for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> > +if (!CPU_ISSET(cpu, &cpuset)) {
> > +continue;
> > +}
> > +svec_add_nocopy(&cores, xasprintf("%u", cpu));
> > +}> +svec_terminate(&cores);
> > +ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> > +  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> > +  svec_join(&cores, ",", ""));
> > +svec_destroy(&cores);
> > +}
> > +
> > +static void
> > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> > + void *aux OVS_UNUSED)
> > +{
> > +struct ds ds = DS_EMPTY_INITIALIZER;
> > +unsigned lcore;
> > +
> > +ovs_mutex_lock(&lcore_bitmap_mutex);
> > +if (lcore_bitmap == NULL) {
> > +unixctl_command_reply_error(conn, "DPDK has not been initialised");
> > +goto out;
> > +}
> > +if (argc > 1) {
> > +if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> > +!bitmap_is_set(lcore_bitmap, lcore)) {
> > +unixctl_command_reply_error(conn, "incorrect lcoreid");
> > +goto out;
> > +}
> > +dpdk_dump_lcore(&ds, lcore);
> > +} else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>
> Not a good style to have 'else for'.

Tried to be creative :-) (the additional indent is really unnecessary).
But, if this is a blocker for this patch, ok.


>
> DPDK lcore iterator?

We are iterating over lcores owned by DPDK and OVS.

Example:
other_config: {dpdk-init="true", dpdk-lcore-mask="0x1",
pmd-cpu-mask="0x8002"}

# ovs-appctl dpdk/dump-lcores
lcore0 (DPDK) is running on core 0
lcore1 (OVS) is running on core 1
lcore2 (OVS) is running on core 15


>
> > +if (!bitmap_is_set(lcore_bitmap, lcore)) {
> > +continu

Re: [ovs-dev] [PATCH v5 06/10] tc: Move tunnel_key unset action before output ports

2019-12-17 Thread Paul Blakey
I have no problem, maybe remove the changelog part?

paulb@reg-r-vrt-019-180 openvswitch (git (detached from origin/master))$ git 
checkout FETCH_HEAD
Note: checking out 'FETCH_HEAD'.
paulb@reg-r-vrt-019-180 openvswitch (git (detached from origin/master))$ git am 
0submit/ovs/ovs/5/0001-match-Add-match_set_ct_zone_masked-helper.patch
Applying: match: Add match_set_ct_zone_masked helper
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0002*
Applying: compat: Add tc ct action and flower matches defines for older kernels
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0003*
Applying: tc: Introduce tcf_id to specify a tc filter
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0004*
Applying: netdev-offload-tc: Implement netdev tc flush via tc filter del
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0005*
Applying: dpif: Add support to set user features
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0006*
Applying: tc: Move tunnel_key unset action before output ports
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0007*
Applying: netdev-offload-tc: Add recirculation support via tc chains
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0008*
Applying: netdev-offload-tc: Add conntrack support
^[[Apaulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git 
am 0submit/ovs/ovs/5/0009*
Applying: netdev-offload-tc: Add conntrack label and mark support
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git am 
0submit/ovs/ovs/5/0010*
Applying: netdev-offload-tc: Add conntrack nat support
paulb@reg-r-vrt-019-180 openvswitch (git (detached from FETCH_HEAD))$ git log 
--oneline -11
574b18d (HEAD) netdev-offload-tc: Add conntrack nat support
ce5fb14 netdev-offload-tc: Add conntrack label and mark support
085fc42 netdev-offload-tc: Add conntrack support
4515f8d netdev-offload-tc: Add recirculation support via tc chains
bf1e1c3 tc: Move tunnel_key unset action before output ports
4b4f5c6 dpif: Add support to set user features
aa5e592 netdev-offload-tc: Implement netdev tc flush via tc filter del
015e4d1 tc: Introduce tcf_id to specify a tc filter
ca41aaf compat: Add tc ct action and flower matches defines for older kernels
c9c3fa9 match: Add match_set_ct_zone_masked helper
391b52f rhel: Support RHEL 7.8 kernel module rpm build



Paul.


From: Marcelo Ricardo Leitner 
Sent: Monday, December 16, 2019 7:52 PM
To: Paul Blakey 
Cc: Roi Dayan ; Simon Horman ; 
Oz Shlomo ; Justin Pettit ; Ilya 
Maximets ; Ben Pfaff ; d...@openvswitch.org 

Subject: Re: [ovs-dev][PATCH v5 06/10] tc: Move tunnel_key unset action before 
output ports

On Mon, Dec 16, 2019 at 05:53:17PM +0200, Paul Blakey wrote:
> diff --git a/lib/tc.c b/lib/tc.c
> index b2d8ca7..7a4acce 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -665,6 +665,12 @@ nl_parse_flower_tunnel(struct nlattr **attrs, struct 
> tc_flower *flower)

Not sure why but my 'git am' can't process this patch:
$ git am --show-current | patch -p1
patching file lib/tc.c
patch:  malformed patch at line 109: c_flower *flower)

git am --show-current  gives:

--- a/lib/tc.c
+++ b/lib/tc.c
@@ -665,6 +665,12 @@ nl_parse_flower_tunnel(struct nlattr **attrs, struct t=
  line broken here ^
c_flower *flower)
 flower->mask.tunnel.ttl =3D
  ^^ other encoding
 nl_attr_get_u8(attrs[TCA_FLOWER_KEY_ENC_IP_TTL_MASK]);
 }
+
+if (!is_all_zeros(&flower->mask.tunnel, sizeof flower->mask.tunnel) ||
+!is_all_zeros(&flower->key.tunnel, sizeof flower->key.tunnel)) {
+flower->tunnel =3D true;
 ^^ another here

but if I display it with mutt, I can see the patch just fine. Seems
git am is not decoding something here (git-2.23.0-1.fc31).


>  flower->mask.tunnel.ttl =
>  nl_attr_get_u8(attrs[TCA_FLOWER_KEY_ENC_IP_TTL_MASK]);
>  }
> +
> +if (!is_all_zeros(&flower->mask.tunnel, sizeof flower->mask.tunnel) ||
> +!is_all_zeros(&flower->key.tunnel, sizeof flower->key.tunnel)) {
> +flower->tunnel = true;
> +}
> +
>  if (attrs[TCA_FLOWER_KEY_ENC_OPTS] &&
>  attrs[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
>   err = nl_parse_flower_tunnel_opts(attrs[TCA_FLOWER_KEY_ENC_OPTS],

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev