Re: [ovs-dev] [PATCH ovn] northd: Remove the usage of 'ct.inv' in logical flows.

2021-02-24 Thread Han Zhou
On Wed, Feb 24, 2021 at 5:27 AM  wrote:
>
> From: Numan Siddique 
>
> Presently we add 65535 priority lflows in the stages -
> 'ls_in_acl' and 'ls_out_acl' to drop packets which
> match on 'ct.inv'.
>
> As per the 'ovs-fields' man page, this
> ct state field can be used to identify problems such as:
>  •  L3/L4 protocol handler is not loaded/unavailable.
>
>  •  L3/L4 protocol handler determines that the packet is
> malformed.
>
>  •  Packets are unexpected length for protocol.
>
> This patch removes the usage of this field for the following
> reasons:
>
>  • Some of the smart NICs which support offloading datapath
>flows don't support this field.

What do you mean by "don't support this field"? Do you mean the NIC
offloading supports connection tracking, but cannot detect if a packet is
invalid and always populate the ct.inv as 0?

>
>  • A recent commit in kernel ovs datapath sets the committed
>connection tracking entry to be liberal for out-of-window
>tcp packets (nf_ct_set_tcp_be_liberal()).  Such TCP
>packets will not be marked as invalid.
>

Could you share a link to this commit?

>  • Even if a ct.inv packet is delivered to a VIF, the
>networking stack of the VIF's kernel can handle such
>packets.

I have some concern for this point. We shouldn't make assumptions for
what's configured in the VIF's kernel, because it is independent of what's
expected from OVN ACLs. In addition, egress rules are expected to drop
invalid packets sent by the VIF (regardless of how the VIF's kernel is
configured).

However, I am not against this patch, but just want to double confirm. I
think this deserves a description in NEWS if we do so.

Thanks,
Han

>
> Signed-off-by: Numan Siddique 
> ---
>  northd/ovn-northd.c | 24 
>  tests/ovn-northd.at | 24 
>  2 files changed, 24 insertions(+), 24 deletions(-)
>
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index dcdb777a2..e30fb532c 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -5617,10 +5617,10 @@ build_acls(struct ovn_datapath *od, struct hmap
*lflows,
>   *
>   * This is enforced at a higher priority than ACLs can be
defined. */
>  ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
> -  "ct.inv || (ct.est && ct.rpl && ct_label.blocked
== 1)",
> +  "ct.est && ct.rpl && ct_label.blocked == 1",
>"drop;");
>  ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
> -  "ct.inv || (ct.est && ct.rpl && ct_label.blocked
== 1)",
> +  "ct.est && ct.rpl && ct_label.blocked == 1",
>"drop;");
>
>  /* Ingress and Egress ACL Table (Priority 65535).
> @@ -5633,12 +5633,12 @@ build_acls(struct ovn_datapath *od, struct hmap
*lflows,
>   *
>   * This is enforced at a higher priority than ACLs can be
defined. */
>  ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
> -  "ct.est && !ct.rel && !ct.new && !ct.inv "
> -  "&& ct.rpl && ct_label.blocked == 0",
> +  "ct.est && !ct.rel && !ct.new && "
> +  "ct.rpl && ct_label.blocked == 0",
>"next;");
>  ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
> -  "ct.est && !ct.rel && !ct.new && !ct.inv "
> -  "&& ct.rpl && ct_label.blocked == 0",
> +  "ct.est && !ct.rel && !ct.new && "
> +  "ct.rpl && ct_label.blocked == 0",
>"next;");
>
>  /* Ingress and Egress ACL Table (Priority 65535).
> @@ -5653,12 +5653,12 @@ build_acls(struct ovn_datapath *od, struct hmap
*lflows,
>   * related traffic such as an ICMP Port Unreachable through
>   * that's generated from a non-listening UDP port.  */
>  ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
> -  "!ct.est && ct.rel && !ct.new && !ct.inv "
> -  "&& ct_label.blocked == 0",
> +  "!ct.est && ct.rel && !ct.new && "
> +  "ct_label.blocked == 0",
>"next;");
>  ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
> -  "!ct.est && ct.rel && !ct.new && !ct.inv "
> -  "&& ct_label.blocked == 0",
> +  "!ct.est && ct.rel && !ct.new && "
> +  "ct_label.blocked == 0",
>"next;");
>
>  /* Ingress and Egress ACL Table (Priority 65535).
> @@ -5846,11 +5846,11 @@ build_lb(struct ovn_datapath *od, struct hmap
*lflows)
>   *
>   * Send established traffic through conntrack for just NAT. */
>  ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX - 1,
> -  "ct.est && !ct.rel && !ct.new && !ct.inv && 

Re: [ovs-dev] [PATCH V2 13/14] netdev-offload-dpdk: Support vports flows offload

2021-02-24 Thread Sriharsha Basavapatna via dev
On Wed, Feb 24, 2021 at 4:50 PM Eli Britstein  wrote:
>
>
> On 2/24/2021 12:37 PM, Sriharsha Basavapatna wrote:
> > On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
> >> Vports are virtual, OVS only logical devices, so rte_flows cannot be
> >> applied as is on them. Instead, apply the rules the physical port from
> >> which the packet has arrived, provided by orig_in_port field.
> >>
> >> Signed-off-by: Eli Britstein 
> >> Reviewed-by: Gaetan Rivet 
> >> ---
> >>   lib/netdev-offload-dpdk.c | 169 ++
> >>   1 file changed, 137 insertions(+), 32 deletions(-)
> >>
> >> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >> index f6e668bff..ad47d717f 100644
> >> --- a/lib/netdev-offload-dpdk.c
> >> +++ b/lib/netdev-offload-dpdk.c
> >> @@ -62,6 +62,7 @@ struct ufid_to_rte_flow_data {
> >>   struct rte_flow *rte_flow;
> >>   bool actions_offloaded;
> >>   struct dpif_flow_stats stats;
> >> +struct netdev *physdev;
> >>   };
> >>
> >>   /* Find rte_flow with @ufid. */
> >> @@ -87,7 +88,8 @@ ufid_to_rte_flow_data_find(const ovs_u128 *ufid, bool 
> >> warn)
> >>
> >>   static inline struct ufid_to_rte_flow_data *
> >>   ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct netdev *netdev,
> >> -   struct rte_flow *rte_flow, bool 
> >> actions_offloaded)
> >> +   struct netdev *vport, struct rte_flow 
> >> *rte_flow,
> >> +   bool actions_offloaded)
> >>   {
> >>   size_t hash = hash_bytes(ufid, sizeof *ufid, 0);
> >>   struct ufid_to_rte_flow_data *data = xzalloc(sizeof *data);
> >> @@ -105,7 +107,8 @@ ufid_to_rte_flow_associate(const ovs_u128 *ufid, 
> >> struct netdev *netdev,
> >>   }
> >>
> >>   data->ufid = *ufid;
> >> -data->netdev = netdev_ref(netdev);
> >> +data->netdev = vport ? netdev_ref(vport) : netdev_ref(netdev);
> >> +data->physdev = netdev_ref(netdev);
> > For non-tunnel offloads, we end up getting two references to the same
> > 'netdev'; can we avoid this ? That is, get a reference to physdev only
> > for the vport case.
> I know. This is on purpose. It simplifies other places, for example
> query, to always use physdev, and always close both without any
> additional logic there.

Ok,  please add this as an inline comment so we know why it is done
this way. Also, you missed my last comment in this patch (see
VXLAN_DECAP action below).


> >>   data->rte_flow = rte_flow;
> >>   data->actions_offloaded = actions_offloaded;
> >>
> >> @@ -122,6 +125,7 @@ ufid_to_rte_flow_disassociate(struct 
> >> ufid_to_rte_flow_data *data)
> >>   cmap_remove(_to_rte_flow,
> >>   CONST_CAST(struct cmap_node *, >node), hash);
> >>   netdev_close(data->netdev);
> >> +netdev_close(data->physdev);
> > Similar comment, release reference to physdev only if we got a
> > reference earlier (i.e., physdev should be non-null only when netdev
> > is a vport).
> Right. As it is written this way, no need for any additional logic here.
> >>   ovsrcu_postpone(free, data);
> >>   }
> >>
> >> @@ -134,6 +138,8 @@ struct flow_patterns {
> >>   struct rte_flow_item *items;
> >>   int cnt;
> >>   int current_max;
> >> +uint32_t num_of_tnl_items;
> > change to --> num_pmd_tnl_items
> OK.
> >> +struct ds s_tnl;
> >>   };
> >>
> >>   struct flow_actions {
> >> @@ -154,16 +160,20 @@ struct flow_actions {
> >>   static void
> >>   dump_flow_attr(struct ds *s, struct ds *s_extra,
> >>  const struct rte_flow_attr *attr,
> >> +   struct flow_patterns *flow_patterns,
> >>  struct flow_actions *flow_actions)
> >>   {
> >>   if (flow_actions->num_of_tnl_actions) {
> >>   ds_clone(s_extra, _actions->s_tnl);
> >> +} else if (flow_patterns->num_of_tnl_items) {
> >> +ds_clone(s_extra, _patterns->s_tnl);
> >>   }
> >> -ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s",
> >> +ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s%s",
> >> attr->ingress  ? "ingress " : "",
> >> attr->egress   ? "egress " : "", attr->priority, 
> >> attr->group,
> >> attr->transfer ? "transfer " : "",
> >> -  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : 
> >> "");
> >> +  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : "",
> >> +  flow_patterns->num_of_tnl_items ? "tunnel_match 1 " : 
> >> "");
> >>   }
> >>
> >>   /* Adds one pattern item 'field' with the 'mask' to dynamic string 's' 
> >> using
> >> @@ -177,9 +187,18 @@ dump_flow_attr(struct ds *s, struct ds *s_extra,
> >>   }
> >>
> >>   static void
> >> -dump_flow_pattern(struct ds *s, const struct rte_flow_item *item)
> >> +dump_flow_pattern(struct ds *s,
> >> +  struct flow_patterns *flow_patterns,
> >> +  int pattern_index)
> >>   {

Re: [ovs-dev] [PATCH V2 11/14] netdev-offload-dpdk: Refactor offload rule creation

2021-02-24 Thread Sriharsha Basavapatna via dev
On Wed, Feb 24, 2021 at 3:47 PM Eli Britstein  wrote:
>
>
> On 2/24/2021 11:55 AM, Sriharsha Basavapatna wrote:
> > On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
> >> Refactor offload rule creation as a pre-step towards tunnel matching
> >> that depend on the netdev.
> >>
> >> Signed-off-by: Eli Britstein 
> >> Reviewed-by: Gaetan Rivet 
> >> ---
> >>   lib/netdev-offload-dpdk.c | 106 --
> >>   1 file changed, 45 insertions(+), 61 deletions(-)
> >>
> >> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >> index 493cc9159..f6e668bff 100644
> >> --- a/lib/netdev-offload-dpdk.c
> >> +++ b/lib/netdev-offload-dpdk.c
> >> @@ -1009,30 +1009,6 @@ add_flow_mark_rss_actions(struct flow_actions 
> >> *actions,
> >>   add_flow_action(actions, RTE_FLOW_ACTION_TYPE_END, NULL);
> >>   }
> >>
> >> -static struct rte_flow *
> >> -netdev_offload_dpdk_mark_rss(struct flow_patterns *patterns,
> >> - struct netdev *netdev,
> >> - uint32_t flow_mark)
> >> -{
> >> -struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> >> -const struct rte_flow_attr flow_attr = {
> >> -.group = 0,
> >> -.priority = 0,
> >> -.ingress = 1,
> >> -.egress = 0
> >> -};
> >> -struct rte_flow_error error;
> >> -struct rte_flow *flow;
> >> -
> >> -add_flow_mark_rss_actions(, flow_mark, netdev);
> >> -
> >> -flow = netdev_offload_dpdk_flow_create(netdev, _attr, 
> >> patterns->items,
> >> -   , );
> >> -
> >> -free_flow_actions();
> >> -return flow;
> >> -}
> >> -
> >>   static void
> >>   add_count_action(struct flow_actions *actions)
> >>   {
> >> @@ -1509,27 +1485,48 @@ parse_flow_actions(struct netdev *netdev,
> >>   return 0;
> >>   }
> >>
> >> -static struct rte_flow *
> >> -netdev_offload_dpdk_actions(struct netdev *netdev,
> >> -struct flow_patterns *patterns,
> >> -struct nlattr *nl_actions,
> >> -size_t actions_len)
> >> +static struct ufid_to_rte_flow_data *
> >> +create_netdev_offload(struct netdev *netdev,
> >> +  const ovs_u128 *ufid,
> >> +  struct flow_patterns *flow_patterns,
> >> +  struct flow_actions *flow_actions,
> >> +  bool enable_full,
> >> +  uint32_t flow_mark)
> >>   {
> >> -const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 
> >> };
> >> -struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> >> +struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1, };
> >> +struct flow_actions rss_actions = { .actions = NULL, .cnt = 0 };
> >> +struct rte_flow_item *items = flow_patterns->items;
> >> +struct ufid_to_rte_flow_data *flow_data = NULL;
> >> +bool actions_offloaded = true;
> >>   struct rte_flow *flow = NULL;
> >>   struct rte_flow_error error;
> >> -int ret;
> >>
> >> -ret = parse_flow_actions(netdev, , nl_actions, actions_len);
> >> -if (ret) {
> >> -goto out;
> >> +if (enable_full) {
> >> +flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
> >> +   flow_actions, );
> >>   }
> >> -flow = netdev_offload_dpdk_flow_create(netdev, _attr, 
> >> patterns->items,
> >> -   , );
> >> -out:
> >> -free_flow_actions();
> >> -return flow;
> >> +
> >> +if (!flow) {
> >> +/* If we failed to offload the rule actions fallback to MARK+RSS
> >> + * actions.
> >> + */
> > A  debug message might be useful here, when we fallback to mark/rss action ?
> We can, but this is just a refactor commit, and this fallback is not
> new. Add it anyway?

I think it'd be useful to add a debug message here and also in
parse_flow_actions() to indicate that action offloading failed.

> >> +actions_offloaded = false;
> >> +flow_attr.transfer = 0;
> >> +add_flow_mark_rss_actions(_actions, flow_mark, netdev);
> >> +flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
> >> +   _actions, );
> >> +}
> >> +
> >> +if (flow) {
> >> +flow_data = ufid_to_rte_flow_associate(ufid, netdev, flow,
> >> +   actions_offloaded);
> >> +VLOG_DBG("%s: installed flow %p by ufid "UUID_FMT,
> >> + netdev_get_name(netdev), flow,
> >> + UUID_ARGS((struct uuid *) ufid));
> >> +}
> >> +
> >> +free_flow_actions(_actions);
> > This free is needed only when we fallback to mark/rss offload, not 
> > otherwise.
> OK.
> >
> >> +return flow_data;
> >>   }
> >>
> >>   static struct ufid_to_rte_flow_data *
> >> @@ -1541,9 +1538,9 @@ 

Re: [ovs-dev] [PATCH V2 10/14] netdev-offload-dpdk: Support tunnel pop action

2021-02-24 Thread Sriharsha Basavapatna via dev
On Wed, Feb 24, 2021 at 1:50 PM Eli Britstein  wrote:
>
>
> On 2/24/2021 9:52 AM, Sriharsha Basavapatna wrote:
> > On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
> >> Support tunnel pop action.
> >>
> >> Signed-off-by: Eli Britstein 
> >> Reviewed-by: Gaetan Rivet 
> >> ---
> >>   Documentation/howto/dpdk.rst |   1 +
> >>   NEWS |   1 +
> >>   lib/netdev-offload-dpdk.c| 173 ---
> >>   3 files changed, 160 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
> >> index f0d45e47b..4918d80f3 100644
> >> --- a/Documentation/howto/dpdk.rst
> >> +++ b/Documentation/howto/dpdk.rst
> >> @@ -398,6 +398,7 @@ Supported actions for hardware offload are:
> >>   - VLAN Push/Pop (push_vlan/pop_vlan).
> >>   - Modification of IPv6 (set_field:->ipv6_src/ipv6_dst/mod_nw_ttl).
> >>   - Clone/output (tnl_push and output) for encapsulating over a tunnel.
> >> +- Tunnel pop, for changing from a physical port to a vport.
> >>
> >>   Further Reading
> >>   ---
> >> diff --git a/NEWS b/NEWS
> >> index a7bffce97..6850d5621 100644
> >> --- a/NEWS
> >> +++ b/NEWS
> >> @@ -26,6 +26,7 @@ v2.15.0 - xx xxx 
> >>  - DPDK:
> >>* Removed support for vhost-user dequeue zero-copy.
> >>* Add support for DPDK 20.11.
> >> + * Add hardware offload support for tunnel pop action (experimental).
> >>  - Userspace datapath:
> >>* Add the 'pmd' option to "ovs-appctl dpctl/dump-flows", which
> >>  restricts a flow dump to a single PMD thread if set.
> >> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >> index 78f866080..493cc9159 100644
> >> --- a/lib/netdev-offload-dpdk.c
> >> +++ b/lib/netdev-offload-dpdk.c
> >> @@ -140,15 +140,30 @@ struct flow_actions {
> >>   struct rte_flow_action *actions;
> >>   int cnt;
> >>   int current_max;
> >> +struct netdev *tnl_netdev;
> >> +/* tnl_actions is the opaque array of actions returned by the PMD. */
> >> +struct rte_flow_action *tnl_actions;
> > Why is this an opaque array ? Since it is struct rte_flow_action, OVS
> > knows the type and members. Is it opaque because the value of
> > rte_flow_action_type member is unknown to OVS ? Is it a private action
> > type and if so how does the PMD ensure that it doesn't conflict with
> > standard action types ?
>
> True it is not used by OVS, but that's not why it opaque. Although it is
> struct rte_flow_action array, the PMD may use its own private actions,
> not defined in rte_flow.h, thus not known to the application.
>
> The details of this array is under the PMD's responsibility.

So this means the PMD has to pick a range of private action values
that are not defined in rte_flow.h,  but what if later a new action
type with the same value is added to rte_flow.h ?
The other question is if the PMD could use one of the existing action
types in rte_flow.h [i.e, to avoid defining its own private action
types] and return it in the opaque action array ?

>
> >
> >> +uint32_t num_of_tnl_actions;
> >> +/* tnl_actions_pos is where the tunnel actions starts within the 
> >> 'actions'
> >> + * field.
> >> + */
> >> +int tnl_actions_pos;
> > Names should indicate that they are private or pmd specific ?
> >
> > tnl_actions --> tnl_private_actions or tnl_pmd_actions
> > num_of_tnl_actions --> num_private_tnl_actions or num_pmd_tnl_actions
> > tnl_actions_pos --> tnl_private_actions_pos or tnl_pmd_actions_pos
> OK. _pmd_
> >
> >> +struct ds s_tnl;
> >>   };
> >>
> >>   static void
> >> -dump_flow_attr(struct ds *s, const struct rte_flow_attr *attr)
> >> +dump_flow_attr(struct ds *s, struct ds *s_extra,
> >> +   const struct rte_flow_attr *attr,
> >> +   struct flow_actions *flow_actions)
> >>   {
> >> -ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s",
> >> +if (flow_actions->num_of_tnl_actions) {
> >> +ds_clone(s_extra, _actions->s_tnl);
> >> +}
> >> +ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s",
> >> attr->ingress  ? "ingress " : "",
> >> attr->egress   ? "egress " : "", attr->priority, 
> >> attr->group,
> >> -  attr->transfer ? "transfer " : "");
> >> +  attr->transfer ? "transfer " : "",
> >> +  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : 
> >> "");
> >>   }
> >>
> >>   /* Adds one pattern item 'field' with the 'mask' to dynamic string 's' 
> >> using
> >> @@ -395,9 +410,19 @@ dump_vxlan_encap(struct ds *s, const struct 
> >> rte_flow_item *items)
> >>
> >>   static void
> >>   dump_flow_action(struct ds *s, struct ds *s_extra,
> >> - const struct rte_flow_action *actions)
> >> + struct flow_actions *flow_actions, int act_index)
> >>   {
> >> -if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
> >> +const struct 

Re: [ovs-dev] [PATCH V2 05/14] netdev-offload-dpdk: Implement HW miss packet recover for vport

2021-02-24 Thread Sriharsha Basavapatna via dev
On Tue, Feb 23, 2021 at 7:24 PM Eli Britstein  wrote:
>
>
> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
> > On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
> >> A miss in virtual port offloads means the flow with tnl_pop was
> >> offloaded, but not the following one. Recover the state and continue
> >> with SW processing.
> > Relates to my comment on Patch-1; please explain what is meant by
> > recovering the packet state.
> See my response there.

Responded to your comment.
> >
> >> Signed-off-by: Eli Britstein 
> >> Reviewed-by: Gaetan Rivet 
> >> ---
> >>   lib/netdev-offload-dpdk.c | 95 +++
> >>   1 file changed, 95 insertions(+)
> >>
> >> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >> index 8cc90d0f1..21aa26b42 100644
> >> --- a/lib/netdev-offload-dpdk.c
> >> +++ b/lib/netdev-offload-dpdk.c
> >> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct 
> >> netdev_flow_dump *dump)
> >>   return 0;
> >>   }
> >>
> >> +static struct netdev *
> >> +get_vport_netdev(const char *dpif_type,
> >> + struct rte_flow_tunnel *tunnel,
> >> + odp_port_t *odp_port)
> >> +{
> >> +const struct netdev_tunnel_config *tnl_cfg;
> >> +struct netdev_flow_dump **netdev_dumps;
> >> +struct netdev *vport = NULL;
> >> +bool found = false;
> >> +int num_ports = 0;
> >> +int err;
> >> +int i;
> >> +
> >> +netdev_dumps = netdev_ports_flow_dump_create(dpif_type, _ports, 
> >> false);
> > This relates to my comment in Patch-3; flow_dump_create() in this
> > context is very confusing since we are not really dumping flows. We
> > might as well walk the list of ports/netdevs looking for a specific
> > netdev, just like other netdev_ports_*() routines in netdev-offload.c;
> > may be add a new function in netdev-offload.c:
> As in the response there. I used an existing API, not introducing new ones.

Like I said, this can be confusing that while trying to offload a
flow, we invoke flow-dump API. And the flow-dump API implementation
does nothing, apart from getting a reference to a netdev. IMO, it'd be
better to add a new API to keep it simple and clear, as shown below.
> >
> > netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):/*
> > example: tunnel_type == "vxlan", tp_dst = 4789 */
> >
> > HMAP_FOR_EACH (data, portno_node, _to_netdev) {
> >  if (netdev_get_dpif_type(data->netdev) == dpif_type &&
> >  netdev_get_type(data->netdev) == tunnel_type) {
> >  tnl_cfg = netdev_get_tunnel_config(data->netdev);
> >   if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
> >   
> >   
> >  }
> >  }
> >
> >> +for (i = 0; i < num_ports; i++) {
> >> +if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
> >> +!strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
> >> +tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
> >> +if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
> >> +found = true;
> >> +vport = netdev_dumps[i]->netdev;
> >> +netdev_ref(vport);
> >> +*odp_port = netdev_dumps[i]->port;
> >> +}
> >> +}
> >> +err = netdev_flow_dump_destroy(netdev_dumps[i]);
> >> +if (err != 0 && err != EOPNOTSUPP) {
> >> +VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
> >> +}
> >> +}
> >> +return vport;
> >> +}
> >> +
> >> +static int
> >> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
> >> +   struct dp_packet *packet)
> >> +{
> >> +struct rte_flow_restore_info rte_restore_info;
> >> +struct rte_flow_tunnel *rte_tnl;
> >> +struct rte_flow_error error;
> >> +struct netdev *vport_netdev;
> >> +struct pkt_metadata *md;
> >> +struct flow_tnl *md_tnl;
> >> +odp_port_t vport_odp;
> >> +
> >> +if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
> >> +  _restore_info, )) 
> >> {
> >> +/* This function is called for every packet, and in most cases 
> >> there
> >> + * will be no restore info from the HW, thus error is expected.
> > Right now this API (get_restore_info) is needed only in the case of
> > tunnel offloads; is there some way we could avoid calling this for
> > every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
> No other way I can think of. Suggestions?
Suggested in Patch-6 (code refactoring in dfc_processing())

> > What is expected from the PMD using the given 'packet' argument ? This
> > is not clear from the API description in rte_flow.h.
>
> The PMD is expected to use any data on the packet in order to provide
> the HW info.
>
> mlx5 PMD uses mark to do it, but each PMD is free to use whatever
> implementation suitable for its HW.

Re: [ovs-dev] [PATCH V2 03/14] netdev-offload-dpdk: Implement flow dump create/destroy APIs

2021-02-24 Thread Sriharsha Basavapatna via dev
On Tue, Feb 23, 2021 at 6:55 PM Eli Britstein  wrote:
>
>
> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
>
> On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
>
> When offloading vports, we don't configure rte_flow on the vport itself,
> as it is not a physical dpdk port, but rather on uplinks. Implement
> those APIs as a pre-step to enable iterate over the ports.
>
> We don't need these flow_dump APIs, since we are not really dumping
> any flows here and also orig_in_port is provided to the offload layer
> (Patch 12).
>
> We still need them to traverse the ports to get the vxlan netdev in case of a 
> miss. See patch #5:
>
> 37056941f netdev-offload-dpdk: Implement HW miss packet recover for vport
>
> There, see get_vport_netdev.
>
> The naming is because this is "flow_api" and the implementation is of an 
> existing API rather than introducing new one(s).

I've suggested a new API (see my comments in Patch 5).
>
> Thanks,
> -Harsha
>
>

-- 
This electronic communication and the information and any files transmitted 
with it, or attached to it, are confidential and are intended solely for 
the use of the individual or entity to whom it is addressed and may contain 
information that is confidential, legally privileged, protected by privacy 
laws, or otherwise restricted from disclosure to anyone else. If you are 
not the intended recipient or the person responsible for delivering the 
e-mail to the intended recipient, you are hereby notified that any use, 
copying, distributing, dissemination, forwarding, printing, or copying of 
this e-mail is strictly prohibited. If you received this e-mail in error, 
please return the e-mail to the sender, delete it from your computer, and 
destroy any printed copy of it.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V2 01/14] netdev-offload: Add HW miss packet state recover API

2021-02-24 Thread Sriharsha Basavapatna via dev
On Tue, Feb 23, 2021 at 6:51 PM Eli Britstein  wrote:
>
>
> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
>
> On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
>
> When the HW offload involves multiple flows, like in tunnel decap path,
> it is possible that not all flows in the path are offloaded, resulting
> in partial processing in HW. In order to proceed with rest of the
> processing in SW, the packet state has to be recovered as if it was
> processed in SW from the beginning. Add API for that.
>
> Can you be more specific/clear on what this API does ? What specific
> packet state is this referring to and what is meant by recovering the
> state here ? For example, if recovering  the packet state means to
> check if the packet is encapsulated and to pop the tunnel header, then
> it would make it clear to just state that.
> Thanks,
> -Harsha
>
> The state refers to the state provided by the HW. This patch introduces a 
> generic API to support all cases.
>
> The case to pop in SW in case the info provided is that the packet is 
> encapsulated is a private case.

Private case ? IMO, the API/interface should provide sufficient
information on what is meant by the state and recovery for each use
case, starting with tunnel encapsulated packets for now.

>
>
>

-- 
This electronic communication and the information and any files transmitted 
with it, or attached to it, are confidential and are intended solely for 
the use of the individual or entity to whom it is addressed and may contain 
information that is confidential, legally privileged, protected by privacy 
laws, or otherwise restricted from disclosure to anyone else. If you are 
not the intended recipient or the person responsible for delivering the 
e-mail to the intended recipient, you are hereby notified that any use, 
copying, distributing, dissemination, forwarding, printing, or copying of 
this e-mail is strictly prohibited. If you received this e-mail in error, 
please return the e-mail to the sender, delete it from your computer, and 
destroy any printed copy of it.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] raft: Add 'stop-raft-rpc' failure test command.

2021-02-24 Thread Han Zhou
On Tue, Feb 23, 2021 at 5:18 AM Ilya Maximets  wrote:
>
> This command will stop sending and receiving any RAFT-related
> traffic or accepting new connections.  Useful to simulate
> network problems between cluster members.
>
> There is no unit test that uses it yet, but it's convenient for
> manual testing.
>
> Signed-off-by: Ilya Maximets 
> ---
>  ovsdb/raft.c | 31 +--
>  1 file changed, 21 insertions(+), 10 deletions(-)
>
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index 192f7f0a9..d2ff643b2 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -75,7 +75,8 @@ enum raft_failure_test {
>  FT_CRASH_AFTER_SEND_EXEC_REQ,
>  FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
>  FT_DELAY_ELECTION,
> -FT_DONT_SEND_VOTE_REQUEST
> +FT_DONT_SEND_VOTE_REQUEST,
> +FT_STOP_RAFT_RPC,
>  };
>  static enum raft_failure_test failure_test;
>
> @@ -1474,6 +1475,10 @@ raft_send_add_server_request(struct raft *raft,
struct raft_conn *conn)
>  static void
>  raft_conn_run(struct raft *raft, struct raft_conn *conn)
>  {
> +if (failure_test == FT_STOP_RAFT_RPC) {
> +return;
> +}
> +
>  jsonrpc_session_run(conn->js);
>
>  unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js);
> @@ -1794,7 +1799,8 @@ static void
>  raft_open_conn(struct raft *raft, const char *address, const struct uuid
*sid)
>  {
>  if (strcmp(address, raft->local_address)
> -&& !raft_find_conn_by_address(raft, address)) {
> +&& !raft_find_conn_by_address(raft, address)
> +&& failure_test != FT_STOP_RAFT_RPC) {
>  raft_add_conn(raft, jsonrpc_session_open(address, true), sid,
false);
>  }
>  }
> @@ -1870,7 +1876,7 @@ raft_run(struct raft *raft)
>  free(paddr);
>  }
>
> -if (raft->listener) {
> +if (raft->listener && failure_test != FT_STOP_RAFT_RPC) {
>  struct stream *stream;
>  int error = pstream_accept(raft->listener, );
>  if (!error) {
> @@ -1995,7 +2001,7 @@ raft_run(struct raft *raft)
>  static void
>  raft_wait_session(struct jsonrpc_session *js)
>  {
> -if (js) {
> +if (js && failure_test != FT_STOP_RAFT_RPC) {
>  jsonrpc_session_wait(js);
>  jsonrpc_session_recv_wait(js);
>  }
> @@ -2012,10 +2018,12 @@ raft_wait(struct raft *raft)
>
>  raft_waiters_wait(raft);
>
> -if (raft->listener) {
> -pstream_wait(raft->listener);
> -} else {
> -poll_timer_wait_until(raft->listen_backoff);
> +if (failure_test != FT_STOP_RAFT_RPC) {
> +if (raft->listener) {
> +pstream_wait(raft->listener);
> +} else {
> +poll_timer_wait_until(raft->listen_backoff);
> +}
>  }
>
>  struct raft_conn *conn;
> @@ -4361,8 +4369,9 @@ raft_send_to_conn_at(struct raft *raft, const union
raft_rpc *rpc,
>   struct raft_conn *conn, int line_number)
>  {
>  log_rpc(rpc, "-->", conn, line_number);
> -return !jsonrpc_session_send(
> -conn->js, raft_rpc_to_jsonrpc(>cid, >sid, rpc));
> +return failure_test == FT_STOP_RAFT_RPC
> +   || !jsonrpc_session_send(
> +  conn->js, raft_rpc_to_jsonrpc(>cid, >sid,
rpc));
>  }
>
>  static bool
> @@ -4856,6 +4865,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn
OVS_UNUSED,
>  }
>  } else if (!strcmp(test, "dont-send-vote-request")) {
>  failure_test = FT_DONT_SEND_VOTE_REQUEST;
> +} else if (!strcmp(test, "stop-raft-rpc")) {
> +failure_test = FT_STOP_RAFT_RPC;
>  } else if (!strcmp(test, "clear")) {
>  failure_test = FT_NO_TEST;
>  unixctl_command_reply(conn, "test dismissed");
> --
> 2.26.2
>

Acked-by: Han Zhou 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] raft: Report disconnected in cluster/status if candidate retries election.

2021-02-24 Thread Han Zhou
On Tue, Feb 23, 2021 at 5:16 AM Ilya Maximets  wrote:
>
> If election times out for a server in 'candidate' role it sets
> 'candidate_retrying' flag that notifies that storage is disconnected
> and client should re-connect.  However, cluster/status command
> reports 'Status: cluster member' and that is misleading.
> Reporting "disconnected from the cluster (election timeout)" instead.
>
> Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered
databases.")

candidate_retrying flag was introduced in a patch later than the above one
:)

Acked-by: Han Zhou 

> Signed-off-by: Ilya Maximets 
> ---
>  ovsdb/raft.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index 0fb1420fb..192f7f0a9 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -4498,6 +4498,8 @@ raft_unixctl_status(struct unixctl_conn *conn,
>: raft->leaving ? "leaving cluster"
>: raft->left ? "left cluster"
>: raft->failed ? "failed"
> +  : raft->candidate_retrying
> +  ? "disconnected from the cluster (election
timeout)"
>: "cluster member");
>  if (raft->joining) {
>  ds_put_format(, "Remotes for joining:");
> --
> 2.26.2
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] raft: Reintroduce jsonrpc inactivity probes.

2021-02-24 Thread Han Zhou
On Tue, Feb 23, 2021 at 5:15 AM Ilya Maximets  wrote:
>
> It's not enough to just have heartbeats.
>
> RAFT heartbeats are unidirectional, i.e. leader sends them to followers
> but not the other way around.  Missing heartbeats provokes followers to
> start election, but if leader will not receive any replies it will not
> do anything while there is a quorum, i.e. there are enough other
> servers to make decisions.
>
> This leads to situation that while TCP connection is established,
> leader will continue to blindly send messages to it.  In our case this
> leads to growing send backlog.  Connection will be terminated
> eventually due to excessive send backlog, but this this might take a
> lot of time and wasted process memory.  At the same time 'candidate'
> will continue to send vote requests to the dead connection on its
> side.
>
> To fix that we need to reintroduce inactivity probes that will drop
> connection if there was no incoming traffic for a long time and remote
> server doesn't reply to the "echo" request.  Probe interval might be
> chosen based on an election timeout to avoid issues described in commit
> db5a066c17bd.
>
> Fixes: db5a066c17bd ("raft: Disable RAFT jsonrpc inactivity probe.")
> Signed-off-by: Ilya Maximets 
> ---
>  ovsdb/raft.c | 32 +++-
>  1 file changed, 31 insertions(+), 1 deletion(-)
>
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index ea91d1fdb..0fb1420fb 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -940,6 +940,34 @@ raft_reset_ping_timer(struct raft *raft)
>  raft->ping_timeout = time_msec() + raft->election_timer / 3;
>  }
>
> +static void
> +raft_conn_update_probe_interval(struct raft *raft, struct raft_conn
*r_conn)
> +{
> +/* Inactivity probe will be sent if connection will remain idle for
the
> + * time of an election timeout.  Connection will be dropped if
inactivity
> + * will last twice that time.
> + *
> + * It's not enough to just have heartbeats if connection is still
> + * established, but no packets received from the other side.  Without
> + * inactivity probe follower will just try to initiate election
> + * indefinitely staying in 'candidate' role.  And the leader will
continue
> + * to send heartbeats to the dead connection thinking that remote
server
> + * is still part of the cluster. */
> +int probe_interval = raft->election_timer + ELECTION_RANGE_MSEC;
> +
> +jsonrpc_session_set_probe_interval(r_conn->js, probe_interval);
> +}
> +
> +static void
> +raft_update_probe_intervals(struct raft *raft)
> +{
> +struct raft_conn *r_conn;
> +
> +LIST_FOR_EACH (r_conn, list_node, >conns) {
> +raft_conn_update_probe_interval(raft, r_conn);
> +}
> +}
> +
>  static void
>  raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
>const struct uuid *sid, bool incoming)
> @@ -954,7 +982,7 @@ raft_add_conn(struct raft *raft, struct
jsonrpc_session *js,
>>sid);
>  conn->incoming = incoming;
>  conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
> -jsonrpc_session_set_probe_interval(js, 0);
> +raft_conn_update_probe_interval(raft, conn);
>  jsonrpc_session_set_backlog_threshold(js,
raft->conn_backlog_max_n_msgs,
>
 raft->conn_backlog_max_n_bytes);
>  }
> @@ -2804,6 +2832,7 @@ raft_update_commit_index(struct raft *raft,
uint64_t new_commit_index)
>raft->election_timer, e->election_timer);
>  raft->election_timer = e->election_timer;
>  raft->election_timer_new = 0;
> +raft_update_probe_intervals(raft);
>  }
>  if (e->servers) {
>  /* raft_run_reconfigure() can write a new Raft entry,
which can
> @@ -2820,6 +2849,7 @@ raft_update_commit_index(struct raft *raft,
uint64_t new_commit_index)
>  VLOG_INFO("Election timer changed from %"PRIu64" to
%"PRIu64,
>raft->election_timer, e->election_timer);
>  raft->election_timer = e->election_timer;
> +raft_update_probe_intervals(raft);
>  }
>  }
>  /* Check if any pending command can be completed, and complete
it.
> --
> 2.26.2
>

Thanks Ilya.
Acked-by: Han Zhou 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v10.1 0/6] Add DDlog implementation of ovn-northd

2021-02-24 Thread Ben Pfaff
On Wed, Feb 24, 2021 at 12:40:17AM +0530, Numan Siddique wrote:
> On Tue, Feb 23, 2021 at 11:16 PM Ben Pfaff  wrote:
> >
> > On Tue, Feb 23, 2021 at 07:38:12PM +0530, Numan Siddique wrote:
> > > On Tue, Feb 23, 2021 at 4:10 PM Numan Siddique  wrote:
> > > >
> > > > On Fri, Feb 19, 2021 at 4:10 AM Ben Pfaff  wrote:
> > > > >
> > > > > This passed in the ovsrobot CI:
> > > > > https://github.com/ovsrobot/ovn/actions/runs/579307688
> > > > >
> > > > > I think that we have arrived at consensus to push this to ovn master
> > > > > after branching happens tomorrow.  Please, let me know if I
> > > > > misunderstood!
> > > >
> > > > I think understanding is correct.  I think Mark needs to create the 
> > > > branch.
> > >
> > > Hi Ben,
> > >
> > > I am trying to compile with ddlog configured, but the compilation is
> > > failing for me
> > > for some reason. Can you please help me with this ?
> >
> > Oops.  I made a dumb typo.  It didn't show up in my testing because I
> > guess a -j12 build typically built OVN_Northbound.dl and
> > OVN_Southbound.dl early enough.  Here's the fix:
> >
> > diff --git a/northd/automake.mk b/northd/automake.mk
> > index 17c394c82082..d64d1d197c77 100644
> > --- a/northd/automake.mk
> > +++ b/northd/automake.mk
> > @@ -29,7 +29,7 @@ ddlog_sources = \
> > northd/ovn.dl \
> > northd/ovn.rs \
> > northd/helpers.dl
> > -ddlog_nodist_source = \
> > +ddlog_nodist_sources = \
> > northd/OVN_Northbound.dl \
> > northd/OVN_Southbound.dl
> 
> Thank you. It works.
> 
> For the whole series
> Acked-by: Numan Siddique 
> 
> Unfortunately because of a couple of recent commits [1], you  have to
> rebase the code and you may have to add some more code in northd-ddlog to
> sync up with c northd.

Thanks so much.  I rebased, added the missing features, tested, fixed my
bugs, and applied this series to OVN master.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] ovs-ctl: Allow recording hostname separately

2021-02-24 Thread Frode Nordahl
On Wed, Feb 24, 2021 at 5:36 PM Ilya Maximets  wrote:
>
> On 2/19/21 9:10 AM, Christian Ehrhardt wrote:
> > On Tue, Feb 16, 2021 at 5:32 PM Frode Nordahl
> >  wrote:
> >>
> >> ovs-ctl determines the system FQDN or hostname and records it in
> >> the `external-ids:hostname` field of the `Open-vSwitch` table on
> >> system startup.
> >>
> >> This value may be consumed by downstream software and having it
> >> unset or set to a incorrect value could lead to erratic behavior
> >> of a system.
> >>
> >> When a system is configured to use a Open vSwitch controlled
> >> datapath as its only network connection, the current ordering of
> >> events would always produce a unreliable hostname
> >>
> >> Reported-At: https://bugs.launchpad.net/bugs/1915829
> >> Signed-off-by: Frode Nordahl 
> >
> > I've looked at this for the Ubuntu problem that started Frodes work,
> > and while I can't give any authoritative Ack I can at least say
> >
> > Reviewed-by: Christian Ehrhardt 
> >
>
> Hi.  Reading the description and the bug on launchpad it seems that
> the issue is that hostname in database changes in unexpected way and
> this is somewhat similar to what the following patch tried to fix:
>   
> http://patchwork.ozlabs.org/project/openvswitch/patch/20200525152821.19838-1-dalva...@redhat.com/

Hello Ilya, and thank you for the review.

> Is it still a problem with above patch applied?

Yes, while the above patch is welcome and adds to the stability of the
identifier, it does not solve the problem this patch addresses.

What we see is that the data collected for the initial recording of
the hostname field is incorrect. This occurs because the information
might not be available at the time of initial Open vSwitch startup.

We support the use of Open vSwitch to manage the only connection a
computer has to a network, and to do that we start Open vSwitch early
as part of the network target (in systemd terms). At this point in
time there may be no network connectivity yet, depending on deployment
type initial system configuration may not yet be applied (i.e.
/etc/hostname may hold a default unconfigured value), FQDN hints may
not yet be recorded in /etc/hosts etc.

We seek to resolve this class of issues by breaking the hostname
recording part out into a separate oneshot service [0] run after the
network is online.

To help support that we need to be able to manage whether and when
`ovs-ctl` performs the recording of the hostname.

0: 
https://code.launchpad.net/~fnordahl/ubuntu/+source/openvswitch/+git/openvswitch/+merge/398174

-- 
Frode Nordahl

> Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix crash at ofproto_destroy

2021-02-24 Thread Guohongzhi (Russell Lab)
Refcount and RCU are not mutually exclusive.
IMO, the main reason for this problem is that the rule uses both refcount and 
rcu, while the ofproto uses only rcu, and the rule retains the pointer of the 
ofproto. More importantly, ofproto cannot guarantee a longer grace period than 
the rule.


-Original Message-
From: 贺鹏 [mailto:xnhp0...@gmail.com] 
Sent: 25/2/2021 11:14
To: Ilya Maximets 
Cc: hepeng.0320 ;  
; Guohongzhi (Russell Lab) 
Subject: Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix 
crash at ofproto_destroy

Ilya Maximets  于2021年2月25日周四 上午12:01写道:
>
> On 2/24/21 3:29 PM, Ilya Maximets wrote:
> > On 7/17/20 3:50 AM, hepeng.0320 wrote:
> >> From: Peng He 
> >>
> >> The call stack of rule_destroy_cb:
> >>
> >> remove_rules_postponed (one grace period)
> >> -> remove_rule_rcu
> >> -> remove_rule_rcu__
> >> -> ofproto_rule_unref -> ref count != 1
> >> -> ... more grace periods.
> >> -> rule_destroy_cb (> 2 grace periods)
> >> -> free
> >>
> >> Currently ofproto_destory waits only two grace periods, this means 
> >> when rule_destroy_cb is called, the ofproto might have been already 
> >> destroyed.
> >>
> >> This patch adds a refcount for ofproto to ensure ofproto exists 
> >> when it is needed to call free functions.
> >>
> >> This patch fixes the crashes found:
> >> Program terminated with signal SIGSEGV, Segmentation fault.
> >> 0  0x562a55169e49 in rule_destroy_cb (rule=0x562a598be2c0) at 
> >> ofproto/ofproto.c:2956
> >> 1  0x562a552623d6 in ovsrcu_call_postponed () at 
> >> lib/ovs-rcu.c:348
> >> 2  0x562a55262504 in ovsrcu_postpone_thread (arg= >> out>) at lib/ovs-rcu.c:364
> >> 3  0x562a55264aef in ovsthread_wrapper (aux_=) 
> >> at lib/ovs-thread.c:383
> >> 4  0x7febe715a4a4 in start_thread (arg=0x7febe4c1c700) at 
> >> pthread_create.c:456
> >> 5  0x7febe6990d0f in clone () at 
> >> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> >> (gdb) p rule->ofproto->ofproto_class
> >> $3 = (const struct ofproto_class *) 0x0
> >>
> >> Also:
> >> 0  ofproto_dpif_credit_table_stats (ofproto=0x5583583332b0, 
> >> table_id=0 '\000', n_matches=5, n_misses=0) at 
> >> ofproto/ofproto-dpif.c:4310
> >> 1  0x558354c68514 in xlate_push_stats_entry 
> >> (entry=entry@entry=0x7ff488031398, 
> >> stats=stats@entry=0x7ff49af30b60) at 
> >> ofproto/ofproto-dpif-xlate-cache.c:99
> >> 2  0x558354c686f3 in xlate_push_stats (xcache=, 
> >> stats=stats@entry=0x7ff49af30b60) at 
> >> ofproto/ofproto-dpif-xlate-cache.c:181
> >> 3  0x558354c5411a in revalidate_ukey 
> >> (udpif=udpif@entry=0x5583583baba0, ukey=ukey@entry=0x7ff47809f770, 
> >> stats=stats@entry=0x7ff49af32128, 
> >> odp_actions=odp_actions@entry=0x7ff49af30c50, 
> >> reval_seq=reval_seq@entry=275670456,
> >>recircs=recircs@entry=0x7ff49af30cd0) at 
> >> ofproto/ofproto-dpif-upcall.c:2292
> >> 4  0x558354c57cbc in revalidate (revalidator=) 
> >> at ofproto/ofproto-dpif-upcall.c:2681
> >> 5  0x558354c57f8e in udpif_revalidator (arg=0x5583583d2a90) at 
> >> ofproto/ofproto-dpif-upcall.c:934
> >> 6  0x558354d24aef in ovsthread_wrapper (aux_=) 
> >> at lib/ovs-thread.c:383
> >> 7  0x7ff4a479d4a4 in start_thread (arg=0x7ff49af35700) at 
> >> pthread_create.c:456
> >> 8  0x7ff4a3fd3d0f in clone () at 
> >> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> >> (gdb) p ofproto->up.ofproto_class
> >> $3 = (const struct ofproto_class *) 0x0
> >>
> >> Signed-off-by: Peng He 
> >> ---
>
> CC: Hongzhi Guo
>
> And I just remembered that there is a very similar patch that was 
> submitted to the mail-list several months betore this one.
> Ben started review, but didn't finish:
>
> http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.1
> 9884-1-guohongz...@huawei.com/
>
> Both patches has smilar issues, i.e. mixing RCU with refcounters and 
> not covering all the cases.
>
> There are at least 2 major places where ofproto needs to be refcounted:
> 1. rules
>1.a xlate caches?
> 2. ofproto groups
>
> IMO, mixing RCU with refcounts basically means that we do not have a 
> clear understanding on how it works and using RCU as a safety net.
> We shuld stop using RCU for orproto and just have refcounts.  Once 
> ofproto refcount reaches zero it should be immediately destroyed.
> All the places where we need to increase refcount needs to be tracked 
> down.

IMO, I think RCU and refcounts are not mutually exclusive. In kernel connntack 
system such as IPVS, sessions are used with both RCU and refcounts.

Essentially, the RCU implies a refcount if there are no places holding a 
pointer to the object other than hash tables (or other RCU enabled data 
structure), thus by removing it from hash tables, and wait a grace period, we 
can safely free the object because we are sure that no one holds a pointer to 
the object. Removing it from hash tables is like reducing the refcount by 1, 
and since hash tables are the 

Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix crash at ofproto_destroy

2021-02-24 Thread 贺鹏
Ilya Maximets  于2021年2月25日周四 上午12:01写道:
>
> On 2/24/21 3:29 PM, Ilya Maximets wrote:
> > On 7/17/20 3:50 AM, hepeng.0320 wrote:
> >> From: Peng He 
> >>
> >> The call stack of rule_destroy_cb:
> >>
> >> remove_rules_postponed (one grace period)
> >> -> remove_rule_rcu
> >> -> remove_rule_rcu__
> >> -> ofproto_rule_unref -> ref count != 1
> >> -> ... more grace periods.
> >> -> rule_destroy_cb (> 2 grace periods)
> >> -> free
> >>
> >> Currently ofproto_destory waits only two grace periods, this means when
> >> rule_destroy_cb is called, the ofproto might have been already destroyed.
> >>
> >> This patch adds a refcount for ofproto to ensure ofproto exists when it
> >> is needed to call free functions.
> >>
> >> This patch fixes the crashes found:
> >> Program terminated with signal SIGSEGV, Segmentation fault.
> >> 0  0x562a55169e49 in rule_destroy_cb (rule=0x562a598be2c0) at 
> >> ofproto/ofproto.c:2956
> >> 1  0x562a552623d6 in ovsrcu_call_postponed () at lib/ovs-rcu.c:348
> >> 2  0x562a55262504 in ovsrcu_postpone_thread (arg=) at 
> >> lib/ovs-rcu.c:364
> >> 3  0x562a55264aef in ovsthread_wrapper (aux_=) at 
> >> lib/ovs-thread.c:383
> >> 4  0x7febe715a4a4 in start_thread (arg=0x7febe4c1c700) at 
> >> pthread_create.c:456
> >> 5  0x7febe6990d0f in clone () at 
> >> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> >> (gdb) p rule->ofproto->ofproto_class
> >> $3 = (const struct ofproto_class *) 0x0
> >>
> >> Also:
> >> 0  ofproto_dpif_credit_table_stats (ofproto=0x5583583332b0, table_id=0 
> >> '\000', n_matches=5, n_misses=0) at ofproto/ofproto-dpif.c:4310
> >> 1  0x558354c68514 in xlate_push_stats_entry 
> >> (entry=entry@entry=0x7ff488031398, stats=stats@entry=0x7ff49af30b60) at 
> >> ofproto/ofproto-dpif-xlate-cache.c:99
> >> 2  0x558354c686f3 in xlate_push_stats (xcache=, 
> >> stats=stats@entry=0x7ff49af30b60) at ofproto/ofproto-dpif-xlate-cache.c:181
> >> 3  0x558354c5411a in revalidate_ukey 
> >> (udpif=udpif@entry=0x5583583baba0, ukey=ukey@entry=0x7ff47809f770, 
> >> stats=stats@entry=0x7ff49af32128, 
> >> odp_actions=odp_actions@entry=0x7ff49af30c50, 
> >> reval_seq=reval_seq@entry=275670456,
> >>recircs=recircs@entry=0x7ff49af30cd0) at 
> >> ofproto/ofproto-dpif-upcall.c:2292
> >> 4  0x558354c57cbc in revalidate (revalidator=) at 
> >> ofproto/ofproto-dpif-upcall.c:2681
> >> 5  0x558354c57f8e in udpif_revalidator (arg=0x5583583d2a90) at 
> >> ofproto/ofproto-dpif-upcall.c:934
> >> 6  0x558354d24aef in ovsthread_wrapper (aux_=) at 
> >> lib/ovs-thread.c:383
> >> 7  0x7ff4a479d4a4 in start_thread (arg=0x7ff49af35700) at 
> >> pthread_create.c:456
> >> 8  0x7ff4a3fd3d0f in clone () at 
> >> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> >> (gdb) p ofproto->up.ofproto_class
> >> $3 = (const struct ofproto_class *) 0x0
> >>
> >> Signed-off-by: Peng He 
> >> ---
>
> CC: Hongzhi Guo
>
> And I just remembered that there is a very similar patch that
> was submitted to the mail-list several months betore this one.
> Ben started review, but didn't finish:
>
> http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.19884-1-guohongz...@huawei.com/
>
> Both patches has smilar issues, i.e. mixing RCU with refcounters
> and not covering all the cases.
>
> There are at least 2 major places where ofproto needs to be refcounted:
> 1. rules
>1.a xlate caches?
> 2. ofproto groups
>
> IMO, mixing RCU with refcounts basically means that we do not have a
> clear understanding on how it works and using RCU as a safety net.
> We shuld stop using RCU for orproto and just have refcounts.  Once
> ofproto refcount reaches zero it should be immediately destroyed.
> All the places where we need to increase refcount needs to be tracked
> down.

IMO, I think RCU and refcounts are not mutually exclusive. In kernel
connntack system such as IPVS, sessions are used with both RCU and
refcounts.

Essentially, the RCU implies a refcount if there are no places holding
a pointer to the object other than hash tables (or other RCU enabled
data structure), thus by removing it from hash tables, and wait a
grace period,
we can safely free the object because we are sure that no one holds a
pointer to the object. Removing it from hash tables is like reducing
the refcount by 1, and since hash tables are
the only place that holds the pointer and this is the only reference
that lives longer than a grace period, so by removing it, we are sure
that no other ref can live longer than a grace period, and after the
period, we are safe to free it.

If we use only refcounts, every time we access the object, we have to
add refcounts.
But if use RCU combined with refcount, we only need to add ref, if
there are other places than the hashtables that live longer than one
grace period.

So I think using RCU with refcounts actually makes code changes less,
as mostly, the access to object is ephemeral, you lookup 

Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix crash at ofproto_destroy

2021-02-24 Thread Guohongzhi (Russell Lab)
It's true that the patch has been reviewed by ben and a question has been 
raised.I have replied the question, but I did not get a follow-up reply.

In short, I agree with you very much and would be happy to prepare a complete 
solution for this issue.

-Original Message-
From: Ilya Maximets [mailto:i.maxim...@ovn.org] 
Sent: 2/25/2021 0:02 AM
To: Ilya Maximets ; hepeng.0320 
; d...@openvswitch.org
Cc: William Tu ; Yifeng Sun ; 
Guohongzhi (Russell Lab) 
Subject: Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix 
crash at ofproto_destroy

On 2/24/21 3:29 PM, Ilya Maximets wrote:
> On 7/17/20 3:50 AM, hepeng.0320 wrote:
>> From: Peng He 
>>
>> The call stack of rule_destroy_cb:
>>
>> remove_rules_postponed (one grace period)
>> -> remove_rule_rcu
>> -> remove_rule_rcu__
>> -> ofproto_rule_unref -> ref count != 1
>> -> ... more grace periods.
>> -> rule_destroy_cb (> 2 grace periods)
>> -> free
>>
>> Currently ofproto_destory waits only two grace periods, this means 
>> when rule_destroy_cb is called, the ofproto might have been already 
>> destroyed.
>>
>> This patch adds a refcount for ofproto to ensure ofproto exists when 
>> it is needed to call free functions.
>>
>> This patch fixes the crashes found:
>> Program terminated with signal SIGSEGV, Segmentation fault.
>> 0  0x562a55169e49 in rule_destroy_cb (rule=0x562a598be2c0) at 
>> ofproto/ofproto.c:2956
>> 1  0x562a552623d6 in ovsrcu_call_postponed () at 
>> lib/ovs-rcu.c:348
>> 2  0x562a55262504 in ovsrcu_postpone_thread (arg=) 
>> at lib/ovs-rcu.c:364
>> 3  0x562a55264aef in ovsthread_wrapper (aux_=) at 
>> lib/ovs-thread.c:383
>> 4  0x7febe715a4a4 in start_thread (arg=0x7febe4c1c700) at 
>> pthread_create.c:456
>> 5  0x7febe6990d0f in clone () at 
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
>> (gdb) p rule->ofproto->ofproto_class
>> $3 = (const struct ofproto_class *) 0x0
>>
>> Also:
>> 0  ofproto_dpif_credit_table_stats (ofproto=0x5583583332b0, 
>> table_id=0 '\000', n_matches=5, n_misses=0) at 
>> ofproto/ofproto-dpif.c:4310
>> 1  0x558354c68514 in xlate_push_stats_entry 
>> (entry=entry@entry=0x7ff488031398, stats=stats@entry=0x7ff49af30b60) 
>> at ofproto/ofproto-dpif-xlate-cache.c:99
>> 2  0x558354c686f3 in xlate_push_stats (xcache=, 
>> stats=stats@entry=0x7ff49af30b60) at 
>> ofproto/ofproto-dpif-xlate-cache.c:181
>> 3  0x558354c5411a in revalidate_ukey (udpif=udpif@entry=0x5583583baba0, 
>> ukey=ukey@entry=0x7ff47809f770, stats=stats@entry=0x7ff49af32128, 
>> odp_actions=odp_actions@entry=0x7ff49af30c50, 
>> reval_seq=reval_seq@entry=275670456,
>>recircs=recircs@entry=0x7ff49af30cd0) at 
>> ofproto/ofproto-dpif-upcall.c:2292
>> 4  0x558354c57cbc in revalidate (revalidator=) at 
>> ofproto/ofproto-dpif-upcall.c:2681
>> 5  0x558354c57f8e in udpif_revalidator (arg=0x5583583d2a90) at 
>> ofproto/ofproto-dpif-upcall.c:934
>> 6  0x558354d24aef in ovsthread_wrapper (aux_=) at 
>> lib/ovs-thread.c:383
>> 7  0x7ff4a479d4a4 in start_thread (arg=0x7ff49af35700) at 
>> pthread_create.c:456
>> 8  0x7ff4a3fd3d0f in clone () at 
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
>> (gdb) p ofproto->up.ofproto_class
>> $3 = (const struct ofproto_class *) 0x0
>>
>> Signed-off-by: Peng He 
>> ---

CC: Hongzhi Guo

And I just remembered that there is a very similar patch that was submitted to 
the mail-list several months betore this one.
Ben started review, but didn't finish:

http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.19884-1-guohongz...@huawei.com/

Both patches has smilar issues, i.e. mixing RCU with refcounters and not 
covering all the cases.

There are at least 2 major places where ofproto needs to be refcounted:
1. rules
   1.a xlate caches?
2. ofproto groups

IMO, mixing RCU with refcounts basically means that we do not have a clear 
understanding on how it works and using RCU as a safety net.
We shuld stop using RCU for orproto and just have refcounts.  Once ofproto 
refcount reaches zero it should be immediately destroyed.
All the places where we need to increase refcount needs to be tracked down.

It'll be great if you can cooperate to prepare a complete solution for this 
issue.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] netdev-linux: correct unit of burst parameter

2021-02-24 Thread Tonghao Zhang
On Wed, Feb 24, 2021 at 10:38 PM Simon Horman 
wrote:

> From: Yong Xu 
>
> Correct calculation of burst parameter used when configuring TC policer
> action for ingress port-based policing in the case where TC offload is in
> use. This now matches the value calculated for the case where TC offload is
> not in use.
>
> The division by 8 is to convert from bits to bytes.
> Its unclear why 64 was previously used.
>
> Fixes: e7f6ba220 ("lib/tc: add ingress ratelimiting support for
> tc-offload")
> Signed-off-by: Yong Xu 
> [simon: reworked changelog]
> Signed-off-by: Simon Horman 
> Signed-off-by: Louis Peens 
> ---
>  lib/netdev-linux.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 6be23dbee..eb28777f9 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -2572,7 +2572,7 @@ exit:
>  static struct tc_police
>  tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
>  {
> -unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
> +unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8;

There is any test case for this in ovs?

>
>  unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
>  struct tc_police police;
>  struct tc_ratespec rate;
> --
> 2.20.1
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] 答复: [ovs-discuss] Do you know how I can set nd_options_type field for ipv6 ND message?

2021-02-24 Thread 杨燚
No, neighbor advertisement (reply for neighbor solicitation, i.e. ARP reply) 
must have nd_options_type==2.

-邮件原件-
发件人: William Tu [mailto:u9012...@gmail.com] 
发送时间: 2021年2月25日 3:05
收件人: Yi Yang (杨燚)-云服务集团 
抄送: f...@sysclose.org; b...@ovn.org; ovs-dev@openvswitch.org; 
ovs-disc...@openvswitch.org
主题: Re: [ovs-discuss] Do you know how I can set nd_options_type field for ipv6 
ND message?

On Tue, Feb 23, 2021 at 6:03 PM Yi Yang (杨燚)-云服务集团  wrote:
>
> Out of curious, I remember OVN doesn't support OVS DPDK, I believe OVN also 
> does IPv6 ND by openflow, is it acceptable to use slow path to handle IPv6 ND 
> for OVS kernel datapath?
>
Maybe OVN uses IPv6 ND, but not setting/matching 'nd_options_type'?
William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] ovn-northd: Warn for >1 IPv4 in build_lrouter_force_snat_flows_op().

2021-02-24 Thread Ben Pfaff
The comments here point out that more than one IPv4 address is a
problem (versus more than 2 IPv6 addresses), but the code doesn't match.

I didn't test this, it's just an observation from reading code and
comments.

Signed-off-by: Ben Pfaff 
CC: Numan Siddique 
Fixes: c6e21a23bd8c ("northd: Provide the Gateway router option 
'lb_force_snat_ip' to take router port ips.")
---
 northd/ovn-northd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 85022d36c6ae..ac872aadea3d 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -9164,7 +9164,7 @@ build_lrouter_force_snat_flows_op(struct ovn_port *op,
   op->lrp_networks.ipv4_addrs[0].addr_s);
 ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_SNAT, 110,
   ds_cstr(match), ds_cstr(actions));
-if (op->lrp_networks.n_ipv4_addrs > 2) {
+if (op->lrp_networks.n_ipv4_addrs > 1) {
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 VLOG_WARN_RL(, "Logical router port %s is configured with "
   "multiple IPv4 addresses.  Only the first "
-- 
2.29.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v6 ovn] controller: introduce stats counters for ovn-controller incremental processing

2021-02-24 Thread Mark Gray
On 24/02/2021 15:38, Lorenzo Bianconi wrote:

Thanks for all the work on this Lorenzo, I think it is a really useful
feature.

I have some reservations about the output format as I think it may be
hard to grep for a node but that is only an aesthetic NIT so I suggest
it is good to go.

I ran some tests and it worked.

Acked-by: Mark Gray 

Thanks!

> Introduce inc-engine/show-stats ovs-appctl command in order to dump
> ovn-controller incremental processing engine statistics. So far for each
> node a counter for run, abort and engine_handler have been added.
> Counters are incremented when the node move to "updated" state.
> In order to dump I-P stats we can can use the following commands:
> 
> $ovs-appctl -t ovn-controller inc-engine/show-stats
> Node: SB_address_set
> - recompute:   38
> - compute:  0
> - abort:0
> Node: addr_sets
> - recompute:2
> - compute:  0
> - abort:1
> Node: SB_port_group
> - recompute:   37
> - compute:  0
> - abort:0
> 
> Node: flow_output
> - recompute:2
> - compute: 20
> - abort:0
> 
> Moreover introduce the inc-engine/clear-stats command to reset engine
> statistics
> 
> $ovs-appctl -t ovn-controller inc-engine/clear-stats
> 
> Signed-off-by: Lorenzo Bianconi 
> ---
> Changes since v5:
> - cosmetics
> - add missing recompute increments
> - move about counter in engine_run routine
> - add missing ovs-appctl cmd documentation
> Changes since v4:
> - rename handlers in inc-engine/show-stats and inc-engine/clear-stats
> - move all the code in lib/inc-proc-eng.{c,h}
> - instad of run and change_handler, track node recompute and node compute
> Changes since v3:
> - drop engine_set_note_update_from_run/engine_set_note_update_from_handler
>   macros and move stats code in lib/inc-proc-eng.c
> - fix commit log
> Changes since v2:
> - introduce inc-engine/stats and inc-engine/stats-clear commands and drop
>   COVERAGE_* dependency
> Changes since v1:
> - drop handler counters and add global abort counter
> - improve documentation and naming scheme
> - introduce engine_set_node_updated utility macro
> ---
>  NEWS|  1 +
>  controller/ovn-controller.8.xml | 22 
>  lib/inc-proc-eng.c  | 46 +
>  lib/inc-proc-eng.h  |  9 +++
>  4 files changed, 78 insertions(+)
> 
> diff --git a/NEWS b/NEWS
> index fca96f658..6cbb88add 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -1,5 +1,6 @@
>  Post-v21.03.0
>  -
> +  - Introduce ovn-controller incremetal processing engine statistics
>  
>  OVN v21.03.0 - 5 Mar 2020
>  -
> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
> index 51c0c372c..8886df568 100644
> --- a/controller/ovn-controller.8.xml
> +++ b/controller/ovn-controller.8.xml
> @@ -578,6 +578,28 @@
>  Displays logical flow cache statistics: enabled/disabled, per cache
>  type entry counts.
>
> +
> +  inc-engine/show-stats
> +  
> +Display ovn-controller engine counters. For each engine
> +node the following counters have been added:
> +
> +  
> +recompute
> +  
> +  
> +compute
> +  
> +  
> +abort
> +  
> +
> +  
> +
> +  inc-engine/clear-stats
> +  
> +Reset ovn-controller engine counters.
> +  
>
>  
>  
> diff --git a/lib/inc-proc-eng.c b/lib/inc-proc-eng.c
> index 916dbbe39..a6337a1d9 100644
> --- a/lib/inc-proc-eng.c
> +++ b/lib/inc-proc-eng.c
> @@ -27,6 +27,7 @@
>  #include "openvswitch/hmap.h"
>  #include "openvswitch/vlog.h"
>  #include "inc-proc-eng.h"
> +#include "unixctl.h"
>  
>  VLOG_DEFINE_THIS_MODULE(inc_proc_eng);
>  
> @@ -102,6 +103,40 @@ engine_get_nodes(struct engine_node *node, size_t 
> *n_count)
>  return engine_topo_sort(node, NULL, n_count, _size);
>  }
>  
> +static void
> +engine_clear_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +   const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> +{
> +for (size_t i = 0; i < engine_n_nodes; i++) {
> +struct engine_node *node = engine_nodes[i];
> +
> +memset(>stats, 0, sizeof node->stats);
> +}
> +unixctl_command_reply(conn, NULL);
> +}
> +
> +static void
> +engine_dump_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +  const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> +{
> +struct ds dump = DS_EMPTY_INITIALIZER;
> +
> +for (size_t i = 0; i < engine_n_nodes; i++) {
> +struct engine_node *node = engine_nodes[i];
> +
> +ds_put_format(,
> +  "Node: %s\n"
> +  "- recompute: %12"PRIu64"\n"
> +  "- compute:   %12"PRIu64"\n"
> +  "- abort: 

Re: [ovs-dev] [PATCH] netdev-linux: correct unit of burst parameter

2021-02-24 Thread Yi-Hung Wei
On Wed, Feb 24, 2021 at 6:38 AM Simon Horman  wrote:
>
> From: Yong Xu 
>
> Correct calculation of burst parameter used when configuring TC policer
> action for ingress port-based policing in the case where TC offload is in
> use. This now matches the value calculated for the case where TC offload is
> not in use.
>
> The division by 8 is to convert from bits to bytes.
> Its unclear why 64 was previously used.

A minor nit.

s/Its/It's
>
> Fixes: e7f6ba220 ("lib/tc: add ingress ratelimiting support for tc-offload")
> Signed-off-by: Yong Xu 
> [simon: reworked changelog]
> Signed-off-by: Simon Horman 
> Signed-off-by: Louis Peens 
> ---

This patch LGTM.

Acked-by: Yi-Hung Wei 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [ovs-discuss] Do you know how I can set nd_options_type field for ipv6 ND message?

2021-02-24 Thread William Tu
On Tue, Feb 23, 2021 at 6:03 PM Yi Yang (杨燚)-云服务集团  wrote:
>
> Out of curious, I remember OVN doesn't support OVS DPDK, I believe OVN also 
> does IPv6 ND by openflow, is it acceptable to use slow path to handle IPv6 ND 
> for OVS kernel datapath?
>
Maybe OVN uses IPv6 ND, but not setting/matching 'nd_options_type'?
William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 ovn 0/3] Handle LB VIPs that share backends and make flows offloadable.

2021-02-24 Thread Numan Siddique
On Wed, Feb 24, 2021 at 9:40 PM Dumitru Ceara  wrote:
>
> Patch 1/3 fixes a bug when using learn() action to generate hairpin
> reply flows for different VIPs that share backends.  It also addresses
> backwards compatibility in order to avoid having issues during upgrades.
>
> Patches 2/3 and 3/3 use the new OVS registers populated by patch 1/3 to
> simplify the hairpin flows and to avoid matching on conntrack original
> tuple fields or ct.dnat as tests have shown that these are not offloadable
> on specific NICs (e.g., MLX-CX5).
>
> Changes in V2:
> - Addressed Numan's comments.
> - Added Numan's acks.
>
> Dumitru Ceara (3):
>   Properly handle hairpin traffic for VIPs with shared backends.
>   lflow: Avoid matching on conntrack original tuple if possible.
>   northd: Avoid matching on ct.dnat flags for load balancers.

Thanks Dumitru for addressing the comments.

I applied the patch series to the main branch.
Along with the 1st patch I also applied the other 2 patches to the
newly created branch 21.03
as I thought it makes sense to apply the whole series rather than just
the 1st patch.

Thanks
Numan

>
>
>  controller/lflow.c   |  107 +-
>  include/ovn/logical-fields.h |8 +
>  lib/lb.c |3
>  lib/lb.h |3
>  northd/ovn-northd.8.xml  |   53 +++--
>  northd/ovn-northd.c  |  143 +++---
>  ovn-sb.xml   |6 +
>  tests/ofproto-macros.at  |2
>  tests/ovn-northd.at  |   57 -
>  tests/ovn.at |  443 
> +-
>  tests/system-ovn.at  |   40 ++--
>  11 files changed, 642 insertions(+), 223 deletions(-)
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovs-dev v3 1/4] dpif-netdev: Expand the meters supported number.

2021-02-24 Thread Ilya Maximets
On 2/24/21 1:31 PM, Tonghao Zhang wrote:
> Now this patch version is v3. and stay a long time. Any maintainer
> will continue to review this patch ?  Thanks!

Sorry for long dalays.  I have it on my TODO list for this week along
with the overflow fix v2.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 4/4] conntrack: compact the size of conn structure.

2021-02-24 Thread Aaron Conole
"hepeng.0320"  writes:

> From: hepeng 
>
> This patch tries to use ovs_spin_lock as an alternative to reduce the
> size of each conn. The ovs_mutex_lock consumes 48 bytes while the
> ovs_spin_lock only uses 16 bytes. Also, we remove the alg info into an
> extra space as alg is not common in the real world userspace ct use case.
> (mostly used as security group and nat in the cloud).
>
> The size of conn_tcp: 312 -> 240.

I didn't do a thorough review of either this patch or 3/4.

> Signed-off-by: Peng He 
> ---
>  lib/conntrack-private.h |  86 
>  lib/conntrack-tp.c  |  18 +++
>  lib/conntrack.c | 108 +---
>  3 files changed, 131 insertions(+), 81 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 2aa828674..51a3f5347 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -93,48 +93,92 @@ enum ct_alg_ctl_type {
>  CT_ALG_CTL_MAX,
>  };
>  
> -#define CONN_FLAG_NAT_MASK 0xf
> -#define CONN_FLAG_CTL_FTP  (CT_ALG_CTL_FTP  << 4)
> -#define CONN_FLAG_CTL_TFTP (CT_ALG_CTL_TFTP << 4)
> -#define CONN_FLAG_CTL_SIP  (CT_ALG_CTL_SIP  << 4)
> +#define CONN_FLAG_NAT_MASK0xf
> +#define CONN_FLAG_CTL_FTP (CT_ALG_CTL_FTP  << 4)
> +#define CONN_FLAG_CTL_TFTP(CT_ALG_CTL_TFTP << 4)
> +#define CONN_FLAG_CTL_SIP (CT_ALG_CTL_SIP  << 4)
>  /* currently only 3 algs supported */
> -#define CONN_FLAG_ALG_MASK 0x70
> +#define CONN_FLAG_ALG_MASK0x70
> +#define CONN_FLAG_ALG_RELATED 0x80
> +#define CONN_FLAG_CLEANED 0x100 /* indicate if it is removed from timer 
> */
> + 
>  
>  struct conn_dir {
>  struct cmap_node cm_node;
>  bool orig;
>  };
>  
> +#if HAVE_PTHREAD_SPIN_LOCK == 0
> +static inline void conn_lock(const struct ovs_mutex *lock) {
> +ovs_mutex_lock(lock);
> +}
> +#else
> +static inline void conn_lock(const struct ovs_spin *lock) {
> +ovs_spin_lock(lock);
> +}
> +#endif
> +
> +#if HAVE_PTHREAD_SPIN_LOCK == 0
> +static inline void conn_unlock(const struct ovs_mutex *lock) {
> +ovs_mutex_unlock(lock);
> +}
> +#else
> +static inline void conn_unlock(const struct ovs_spin *lock) {
> +ovs_spin_unlock(lock);
> +}
> +#endif
> +
> +#if HAVE_PTHREAD_SPIN_LOCK == 0
> +static inline void conn_lock_init(struct ovs_mutex *lock) {
> +ovs_mutex_init_adaptive(lock);
> +}
> +#else
> +static inline void conn_lock_init(struct ovs_spin *lock) {
> +ovs_spin_init(lock);
> +}
> +#endif
> +
> +#if HAVE_PTHREAD_SPIN_LOCK == 0
> +static inline void conn_lock_destroy(struct ovs_mutex *lock) {
> +ovs_mutex_destroy(lock);
> +}
> +#else
> +static inline void conn_lock_destroy(struct ovs_spin *lock) {
> +(void)lock;

Please use OVS_UNUSED instead of this hack.

> +}
> +#endif
> +
> +struct conn_alg {
> +struct conn_key parent_key; /* Only used for orig_tuple support. */
> +int seq_skew;
> +bool seq_skew_dir; /* TCP sequence skew direction due to NATTing of FTP
> +* control messages; true if reply direction. */
> +};
> + 
>  struct conn {
>  /* Immutable data. */
>  struct conn_key key;
>  struct conn_dir orig;
>  struct conn_key rev_key;
>  struct conn_dir rev;
> -struct conn_key parent_key; /* Only used for orig_tuple support. */
>  struct ovs_list exp_node;
>  uint64_t conn_flags;
>  
> +/* Immutable data. */
> +int32_t admit_zone; /* The zone for managing zone limit counts. */
> +uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */
> +uint32_t tp_id; /* Timeout policy ID. */
> +
>  /* Mutable data. */
> +#if HAVE_PTHREAD_SPIN_LOCK == 0
>  struct ovs_mutex lock; /* Guards all mutable fields. */
> +#else
> +struct ovs_spin  lock; /* Guards all mutable fields. */
> +#endif
>  ovs_u128 label;
>  long long expiration;
> +struct conn_alg *alg;
>  uint32_t mark;
> -int seq_skew;
> -
> -/* Immutable data. */
> -int32_t admit_zone; /* The zone for managing zone limit counts. */
> -uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */
> -
> -/* Mutable data. */
> -bool seq_skew_dir; /* TCP sequence skew direction due to NATTing of FTP
> -* control messages; true if reply direction. */
> -bool cleaned; /* True if cleaned from expiry lists. */
> -
> -/* Immutable data. */
> -bool alg_related; /* True if alg data connection. */
> -
> -uint32_t tp_id; /* Timeout policy ID. */
>  };
>  
>  static inline struct conn * conn_from_conndir(struct conn_dir *conndir) {
> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> index a586d3a8d..549ec7632 100644
> --- a/lib/conntrack-tp.c
> +++ b/lib/conntrack-tp.c
> @@ -236,19 +236,19 @@ conn_update_expiration__(struct conntrack *ct, struct 
> conn *conn,
>   uint32_t tp_value)
>  OVS_REQUIRES(conn->lock)
>  {
> -ovs_mutex_unlock(>lock);
> +conn_unlock(>lock);
>  
>

Re: [ovs-dev] [PATCH 2/4] conntrack: remove redundant comparation in nat_packet and un_nat_packet

2021-02-24 Thread Aaron Conole
"hepeng.0320"  writes:

> From: hepeng 
>
> In this patch, we split pat_packet and un_pat_packet function into _src
> and _dst two functions. When calling this two functions, we can usually
> infer the specific nat actions from the caller.
>
> This patch makes the conntrack code cleaner.

This is doing much more than the above mentioned - it refactors v4/v6
code, and makes a generic callin that takes a function to perform
tcp/udp/other port nat.  It's a bit difficult to review for that
reason.

> Signed-off-by: Peng He 
> ---
>  lib/conntrack.c | 261 
>  1 file changed, 152 insertions(+), 109 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 73d3a2fb2..a22252a63 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -701,24 +701,26 @@ handle_alg_ctl(struct conntrack *ct, const struct 
> conn_lookup_ctx *ctx,
>  }
>  
>  static void
> -pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +pat_packet_src(struct dp_packet *pkt, const struct conn *conn)
>  {
> -if (conn->conn_flags & NAT_ACTION_SRC) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
> -}
> -} else if (conn->conn_flags & NAT_ACTION_DST) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
> -}
> +if (conn->key.nw_proto == IPPROTO_TCP) {
> +struct tcp_header *th = dp_packet_l4(pkt);
> +packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
> +} else if (conn->key.nw_proto == IPPROTO_UDP) {
> +struct udp_header *uh = dp_packet_l4(pkt);
> +packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
> +}
> +}
> +
> +static void
> +pat_packet_dst(struct dp_packet *pkt, const struct conn *conn)
> +{
> +if (conn->key.nw_proto == IPPROTO_TCP) {
> +struct tcp_header *th = dp_packet_l4(pkt);
> +packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
> +} else if (conn->key.nw_proto == IPPROTO_UDP) {
> +struct udp_header *uh = dp_packet_l4(pkt);
> +packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
>  }
>  }
>  
> @@ -738,7 +740,7 @@ nat_packet(struct dp_packet *pkt, const struct conn 
> *conn, bool related)
>   >rev_key.dst.addr.ipv6, true);
>  }
>  if (!related) {
> -pat_packet(pkt, conn);
> +pat_packet_src(pkt, conn);
>  }
>  } else if (conn->conn_flags & NAT_ACTION_DST) {
>  pkt->md.ct_state |= CS_DST_NAT;
> @@ -753,61 +755,92 @@ nat_packet(struct dp_packet *pkt, const struct conn 
> *conn, bool related)
>   >rev_key.src.addr.ipv6, true);
>  }
>  if (!related) {
> -pat_packet(pkt, conn);
> +pat_packet_dst(pkt, conn);
>  }
>  }
>  }
>  
>  static void
> -un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +un_pat_packet_src(struct dp_packet *pkt, const struct conn *conn)
>  {
> -if (conn->conn_flags & NAT_ACTION_SRC) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
> -}
> -} else if (conn->conn_flags & NAT_ACTION_DST) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
> -}
> +if (conn->key.nw_proto == IPPROTO_TCP) {
> +struct tcp_header *th = dp_packet_l4(pkt);
> +packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
> +} else if (conn->key.nw_proto == IPPROTO_UDP) {
> +struct udp_header *uh = dp_packet_l4(pkt);
> +packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
>  }
>  }
>  
>  static void
> -reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
> 

Re: [ovs-dev] [PATCH 1/4] conntrack: remove nat_conn

2021-02-24 Thread Aaron Conole
"hepeng.0320"  writes:

> From: hepeng 
>
> Currently, when doing NAT, the userspace conntrack will use an extra
> conn for the two directions in a flow. However, each conn has actually
> the two keys for both orig and rev flow directions. This patch
> introduces a conn_dir member in the conn and it consists of both rev and
> orig cmap_node for hash lookup. This saves extra allocation for
> nat_conn, and makes userspace code much cleaner.

If I'm understanding this correctly, you still re-insert the conn into
the conn list?

> We also introduces a conn_flags member to reduce the memory footprint of a
> conn.

We should split this out to a separate patch.

> Signed-off-by: Peng He 
> ---
>  lib/conntrack-private.h |  20 +--
>  lib/conntrack.c | 264 
>  2 files changed, 121 insertions(+), 163 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 789af82ff..6bec43d3f 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -83,21 +83,23 @@ struct alg_exp_node {
>  bool nat_rpl_dst;
>  };
>  
> -enum OVS_PACKED_ENUM ct_conn_type {
> -CT_CONN_TYPE_DEFAULT,
> -CT_CONN_TYPE_UN_NAT,
> +#define CONN_FLAG_NAT_MASK 0xf
> +
> +struct conn_dir {
> +struct cmap_node cm_node;
> +bool orig;

This naming is confusing.  We can have 'conn->orig->orig'.  Consider
renaming one of these fields to distinguish what is going on.  I would
prefer seeing 'fwd' used (since it's the forward direction).

>  };
>  
>  struct conn {
>  /* Immutable data. */
>  struct conn_key key;
> +struct conn_dir orig;
>  struct conn_key rev_key;
> +struct conn_dir rev;

I think this might be better if setup like:
+enum ct_direction {
+CT_DIRECTION_FWD,
+CT_DIRECTION_REV,
+CT_DIRECTIONS
+}
+
+struct conn_data {
+struct conn_key key;
+struct cmap_node cm_node;
 };
 
 struct conn {
-struct conn_key key;
-struct conn_key rev_key;
+struct conn_data cn_data[CT_DIRECTIONS];
...

Then in the code, we can always get orig and rev information:

  conn->cn_data[CT_DIRECTION_FWD]
  conn->cn_data[CT_DIRECTION_REV]

Did I miss something?

>  struct conn_key parent_key; /* Only used for orig_tuple support. */
>  struct ovs_list exp_node;
> -struct cmap_node cm_node;
> -struct nat_action_info_t *nat_info;
> +uint64_t conn_flags;

As mentioned, please separate this as a different patch.  This is new
piece of functionality.

>  char *alg;
> -struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
>  
>  /* Mutable data. */
>  struct ovs_mutex lock; /* Guards all mutable fields. */
> @@ -117,11 +119,15 @@ struct conn {
>  
>  /* Immutable data. */
>  bool alg_related; /* True if alg data connection. */
> -enum ct_conn_type conn_type;
>  
>  uint32_t tp_id; /* Timeout policy ID. */
>  };
>  
> +static inline struct conn * conn_from_conndir(struct conn_dir *conndir) {
> +return conndir->orig ? CONTAINER_OF(conndir, struct conn, orig) : \
> +   CONTAINER_OF(conndir, struct conn, rev);
> +}
> +
>  enum ct_update_res {
>  CT_UPDATE_INVALID,
>  CT_UPDATE_VALID,
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 930ed0be6..73d3a2fb2 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -94,7 +94,6 @@ static struct conn *new_conn(struct conntrack *ct, struct 
> dp_packet *pkt,
>   uint32_t tp_id);
>  static void delete_conn_cmn(struct conn *);
>  static void delete_conn(struct conn *);
> -static void delete_conn_one(struct conn *conn);
>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn 
> *conn,
>struct dp_packet *pkt,
>struct conn_lookup_ctx *ctx,
> @@ -108,8 +107,8 @@ static void set_label(struct dp_packet *, struct conn *,
>  static void *clean_thread_main(void *f_);
>  
>  static bool
> -nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
> -   struct conn *nat_conn);
> +nat_select_range_tuple(struct conntrack *ct, struct conn *conn,
> +   const struct nat_action_info_t *nat_action);
>  
>  static uint8_t
>  reverse_icmp_type(uint8_t type);
> @@ -233,6 +232,7 @@ conn_key_cmp(const struct conn_key *key1, const struct 
> conn_key *key2)
>  return 1;
>  }
>  
> +#if 0

Please remove functions which are no longer used.

>  static void
>  ct_print_conn_info(const struct conn *c, const char *log_msg,
> enum vlog_level vll, bool force, bool rl_on)
> @@ -287,6 +287,7 @@ ct_print_conn_info(const struct conn *c, const char 
> *log_msg,
>  }
>  }
>  }
> +#endif
>  
>  /* Initializes the connection tracker 'ct'.  The caller is responsible for
>   * calling 'conntrack_destroy()', when the instance is not needed anymore */
> @@ -437,7 +438,7 @@ conn_clean_cmn(struct conntrack *ct, 

Re: [ovs-dev] [PATCH v1 6/9] conntrack: Do not schedule zero ms timers

2021-02-24 Thread William Tu
On Tue, Feb 23, 2021 at 4:35 PM Gaëtan Rivet  wrote:
>
> On Tue, Feb 23, 2021, at 22:56, William Tu wrote:
> > On Wed, Feb 17, 2021 at 8:34 AM Gaetan Rivet  wrote:
> > >
> > > When ct_sweep() is far behind on its work, the 'next_wake' returned can
> > > be before the moment it started. When it happens, the thread schedules a
> > > zero ms timer that is logged as an error.
> > >
> > > Instead, mark the thread for immediate wake in the next poll_block().
> > >
> > > Signed-off-by: Gaetan Rivet 
> > > Reviewed-by: Eli Britstein 
> > > ---
> >
> > Looks ok to me.
> > I guess previously we don't want to clean too often, so there is
> > a minimal CT_CLEAN_MIN_INTERVAL.
> > With this change, we might end up busy doing ct_sweep() and
> > hit 100% cpu?
> >
>
> Yes, this patch and the next will remove CT_CLEAN_MIN_INTERVAL.
> The thread will still call poll_block() and quiesce, but it has the potential 
> to hog the core if work is still needed.
>
> Is it an issue? If so instead it could yield for a short time.
> The main idea is to avoid waiting for 200 ms, as it is not useful now.
>
I don't have a strong opinion on this.
It's just sometimes customers complain OVS is using too much CPU.
And they start to look at which process/thread is using 100%.
Maybe we can find a balance between a reasonable backlog, accuracy of
timeout, and cpu utilizaion.

William
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v1 1/9] conntrack: Use rcu-lists to store conn expirations

2021-02-24 Thread William Tu
On Tue, Feb 23, 2021 at 4:34 PM Gaëtan Rivet  wrote:
>
>
>
> On Tue, Feb 23, 2021, at 22:55, William Tu wrote:
> > Hi Gaetan,
> >
> > Thanks for the patch, looks very useful.
> > I haven't tested it yet.
> > Minor question/comments inline.
>
> Hi William, thanks for taking a look!
>
> [...]
> > >
> > > +/* Timeouts: all the possible timeout states passed to 
> > > update_expiration()
> > > + * are listed here. The name will be prefix by CT_TM_ and the value is in
> > > + * milliseconds */
> > > +#define CT_TIMEOUTS \
> > > +CT_TIMEOUT(TCP_FIRST_PACKET) \
> > > +CT_TIMEOUT(TCP_OPENING) \
> > > +CT_TIMEOUT(TCP_ESTABLISHED) \
> > > +CT_TIMEOUT(TCP_CLOSING) \
> > > +CT_TIMEOUT(TCP_FIN_WAIT) \
> > > +CT_TIMEOUT(TCP_CLOSED) \
> > > +CT_TIMEOUT(OTHER_FIRST) \
> > > +CT_TIMEOUT(OTHER_MULTIPLE) \
> > > +CT_TIMEOUT(OTHER_BIDIR) \
> > > +CT_TIMEOUT(ICMP_FIRST) \
> > > +CT_TIMEOUT(ICMP_REPLY)
> > > +
> > > +enum ct_timeout {
> > > +#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > > +CT_TIMEOUTS
> > > +#undef CT_TIMEOUT
> > > +N_CT_TM
> > > +};
> > > +
> > any reason for moving this macro to here?
> >
>
> I embed the enum ct_timeout within the conn expiration node, so I must have 
> it defined first.
> I did not want to put too much code between 'ct_conn_type' below and the 
> 'conn' struct itself,
> which is why I put 'ct_timeout' just above.
>
> Otherwise, this could be avoided by linking the rculist pointer directly 
> instead of storing its index.
> But I thought it would be unwise as it would more tightly couple the list 
> type to the expiration node, and it means taking a full pointer vs. storing 
> an enum.
>

I see, make sense, thanks!

> > >  enum OVS_PACKED_ENUM ct_conn_type {
> > >  CT_CONN_TYPE_DEFAULT,
> > >  CT_CONN_TYPE_UN_NAT,
> > >  };
> > >
> > > +struct conn_expire {
> > > +/* Set once when initializing the expiration node. */
> > > +struct conntrack *ct;
> > > +/* Timeout state of the connection.
> > > + * It follows the connection state updates.
> > > + */
> > > +enum ct_timeout tm;
> > > +/* Insert and remove the expiration node only once per RCU syncs.
> > > + * If multiple threads update the connection, its expiration should
> > > + * be removed only once and added only once to timeout lists.
> > > + */
> > > +atomic_flag insert_once;
> > > +atomic_flag remove_once;
> > > +struct rculist node;
> > > +};
> > > +
> > >  struct conn {
> > >  /* Immutable data. */
> > >  struct conn_key key;
> > >  struct conn_key rev_key;
> > >  struct conn_key parent_key; /* Only used for orig_tuple support. */
> > > -struct ovs_list exp_node;
> > >  struct cmap_node cm_node;
> > >  struct nat_action_info_t *nat_info;
> > >  char *alg;
> > > @@ -104,6 +143,7 @@ struct conn {
> > >
> > >  /* Mutable data. */
> > >  struct ovs_mutex lock; /* Guards all mutable fields. */
> > > +struct conn_expire exp;
> > >  ovs_u128 label;
> > >  long long expiration;
> > >  uint32_t mark;
> > > @@ -132,33 +172,10 @@ enum ct_update_res {
> > >  CT_UPDATE_VALID_NEW,
> > >  };
> > >
> > > -/* Timeouts: all the possible timeout states passed to 
> > > update_expiration()
> > > - * are listed here. The name will be prefix by CT_TM_ and the value is in
> > > - * milliseconds */
> > > -#define CT_TIMEOUTS \
> > > -CT_TIMEOUT(TCP_FIRST_PACKET) \
> > > -CT_TIMEOUT(TCP_OPENING) \
> > > -CT_TIMEOUT(TCP_ESTABLISHED) \
> > > -CT_TIMEOUT(TCP_CLOSING) \
> > > -CT_TIMEOUT(TCP_FIN_WAIT) \
> > > -CT_TIMEOUT(TCP_CLOSED) \
> > > -CT_TIMEOUT(OTHER_FIRST) \
> > > -CT_TIMEOUT(OTHER_MULTIPLE) \
> > > -CT_TIMEOUT(OTHER_BIDIR) \
> > > -CT_TIMEOUT(ICMP_FIRST) \
> > > -CT_TIMEOUT(ICMP_REPLY)
> > > -
> > > -enum ct_timeout {
> > > -#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > > -CT_TIMEOUTS
> > > -#undef CT_TIMEOUT
> > > -N_CT_TM
> > > -};
> > > -
> > >  struct conntrack {
> > >  struct ovs_mutex ct_lock; /* Protects 2 following fields. */
> > >  struct cmap conns OVS_GUARDED;
> > > -struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
> > > +struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
> > >  struct hmap zone_limits OVS_GUARDED;
> > >  struct hmap timeout_policies OVS_GUARDED;
> > >  uint32_t hash_basis; /* Salt for hashing a connection key. */
> > > @@ -204,4 +221,13 @@ struct ct_l4_proto {
> > > struct ct_dpif_protoinfo *);
> > >  };
> > >
> > > +static inline void
> > > +conn_expire_remove(struct conn_expire *exp)
> > > +{
> > > +if (!atomic_flag_test_and_set(>remove_once)
> > > +&& rculist_next(>node)) {
> > > +rculist_remove(>node);
> > > +}
> > > +}
> > > +
> > >  #endif /* conntrack-private.h */
> > > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> > > index a586d3a8d..30ba4bda8 100644
> > > --- a/lib/conntrack-tp.c
> > > +++ b/lib/conntrack-tp.c
> > 

Re: [ovs-dev] [PATCH] ovs-ctl: Allow recording hostname separately

2021-02-24 Thread Ilya Maximets
On 2/19/21 9:10 AM, Christian Ehrhardt wrote:
> On Tue, Feb 16, 2021 at 5:32 PM Frode Nordahl
>  wrote:
>>
>> ovs-ctl determines the system FQDN or hostname and records it in
>> the `external-ids:hostname` field of the `Open-vSwitch` table on
>> system startup.
>>
>> This value may be consumed by downstream software and having it
>> unset or set to a incorrect value could lead to erratic behavior
>> of a system.
>>
>> When a system is configured to use a Open vSwitch controlled
>> datapath as its only network connection, the current ordering of
>> events would always produce a unreliable hostname
>>
>> Reported-At: https://bugs.launchpad.net/bugs/1915829
>> Signed-off-by: Frode Nordahl 
> 
> I've looked at this for the Ubuntu problem that started Frodes work,
> and while I can't give any authoritative Ack I can at least say
> 
> Reviewed-by: Christian Ehrhardt 
> 

Hi.  Reading the description and the bug on launchpad it seems that
the issue is that hostname in database changes in unexpected way and
this is somewhat similar to what the following patch tried to fix:
  
http://patchwork.ozlabs.org/project/openvswitch/patch/20200525152821.19838-1-dalva...@redhat.com/

Is it still a problem with above patch applied?

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn 1/3] Properly handle hairpin traffic for VIPs with shared backends.

2021-02-24 Thread Dumitru Ceara

On 2/24/21 10:34 AM, Numan Siddique wrote:

On Wed, Feb 24, 2021 at 2:43 PM Numan Siddique  wrote:


On Wed, Feb 24, 2021 at 2:37 PM Numan Siddique  wrote:


On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:


If two load balancer VIPs share the same backend, both sets of hairpin
reply learn() flows should be generated.  In order to ensure that,
also match on the original destination IP and port tuple.  These are
now stored in OVS registers by ovn-northd in stage ls-in-stateful.

An alternative solution would be to add an additional match on
ct_nw_dst() and ct_tp_dst() in the hairpin detection flows but it's
better to avoid that because these matches are usually not offloadable
to hardware.

To ensure backwards compatibility though, if ovn-controller detects
that ovn-northd doesn't store the original destination tuple
information in OVS registers, ovn-controller falls back to using
ct_nw_dst() and ct_tp_dst().

Reported-by: Tim Rozet 
Reported-at: https://bugzilla.redhat.com/1931599
Fixes: 022ea339c8e2 ("lflow: Use learn() action to generate LB hairpin reply 
flows.")
Signed-off-by: Dumitru Ceara 


Hi Dumitru,

Thanks for the fix.

A couple of minor nits below.
With those addressed.

Acked-by: Numan Siddique 



I would also suggest to enhance the test in ovn.at to pause ovn-northd,
clear the lb option - 'hairpin_orig_tuple' and make sure that flows
have the match with the ct_* fields.


My bad. You already do it. Please forget this comment.

Numan



Thanks for the review Numan!

I sent a v2: http://patchwork.ozlabs.org/project/ovn/list/?series=230875

Regards,
Dumitru

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] ofproto:fix use-after-free

2021-02-24 Thread Ilya Maximets
On 3/9/20 3:35 AM, guohongzhi (A) wrote:
> Only RCU may not be sufficient. The deletion of rule and group uses both RCU 
> and reference accounting, but the deletion of ofproto uses only RCU.
> 
> The execution process as follows:
> ofproto_destroy=>p->ofproto_class->destruct=>ofproto_rule_delete=>ofproto_rule_unref
>  (suppose rule-A’s reference accounting not reach the last, rule-A will not 
> be added to deffered deletion list )=>…=>ofproto_destroy(The ofproto will be 
> added to deferred deletion list directly in the last line of the 
> function)=>soon after,rule-A’s reference accounting reach the last, it will 
> be added to deferred deletion list after oproto. So, ofproto will be released 
> before the rule-A. When the rule_destroy_cb is executed, the internal access 
> of ofproto will cause use-after-free.
> 
> 
> -Original Message-
> From: Ben Pfaff [mailto:b...@ovn.org] 
> Sent: 2020年3月7日 4:58
> To: guohongzhi (A) 
> Cc: d...@openvswitch.org; num...@ovn.org; Zhoujingbin (Robin, Russell Lab) 
> ; chenchanghu ; Lilijun 
> (Jerry) 
> Subject: Re: [PATCH] [ovs-dev]ofproto:fix use-after-free
> 
> On Fri, Mar 06, 2020 at 09:05:55PM +0800, guohongzhi wrote:
>> ASAN report use-after-free when destroy ofproto_rule, the 
>> rule->ofproto has freed in ofproto_destroy.
>> Add ref_count for ofproto to avoid use-after-free when destroy 
>> ofproto_rule adn group.
>>
>> Signed-off-by: guohongzhi 
> 
> Why isn't RCU sufficient to avoid use-after-free?

Marking this patch as 'changes requested' in patchwork in context
of my comments to the similar patch:
  
http://patchwork.ozlabs.org/project/openvswitch/patch/20200717015041.82746-1-hepeng.0...@bytedance.com/#2637046

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v2 1/3] Properly handle hairpin traffic for VIPs with shared backends.

2021-02-24 Thread Dumitru Ceara
If two load balancer VIPs share the same backend, both sets of hairpin
reply learn() flows should be generated.  In order to ensure that,
also match on the original destination IP and port tuple.  These are
now stored in OVS registers by ovn-northd in stage ls-in-stateful.

An alternative solution would be to add an additional match on
ct_nw_dst() and ct_tp_dst() in the hairpin detection flows but it's
better to avoid that because these matches are usually not offloadable
to hardware.

To ensure backwards compatibility though, if ovn-controller detects
that ovn-northd doesn't store the original destination tuple
information in OVS registers, ovn-controller falls back to using
ct_nw_dst() and ct_tp_dst().

Reported-by: Tim Rozet 
Reported-at: https://bugzilla.redhat.com/1931599
Fixes: 022ea339c8e2 ("lflow: Use learn() action to generate LB hairpin reply 
flows.")
Acked-by: Numan Siddique 
Signed-off-by: Dumitru Ceara 
---
V2:
- Addressed Numan's comments:
  - fix doc.
  - move smap declaration.
  - better define MFF_LOG_LB_ORIG_DIP_IPV6.
---
 controller/lflow.c   |   57 ++
 include/ovn/logical-fields.h |8 +
 lib/lb.c |3 
 lib/lb.h |3 
 northd/ovn-northd.8.xml  |   21 ++
 northd/ovn-northd.c  |   93 +-
 ovn-sb.xml   |6 +
 tests/ofproto-macros.at  |2 
 tests/ovn-northd.at  |   28 ++-
 tests/ovn.at |  377 +++---
 tests/system-ovn.at  |   40 +++-
 11 files changed, 489 insertions(+), 149 deletions(-)

diff --git a/controller/lflow.c b/controller/lflow.c
index 8468cb4..37a621a 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -1228,6 +1228,12 @@ add_lb_vip_hairpin_reply_action(struct in6_addr *vip6, 
ovs_be32 vip,
 ofpact_finish_LEARN(ofpacts, );
 }
 
+/* Adds flows to detect hairpin sessions.
+ *
+ * For backwards compatibilty with older ovn-northd versions, uses
+ * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
+ * original destination tuple stored by ovn-northd.
+ */
 static void
 add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
  struct ovn_lb_vip *lb_vip,
@@ -1243,30 +1249,59 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
 put_load(, sizeof value, MFF_LOG_FLAGS,
  MLF_LOOKUP_LB_HAIRPIN_BIT, 1, );
 
+/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires matching
+ * on ct_state first.
+ */
+if (!lb->hairpin_orig_tuple) {
+uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
+match_set_ct_state_masked(_match, ct_state, ct_state);
+}
+
 if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
 ovs_be32 bip4 = in6_addr_get_mapped_ipv4(_backend->ip);
-ovs_be32 vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
+ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
+ovs_be32 snat_vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
 ? lb->hairpin_snat_ips.ipv4_addrs[0].addr
-: in6_addr_get_mapped_ipv4(_vip->vip);
+: vip4;
 
 match_set_dl_type(_match, htons(ETH_TYPE_IP));
 match_set_nw_src(_match, bip4);
 match_set_nw_dst(_match, bip4);
 
-add_lb_vip_hairpin_reply_action(NULL, vip4, lb_proto,
+if (!lb->hairpin_orig_tuple) {
+match_set_ct_nw_dst(_match, vip4);
+} else {
+match_set_reg(_match,
+  MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
+  ntohl(vip4));
+}
+
+add_lb_vip_hairpin_reply_action(NULL, snat_vip4, lb_proto,
 lb_backend->port,
 lb->slb->header_.uuid.parts[0],
 );
 } else {
 struct in6_addr *bip6 = _backend->ip;
-struct in6_addr *vip6 = lb->hairpin_snat_ips.n_ipv6_addrs
-? >hairpin_snat_ips.ipv6_addrs[0].addr
-: _vip->vip;
+struct in6_addr *snat_vip6 =
+lb->hairpin_snat_ips.n_ipv6_addrs
+? >hairpin_snat_ips.ipv6_addrs[0].addr
+: _vip->vip;
 match_set_dl_type(_match, htons(ETH_TYPE_IPV6));
 match_set_ipv6_src(_match, bip6);
 match_set_ipv6_dst(_match, bip6);
 
-add_lb_vip_hairpin_reply_action(vip6, 0, lb_proto,
+if (!lb->hairpin_orig_tuple) {
+match_set_ct_ipv6_dst(_match, _vip->vip);
+} else {
+ovs_be128 vip6_value;
+
+memcpy(_value, _vip->vip, sizeof vip6_value);
+match_set_xxreg(_match,
+MFF_LOG_LB_ORIG_DIP_IPV6 - MFF_LOG_XXREG0,
+ntoh128(vip6_value));
+}
+
+add_lb_vip_hairpin_reply_action(snat_vip6, 0, lb_proto,
 lb_backend->port,
 

[ovs-dev] [PATCH ovn v2 3/3] northd: Avoid matching on ct.dnat flags for load balancers.

2021-02-24 Thread Dumitru Ceara
Matching on ct.dnat creates openflows that often are not offloadable
to hardware.  ovn-northd uses ct.dnat only for load balancer hairpin
traffic handling and it turns out we don't really need to match on
ct.dnat.

Acked-by: Numan Siddique 
Signed-off-by: Dumitru Ceara 
---
V2:
- Address Numan's comments:
  - Add ovn-northd.at unit test for new hairpin logical flows.
---
 northd/ovn-northd.8.xml |   32 +++--
 northd/ovn-northd.c |   52 +--
 tests/ovn-northd.at |   29 ++
 3 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index deffe8c..a16937a 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -813,19 +813,12 @@
   
 If the logical switch has load balancer(s) configured, then a
 priority-100 flow is added with the match
-ip  ct.trk ct.dnat to check if the
+ip  ct.trk to check if the
 packet needs to be hairpinned (if after load balancing the destination
-IP matches the source IP) or not by executing the action
-reg0[6] = chk_lb_hairpin(); and advances the packet to
-the next table.
-  
-
-  
-If the logical switch has load balancer(s) configured, then a
-priority-90 flow is added with the match ip to check if
-the packet is a reply for a hairpinned connection or not by executing
-the action reg0[6] = chk_lb_hairpin_reply(); and advances
-the packet to the next table.
+IP matches the source IP) or not by executing the actions
+reg0[6] = chk_lb_hairpin(); and
+reg0[12] = chk_lb_hairpin_reply(); and advances the packet
+to the next table.
   
 
   
@@ -838,16 +831,25 @@
   
  If the logical switch has load balancer(s) configured, then a
  priority-100 flow is added with the match
- ip  (ct.new || ct.est)  ct.trk 
- ct.dnat  reg0[6] == 1 which hairpins the traffic by
+ ip  ct.new  ct.trk 
+ reg0[6] == 1 which hairpins the traffic by
  NATting source IP to the load balancer VIP by executing the action
  ct_snat_to_vip and advances the packet to the next table.
   
 
   
  If the logical switch has load balancer(s) configured, then a
+ priority-100 flow is added with the match
+ ip  ct.est  ct.trk 
+ reg0[6] == 1 which hairpins the traffic by
+ NATting source IP to the load balancer VIP by executing the action
+ ct_snat and advances the packet to the next table.
+  
+
+  
+ If the logical switch has load balancer(s) configured, then a
  priority-90 flow is added with the match
- ip  reg0[6] == 1 which matches on the replies
+ ip  reg0[12] == 1 which matches on the replies
  of hairpinned traffic (i.e., destination IP is VIP,
  source IP is the backend IP and source L4 port is backend port for L4
  load balancers) and executes ct_snat and advances the
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 5a81211..746995f 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -227,6 +227,7 @@ enum ovn_stage {
 #define REGBIT_ACL_HINT_DROP  "reg0[9]"
 #define REGBIT_ACL_HINT_BLOCK "reg0[10]"
 #define REGBIT_LKUP_FDB   "reg0[11]"
+#define REGBIT_HAIRPIN_REPLY  "reg0[12]"
 
 #define REG_ORIG_DIP_IPV4 "reg1"
 #define REG_ORIG_DIP_IPV6 "xxreg1"
@@ -266,7 +267,8 @@ enum ovn_stage {
  *
  * Logical Switch pipeline:
  * ++--+---+--+
- * | R0 | REGBIT_{CONNTRACK/DHCP/DNS/HAIRPIN}  |   |  |
+ * | R0 | REGBIT_{CONNTRACK/DHCP/DNS}  |   |  |
+ * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   | X |  |
  * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} | X |  |
  * ++--+ X |  |
  * | R1 | ORIG_DIP_IPV4 (>= IN_STATEFUL)   | R |  |
@@ -6036,39 +6038,49 @@ build_lb_hairpin(struct ovn_datapath *od, struct hmap 
*lflows)
 ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 0, "1", "next;");
 
 if (od->has_lb_vip) {
-/* Check if the packet needs to be hairpinned. */
-ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_HAIRPIN, 100,
-"ip && ct.trk && ct.dnat",
-REGBIT_HAIRPIN " = chk_lb_hairpin(); next;",
+/* Check if the packet needs to be hairpinned.
+ * Set REGBIT_HAIRPIN in the original direction and
+ * REGBIT_HAIRPIN_REPLY in the reply direction.
+ */
+ovn_lflow_add_with_hint(
+lflows, od, S_SWITCH_IN_PRE_HAIRPIN, 100, "ip && ct.trk",
+REGBIT_HAIRPIN " = 

[ovs-dev] [PATCH ovn v2 2/3] lflow: Avoid matching on conntrack original tuple if possible.

2021-02-24 Thread Dumitru Ceara
Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() creates openflows
that often are not offloadable to hardware.  This was used for
detecting load balancer hairpin sessions.

However, it can be avoided if ovn-northd stores the original
destination tuple in OVS registers.  For backwards compatibility,
during upgrade, fall back to matching on the conntrack original tuple.

Acked-by: Numan Siddique 
Signed-off-by: Dumitru Ceara 
---
V2:
- Fixed usage of MFF_LOG_LB_ORIG_DIP_IPV6 after change in patch 1/3.
---
 controller/lflow.c |   50 +
 tests/ovn.at   |   90 ++--
 2 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/controller/lflow.c b/controller/lflow.c
index 37a621a..a3d84af 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -1341,6 +1341,12 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
 ofpbuf_uninit();
 }
 
+/* Adds flows to perform SNAT for hairpin sessions.
+ *
+ * For backwards compatibilty with older ovn-northd versions, uses
+ * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
+ * original destination tuple stored by ovn-northd.
+ */
 static void
 add_lb_ct_snat_vip_flows(struct ovn_controller_lb *lb,
  struct ovn_lb_vip *lb_vip,
@@ -1383,20 +1389,50 @@ add_lb_ct_snat_vip_flows(struct ovn_controller_lb *lb,
 ofpact_finish(, >ofpact);
 
 struct match match = MATCH_CATCHALL_INITIALIZER;
+
+/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires matching
+ * on ct_state first.
+ */
+if (!lb->hairpin_orig_tuple) {
+uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
+match_set_ct_state_masked(, ct_state, ct_state);
+}
+
 if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
+ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
+
 match_set_dl_type(, htons(ETH_TYPE_IP));
-match_set_ct_nw_dst(, in6_addr_get_mapped_ipv4(_vip->vip));
+
+if (!lb->hairpin_orig_tuple) {
+match_set_ct_nw_dst(, vip4);
+} else {
+match_set_reg(, MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
+  ntohl(vip4));
+}
 } else {
 match_set_dl_type(, htons(ETH_TYPE_IPV6));
-match_set_ct_ipv6_dst(, _vip->vip);
+
+if (!lb->hairpin_orig_tuple) {
+match_set_ct_ipv6_dst(, _vip->vip);
+} else {
+ovs_be128 vip6_value;
+
+memcpy(_value, _vip->vip, sizeof vip6_value);
+match_set_xxreg(, MFF_LOG_LB_ORIG_DIP_IPV6 - MFF_LOG_XXREG0,
+ntoh128(vip6_value));
+}
 }
 
 match_set_nw_proto(, lb_proto);
-match_set_ct_nw_proto(, lb_proto);
-match_set_ct_tp_dst(, htons(lb_vip->vip_port));
-
-uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
-match_set_ct_state_masked(, ct_state, ct_state);
+if (lb_vip->vip_port) {
+if (!lb->hairpin_orig_tuple) {
+match_set_ct_nw_proto(, lb_proto);
+match_set_ct_tp_dst(, htons(lb_vip->vip_port));
+} else {
+match_set_reg_masked(, MFF_LOG_LB_ORIG_TP_DPORT - MFF_REG0,
+ lb_vip->vip_port, UINT16_MAX);
+}
+}
 
 for (size_t i = 0; i < lb->slb->n_datapaths; i++) {
 match_set_metadata(,
diff --git a/tests/ovn.at b/tests/ovn.at
index b061f88..b465784 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -23700,7 +23700,7 @@ NXST_FLOW reply (xid=0x8):
 ])
 
 AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=70 | ofctl_strip_all | grep 
-v NXST], [0], [dnl
- table=70, 
priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.88,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
 actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
+ table=70, priority=100,tcp,reg1=0x58585858,reg2=0x1f90/0x,metadata=0x1 
actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
 ])
 
 AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=68 | grep -v NXST], [1], 
[dnl
@@ -23729,8 +23729,8 @@ NXST_FLOW reply (xid=0x8):
 ])
 
 AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=70 | ofctl_strip_all | grep 
-v NXST], [0], [dnl
- table=70, 
priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.88,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
 actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
- table=70, 
priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.90,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
 actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.90))
+ table=70, priority=100,tcp,reg1=0x58585858,reg2=0x1f90/0x,metadata=0x1 
actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
+ table=70, priority=100,tcp,reg1=0x5858585a,reg2=0x1f90/0x,metadata=0x1 
actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.90))
 ])
 
 AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=68 | grep -v NXST], [1], 
[dnl
@@ -23760,8 +23760,8 @@ AT_CHECK([as hv2 ovs-ofctl 

[ovs-dev] [PATCH v2 ovn 0/3] Handle LB VIPs that share backends and make flows offloadable.

2021-02-24 Thread Dumitru Ceara
Patch 1/3 fixes a bug when using learn() action to generate hairpin
reply flows for different VIPs that share backends.  It also addresses
backwards compatibility in order to avoid having issues during upgrades.

Patches 2/3 and 3/3 use the new OVS registers populated by patch 1/3 to
simplify the hairpin flows and to avoid matching on conntrack original
tuple fields or ct.dnat as tests have shown that these are not offloadable
on specific NICs (e.g., MLX-CX5).

Changes in V2:
- Addressed Numan's comments.
- Added Numan's acks.

Dumitru Ceara (3):
  Properly handle hairpin traffic for VIPs with shared backends.
  lflow: Avoid matching on conntrack original tuple if possible.
  northd: Avoid matching on ct.dnat flags for load balancers.


 controller/lflow.c   |  107 +-
 include/ovn/logical-fields.h |8 +
 lib/lb.c |3 
 lib/lb.h |3 
 northd/ovn-northd.8.xml  |   53 +++--
 northd/ovn-northd.c  |  143 +++---
 ovn-sb.xml   |6 +
 tests/ofproto-macros.at  |2 
 tests/ovn-northd.at  |   57 -
 tests/ovn.at |  443 +-
 tests/system-ovn.at  |   40 ++--
 11 files changed, 642 insertions(+), 223 deletions(-)

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix crash at ofproto_destroy

2021-02-24 Thread Ilya Maximets
On 2/24/21 3:29 PM, Ilya Maximets wrote:
> On 7/17/20 3:50 AM, hepeng.0320 wrote:
>> From: Peng He 
>>
>> The call stack of rule_destroy_cb:
>>
>> remove_rules_postponed (one grace period)
>> -> remove_rule_rcu
>> -> remove_rule_rcu__
>> -> ofproto_rule_unref -> ref count != 1
>> -> ... more grace periods.
>> -> rule_destroy_cb (> 2 grace periods)
>> -> free
>>
>> Currently ofproto_destory waits only two grace periods, this means when
>> rule_destroy_cb is called, the ofproto might have been already destroyed.
>>
>> This patch adds a refcount for ofproto to ensure ofproto exists when it
>> is needed to call free functions.
>>
>> This patch fixes the crashes found:
>> Program terminated with signal SIGSEGV, Segmentation fault.
>> 0  0x562a55169e49 in rule_destroy_cb (rule=0x562a598be2c0) at 
>> ofproto/ofproto.c:2956
>> 1  0x562a552623d6 in ovsrcu_call_postponed () at lib/ovs-rcu.c:348
>> 2  0x562a55262504 in ovsrcu_postpone_thread (arg=) at 
>> lib/ovs-rcu.c:364
>> 3  0x562a55264aef in ovsthread_wrapper (aux_=) at 
>> lib/ovs-thread.c:383
>> 4  0x7febe715a4a4 in start_thread (arg=0x7febe4c1c700) at 
>> pthread_create.c:456
>> 5  0x7febe6990d0f in clone () at 
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
>> (gdb) p rule->ofproto->ofproto_class
>> $3 = (const struct ofproto_class *) 0x0
>>
>> Also:
>> 0  ofproto_dpif_credit_table_stats (ofproto=0x5583583332b0, table_id=0 
>> '\000', n_matches=5, n_misses=0) at ofproto/ofproto-dpif.c:4310
>> 1  0x558354c68514 in xlate_push_stats_entry 
>> (entry=entry@entry=0x7ff488031398, stats=stats@entry=0x7ff49af30b60) at 
>> ofproto/ofproto-dpif-xlate-cache.c:99
>> 2  0x558354c686f3 in xlate_push_stats (xcache=, 
>> stats=stats@entry=0x7ff49af30b60) at ofproto/ofproto-dpif-xlate-cache.c:181
>> 3  0x558354c5411a in revalidate_ukey (udpif=udpif@entry=0x5583583baba0, 
>> ukey=ukey@entry=0x7ff47809f770, stats=stats@entry=0x7ff49af32128, 
>> odp_actions=odp_actions@entry=0x7ff49af30c50, 
>> reval_seq=reval_seq@entry=275670456,
>>recircs=recircs@entry=0x7ff49af30cd0) at 
>> ofproto/ofproto-dpif-upcall.c:2292
>> 4  0x558354c57cbc in revalidate (revalidator=) at 
>> ofproto/ofproto-dpif-upcall.c:2681
>> 5  0x558354c57f8e in udpif_revalidator (arg=0x5583583d2a90) at 
>> ofproto/ofproto-dpif-upcall.c:934
>> 6  0x558354d24aef in ovsthread_wrapper (aux_=) at 
>> lib/ovs-thread.c:383
>> 7  0x7ff4a479d4a4 in start_thread (arg=0x7ff49af35700) at 
>> pthread_create.c:456
>> 8  0x7ff4a3fd3d0f in clone () at 
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
>> (gdb) p ofproto->up.ofproto_class
>> $3 = (const struct ofproto_class *) 0x0
>>
>> Signed-off-by: Peng He 
>> ---

CC: Hongzhi Guo

And I just remembered that there is a very similar patch that
was submitted to the mail-list several months betore this one.
Ben started review, but didn't finish:

http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.19884-1-guohongz...@huawei.com/

Both patches has smilar issues, i.e. mixing RCU with refcounters
and not covering all the cases.

There are at least 2 major places where ofproto needs to be refcounted:
1. rules
   1.a xlate caches?
2. ofproto groups

IMO, mixing RCU with refcounts basically means that we do not have a
clear understanding on how it works and using RCU as a safety net.
We shuld stop using RCU for orproto and just have refcounts.  Once
ofproto refcount reaches zero it should be immediately destroyed.
All the places where we need to increase refcount needs to be tracked
down.

It'll be great if you can cooperate to prepare a complete solution
for this issue.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v6 ovn] controller: introduce stats counters for ovn-controller incremental processing

2021-02-24 Thread Lorenzo Bianconi
Introduce inc-engine/show-stats ovs-appctl command in order to dump
ovn-controller incremental processing engine statistics. So far for each
node a counter for run, abort and engine_handler have been added.
Counters are incremented when the node move to "updated" state.
In order to dump I-P stats we can can use the following commands:

$ovs-appctl -t ovn-controller inc-engine/show-stats
Node: SB_address_set
- recompute:   38
- compute:  0
- abort:0
Node: addr_sets
- recompute:2
- compute:  0
- abort:1
Node: SB_port_group
- recompute:   37
- compute:  0
- abort:0

Node: flow_output
- recompute:2
- compute: 20
- abort:0

Moreover introduce the inc-engine/clear-stats command to reset engine
statistics

$ovs-appctl -t ovn-controller inc-engine/clear-stats

Signed-off-by: Lorenzo Bianconi 
---
Changes since v5:
- cosmetics
- add missing recompute increments
- move about counter in engine_run routine
- add missing ovs-appctl cmd documentation
Changes since v4:
- rename handlers in inc-engine/show-stats and inc-engine/clear-stats
- move all the code in lib/inc-proc-eng.{c,h}
- instad of run and change_handler, track node recompute and node compute
Changes since v3:
- drop engine_set_note_update_from_run/engine_set_note_update_from_handler
  macros and move stats code in lib/inc-proc-eng.c
- fix commit log
Changes since v2:
- introduce inc-engine/stats and inc-engine/stats-clear commands and drop
  COVERAGE_* dependency
Changes since v1:
- drop handler counters and add global abort counter
- improve documentation and naming scheme
- introduce engine_set_node_updated utility macro
---
 NEWS|  1 +
 controller/ovn-controller.8.xml | 22 
 lib/inc-proc-eng.c  | 46 +
 lib/inc-proc-eng.h  |  9 +++
 4 files changed, 78 insertions(+)

diff --git a/NEWS b/NEWS
index fca96f658..6cbb88add 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,6 @@
 Post-v21.03.0
 -
+  - Introduce ovn-controller incremetal processing engine statistics
 
 OVN v21.03.0 - 5 Mar 2020
 -
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index 51c0c372c..8886df568 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -578,6 +578,28 @@
 Displays logical flow cache statistics: enabled/disabled, per cache
 type entry counts.
   
+
+  inc-engine/show-stats
+  
+Display ovn-controller engine counters. For each engine
+node the following counters have been added:
+
+  
+recompute
+  
+  
+compute
+  
+  
+abort
+  
+
+  
+
+  inc-engine/clear-stats
+  
+Reset ovn-controller engine counters.
+  
   
 
 
diff --git a/lib/inc-proc-eng.c b/lib/inc-proc-eng.c
index 916dbbe39..a6337a1d9 100644
--- a/lib/inc-proc-eng.c
+++ b/lib/inc-proc-eng.c
@@ -27,6 +27,7 @@
 #include "openvswitch/hmap.h"
 #include "openvswitch/vlog.h"
 #include "inc-proc-eng.h"
+#include "unixctl.h"
 
 VLOG_DEFINE_THIS_MODULE(inc_proc_eng);
 
@@ -102,6 +103,40 @@ engine_get_nodes(struct engine_node *node, size_t *n_count)
 return engine_topo_sort(node, NULL, n_count, _size);
 }
 
+static void
+engine_clear_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
+   const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
+{
+for (size_t i = 0; i < engine_n_nodes; i++) {
+struct engine_node *node = engine_nodes[i];
+
+memset(>stats, 0, sizeof node->stats);
+}
+unixctl_command_reply(conn, NULL);
+}
+
+static void
+engine_dump_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
+  const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
+{
+struct ds dump = DS_EMPTY_INITIALIZER;
+
+for (size_t i = 0; i < engine_n_nodes; i++) {
+struct engine_node *node = engine_nodes[i];
+
+ds_put_format(,
+  "Node: %s\n"
+  "- recompute: %12"PRIu64"\n"
+  "- compute:   %12"PRIu64"\n"
+  "- abort: %12"PRIu64"\n",
+  node->name, node->stats.recompute,
+  node->stats.compute, node->stats.abort);
+}
+unixctl_command_reply(conn, ds_cstr());
+
+ds_destroy();
+}
+
 void
 engine_init(struct engine_node *node, struct engine_arg *arg)
 {
@@ -115,6 +150,11 @@ engine_init(struct engine_node *node, struct engine_arg 
*arg)
 engine_nodes[i]->data = NULL;
 }
 }
+
+unixctl_command_register("inc-engine/show-stats", "", 0, 0,
+ engine_dump_stats, NULL);
+unixctl_command_register("inc-engine/clear-stats", "", 0, 0,
+  

Re: [ovs-dev] [PATCH] netdev-linux: correct unit of burst parameter

2021-02-24 Thread 0-day Robot
Bleep bloop.  Greetings Simon Horman, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Unexpected sign-offs from developers who are not authors or co-authors 
or committers: Simon Horman , Louis Peens 

Lines checked: 38, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn 3/3] northd: Avoid matching on ct.dnat flags for load balancers.

2021-02-24 Thread Dumitru Ceara

On 2/24/21 10:17 AM, Numan Siddique wrote:

On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:


Matching on ct.dnat creates openflows that often are not offloadable
to hardware.  ovn-northd uses ct.dnat only for load balancer hairpin
traffic handling and it turns out we don't really need to match on
ct.dnat.

Signed-off-by: Dumitru Ceara 


Acked-by: Numan Siddique 

Can you please add tests in ovn-northd for this ? This would make sure
ovn-northd-ddlog (once we have the ddlog patches)
is not out of sync with northd.


Good idea, thanks for pointing it out!

+Ben

I think that if my patches go in before the ddlog ones it's still ok to 
accept the ddlog ones in their current state.  I can work on a follow up 
to add ddlog support for the new LB hairpin logical flows soon afterwards.




Thanks
Numan



Thanks,
Dumitru

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn 2/3] lflow: Avoid matching on conntrack original tuple if possible.

2021-02-24 Thread Dumitru Ceara

On 2/24/21 10:15 AM, Numan Siddique wrote:

On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:


Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() creates openflows
that often are not offloadable to hardware.  This was used for
detecting load balancer hairpin sessions.

However, it can be avoided if ovn-northd stores the original
destination tuple in OVS registers.  For backwards compatibility,
during upgrade, fall back to matching on the conntrack original tuple.

Signed-off-by: Dumitru Ceara 


Hi Numan,



Acked-by: Numan Siddique 


Thanks for the review!



It will be great if you can enhance the test to pause northd, clear
the lb option
and make sure ovn-controller programs the flow in the old way.


The test I added in patch 1/3 does that already.  There's no change in 
this patch because in backwards compatibility mode flows in table=70 
look exactly the same as they did with patch 1/3.




Thanks
Numan



Thanks,
Dumitru

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] netdev-linux: correct unit of burst parameter

2021-02-24 Thread Simon Horman
From: Yong Xu 

Correct calculation of burst parameter used when configuring TC policer
action for ingress port-based policing in the case where TC offload is in
use. This now matches the value calculated for the case where TC offload is
not in use.

The division by 8 is to convert from bits to bytes.
Its unclear why 64 was previously used.

Fixes: e7f6ba220 ("lib/tc: add ingress ratelimiting support for tc-offload")
Signed-off-by: Yong Xu 
[simon: reworked changelog]
Signed-off-by: Simon Horman 
Signed-off-by: Louis Peens 
---
 lib/netdev-linux.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6be23dbee..eb28777f9 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2572,7 +2572,7 @@ exit:
 static struct tc_police
 tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
 {
-unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
+unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8;
 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
 struct tc_police police;
 struct tc_ratespec rate;
-- 
2.20.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [ovs-dev v2] ofproto: add refcount to ofproto to fix crash at ofproto_destroy

2021-02-24 Thread Ilya Maximets
On 7/17/20 3:50 AM, hepeng.0320 wrote:
> From: Peng He 
> 
> The call stack of rule_destroy_cb:
> 
> remove_rules_postponed (one grace period)
> -> remove_rule_rcu
> -> remove_rule_rcu__
> -> ofproto_rule_unref -> ref count != 1
> -> ... more grace periods.
> -> rule_destroy_cb (> 2 grace periods)
> -> free
> 
> Currently ofproto_destory waits only two grace periods, this means when
> rule_destroy_cb is called, the ofproto might have been already destroyed.
> 
> This patch adds a refcount for ofproto to ensure ofproto exists when it
> is needed to call free functions.
> 
> This patch fixes the crashes found:
> Program terminated with signal SIGSEGV, Segmentation fault.
> 0  0x562a55169e49 in rule_destroy_cb (rule=0x562a598be2c0) at 
> ofproto/ofproto.c:2956
> 1  0x562a552623d6 in ovsrcu_call_postponed () at lib/ovs-rcu.c:348
> 2  0x562a55262504 in ovsrcu_postpone_thread (arg=) at 
> lib/ovs-rcu.c:364
> 3  0x562a55264aef in ovsthread_wrapper (aux_=) at 
> lib/ovs-thread.c:383
> 4  0x7febe715a4a4 in start_thread (arg=0x7febe4c1c700) at 
> pthread_create.c:456
> 5  0x7febe6990d0f in clone () at 
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> (gdb) p rule->ofproto->ofproto_class
> $3 = (const struct ofproto_class *) 0x0
> 
> Also:
> 0  ofproto_dpif_credit_table_stats (ofproto=0x5583583332b0, table_id=0 
> '\000', n_matches=5, n_misses=0) at ofproto/ofproto-dpif.c:4310
> 1  0x558354c68514 in xlate_push_stats_entry 
> (entry=entry@entry=0x7ff488031398, stats=stats@entry=0x7ff49af30b60) at 
> ofproto/ofproto-dpif-xlate-cache.c:99
> 2  0x558354c686f3 in xlate_push_stats (xcache=, 
> stats=stats@entry=0x7ff49af30b60) at ofproto/ofproto-dpif-xlate-cache.c:181
> 3  0x558354c5411a in revalidate_ukey (udpif=udpif@entry=0x5583583baba0, 
> ukey=ukey@entry=0x7ff47809f770, stats=stats@entry=0x7ff49af32128, 
> odp_actions=odp_actions@entry=0x7ff49af30c50, 
> reval_seq=reval_seq@entry=275670456,
>recircs=recircs@entry=0x7ff49af30cd0) at ofproto/ofproto-dpif-upcall.c:2292
> 4  0x558354c57cbc in revalidate (revalidator=) at 
> ofproto/ofproto-dpif-upcall.c:2681
> 5  0x558354c57f8e in udpif_revalidator (arg=0x5583583d2a90) at 
> ofproto/ofproto-dpif-upcall.c:934
> 6  0x558354d24aef in ovsthread_wrapper (aux_=) at 
> lib/ovs-thread.c:383
> 7  0x7ff4a479d4a4 in start_thread (arg=0x7ff49af35700) at 
> pthread_create.c:456
> 8  0x7ff4a3fd3d0f in clone () at 
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> (gdb) p ofproto->up.ofproto_class
> $3 = (const struct ofproto_class *) 0x0
> 
> Signed-off-by: Peng He 
> ---
>  ofproto/ofproto-dpif-xlate-cache.c |  1 +
>  ofproto/ofproto-dpif.c | 22 
>  ofproto/ofproto-provider.h |  2 ++
>  ofproto/ofproto.c  | 41 +++---
>  ofproto/ofproto.h  |  4 +++
>  5 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/ofproto/ofproto-dpif-xlate-cache.c 
> b/ofproto/ofproto-dpif-xlate-cache.c
> index dcc91cb38..0deee365d 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.c
> +++ b/ofproto/ofproto-dpif-xlate-cache.c
> @@ -209,6 +209,7 @@ xlate_cache_clear_entry(struct xc_entry *entry)
>  {
>  switch (entry->type) {
>  case XC_TABLE:
> +ofproto_unref(&(entry->table.ofproto->up));
>  break;
>  case XC_RULE:
>  ofproto_rule_unref(>rule->up);
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index 4f0638f23..6480b3c78 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -4414,11 +4414,12 @@ rule_dpif_lookup_from_table(struct ofproto_dpif 
> *ofproto,
>  }
>  if (xcache) {
>  struct xc_entry *entry;
> -
> -entry = xlate_cache_add_entry(xcache, XC_TABLE);
> -entry->table.ofproto = ofproto;
> -entry->table.id = *table_id;
> -entry->table.match = true;
> +if (ofproto_try_ref(>up)) {
> +entry = xlate_cache_add_entry(xcache, XC_TABLE);

You're handling only XC_TABLE cache entries.  Do we need to ref ofproto
for other types of cache?  XC_NORMAL holds the pointer too.

> +entry->table.ofproto = ofproto;
> +entry->table.id = *table_id;
> +entry->table.match = true;
> +}
>  }
>  return rule;
>  }
> @@ -4450,11 +4451,12 @@ rule_dpif_lookup_from_table(struct ofproto_dpif 
> *ofproto,
>  }
>  if (xcache) {
>  struct xc_entry *entry;
> -
> -entry = xlate_cache_add_entry(xcache, XC_TABLE);
> -entry->table.ofproto = ofproto;
> -entry->table.id = next_id;
> -entry->table.match = (rule != NULL);
> +if (ofproto_try_ref(>up)) {
> +entry = xlate_cache_add_entry(xcache, 

[ovs-dev] [PATCH ovn] northd: Remove the usage of 'ct.inv' in logical flows.

2021-02-24 Thread numans
From: Numan Siddique 

Presently we add 65535 priority lflows in the stages -
'ls_in_acl' and 'ls_out_acl' to drop packets which
match on 'ct.inv'.

As per the 'ovs-fields' man page, this
ct state field can be used to identify problems such as:
 •  L3/L4 protocol handler is not loaded/unavailable.

 •  L3/L4 protocol handler determines that the packet is
malformed.

 •  Packets are unexpected length for protocol.

This patch removes the usage of this field for the following
reasons:

 • Some of the smart NICs which support offloading datapath
   flows don't support this field.

 • A recent commit in kernel ovs datapath sets the committed
   connection tracking entry to be liberal for out-of-window
   tcp packets (nf_ct_set_tcp_be_liberal()).  Such TCP
   packets will not be marked as invalid.

 • Even if a ct.inv packet is delivered to a VIF, the
   networking stack of the VIF's kernel can handle such
   packets.

Signed-off-by: Numan Siddique 
---
 northd/ovn-northd.c | 24 
 tests/ovn-northd.at | 24 
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index dcdb777a2..e30fb532c 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -5617,10 +5617,10 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows,
  *
  * This is enforced at a higher priority than ACLs can be defined. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
-  "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)",
+  "ct.est && ct.rpl && ct_label.blocked == 1",
   "drop;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
-  "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)",
+  "ct.est && ct.rpl && ct_label.blocked == 1",
   "drop;");
 
 /* Ingress and Egress ACL Table (Priority 65535).
@@ -5633,12 +5633,12 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows,
  *
  * This is enforced at a higher priority than ACLs can be defined. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
-  "ct.est && !ct.rel && !ct.new && !ct.inv "
-  "&& ct.rpl && ct_label.blocked == 0",
+  "ct.est && !ct.rel && !ct.new && "
+  "ct.rpl && ct_label.blocked == 0",
   "next;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
-  "ct.est && !ct.rel && !ct.new && !ct.inv "
-  "&& ct.rpl && ct_label.blocked == 0",
+  "ct.est && !ct.rel && !ct.new && "
+  "ct.rpl && ct_label.blocked == 0",
   "next;");
 
 /* Ingress and Egress ACL Table (Priority 65535).
@@ -5653,12 +5653,12 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows,
  * related traffic such as an ICMP Port Unreachable through
  * that's generated from a non-listening UDP port.  */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
-  "!ct.est && ct.rel && !ct.new && !ct.inv "
-  "&& ct_label.blocked == 0",
+  "!ct.est && ct.rel && !ct.new && "
+  "ct_label.blocked == 0",
   "next;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
-  "!ct.est && ct.rel && !ct.new && !ct.inv "
-  "&& ct_label.blocked == 0",
+  "!ct.est && ct.rel && !ct.new && "
+  "ct_label.blocked == 0",
   "next;");
 
 /* Ingress and Egress ACL Table (Priority 65535).
@@ -5846,11 +5846,11 @@ build_lb(struct ovn_datapath *od, struct hmap *lflows)
  *
  * Send established traffic through conntrack for just NAT. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX - 1,
-  "ct.est && !ct.rel && !ct.new && !ct.inv && "
+  "ct.est && !ct.rel && !ct.new && "
   "ct_label.natted == 1",
   REGBIT_CONNTRACK_NAT" = 1; next;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, UINT16_MAX - 1,
-  "ct.est && !ct.rel && !ct.new && !ct.inv && "
+  "ct.est && !ct.rel && !ct.new && "
   "ct_label.natted == 1",
   REGBIT_CONNTRACK_NAT" = 1; next;");
 }
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index ad0f9f562..dc49c2543 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -1940,9 +1940,9 @@ AT_CHECK([ovn-sbctl lflow-list ls | grep -e 
ls_in_acl_hint -e ls_out_acl_hint -e
   table=4 (ls_out_acl_hint), priority=6, match=(!ct.new && ct.est && 
!ct.rpl && ct_label.blocked == 1), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
   

Re: [ovs-dev] [PATCH RESEND v2 1/3] dpif-netdev: Fix the meter buckets overflow.

2021-02-24 Thread 0-day Robot
Bleep bloop.  Greetings Tonghao Zhang, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
ERROR: Inappropriate spacing around cast
#52 FILE: lib/dpif-netdev.c:6213:
band->bucket += (uint64_t)delta_t * band->rate;

Lines checked: 110, Warnings: 0, Errors: 1


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] dpctl/add-flow: Fix ovs-dpdk crash when add dp flow without in_port field.

2021-02-24 Thread Ilya Maximets
On 2/24/21 3:51 AM, Mao YingMing wrote:
> Fix the following ovs-dpdk crash:
> 
> $ ovs-appctl dpctl/add-flow  "eth(),eth_type(0x0800),ipv4()" "3"
> unixctl|WARN|error communicating with 
> unix:/usr/local/var/run/openvswitch/ovs-vswitchd.1995.ctl: End of file
> ovs-appctl: ovs-vswitchd: transaction error (End of file)
> 
> ovs-vswitchd.log record:
> util(ovs-vswitchd)|EMER|lib/dpif-netdev.c:3638: assertion 
> match->wc.masks.in_port.odp_port == ODPP_NONE failed in dp_netdev_flow_add()
> daemon_unix(monitor)|ERR|2 crashes: pid 1995 died, killed (Aborted), core 
> dumped, restarting

Good catch.  Even though 'dpctl/add-flow' is a debug interface
and this should not happen in normal case, we need to fix this.

Could you, please, add a unit test for this issue to tests/dpif-netdev.at ?

Some comments inline.

Best regards, Ilya Maximets.

> 
> Fix result:
> $ ovs-appctl dpctl/add-flow  "eth(),eth_type(0x0800),ipv4()" "3"
> ovs-vswitchd: updating flow table (Invalid argument)
> ovs-appctl: ovs-vswitchd: server returned an error
> 
> ovs-vswitchd.log record:
> dpif_netdev|ERR|missing in_port info in dpif_netdev_flow_put
> dpif|WARN|netdev@ovs-netdev: failed to put[create] (Invalid argument) 
> ufid:c6994eb3-9c27-467d-9c41-8d235e1511b5 
> eth(src=00:00:00:00:00:00/00:00:00:00:00:00,dst=00:00:00:00:00:00/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=0.0.0.0/0.0.0.0,dst=0.0.0.0/0.0.0.0,proto=0/0,tos=0/0,ttl=0/0),
>  actions:3
> 
> Signed-off-by: Mao YingMing 
> ---
>  lib/dpif-netdev.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e3fd0a0..567c3d1 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -3834,6 +3834,12 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct 
> dpif_flow_put *put)
>  return error;
>  }
>  
> +if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
> +VLOG_ERR("missing in_port info in %s", __func__);

We likely need the error message rate limited to avoid flood in logs
in case we ever hit that in a normal scenario.  You could create a
separate rate_limit variable and use VLOG_ERR_RL macro.

The message itself might be a bit more user-friendly.  Maybe something like:

VLOG_ERR_RL(, "failed to put%s flow: in_port is not an exact match",
(put->flags & DPIF_FP_CREATE) ? "[create]"
: (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");


> +error = EINVAL;

No need to set error. Just return EINVAL.

> +return error;
> +}> +
>  if (put->ufid) {
>  ufid = *put->ufid;
>  } else {
> 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovs-dev v3 1/4] dpif-netdev: Expand the meters supported number.

2021-02-24 Thread Tonghao Zhang
Now this patch version is v3. and stay a long time. Any maintainer
will continue to review this patch ?  Thanks!

On Sat, May 23, 2020 at 6:33 PM  wrote:
>
> From: Tonghao Zhang 
>
> For now, ovs-vswitchd use the array of the dp_meter struct
> to store meter's data, and at most, there are only 65536
> (defined by MAX_METERS) meters that can be used. But in some
> case, for example, in the edge gateway, we should use 200,000,
> at least, meters for IP address bandwidth limitation.
> Every one IP address will use two meters for its rx and tx
> path[1]. In other way, ovs-vswitchd should support meter-offload
> (rte_mtr_xxx api introduced by dpdk.), but there are more than
> 65536 meters in the hardware, such as Mellanox ConnectX-6.
>
> This patch use array to manage the meter, but it can ben expanded.
>
> [1].
> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
>
> Cc: Ilya Maximets 
> Cc: William Tu 
> Cc: Jarno Rajahalme 
> Cc: Ben Pfaff 
> Cc: Andy Zhou 
> Cc: Pravin Shelar 
> Signed-off-by: Tonghao Zhang 
> ---
> v3:
> * rename n_meters -> n_allocated in dp_meter_instance
> * rename count -> n_used in dp_meter_table
> * rename "ti" -> "meter_inst" or "inst" in different functions/struction
> * remove parenthesize for sizeof
> * rename dp_netdev_meter_destroy/init to dp_netdev_meter_table_destroy/init
> * fix OVS_REQUIRES style
> v2:
> * add comments for dp_meter_instance
> * change the log
> * remove extra newline
> * I don't move the dp_netdev_meter_init/destroy up. because
>   them depends other meters function and put all meter function
>   together may make the codes clean.
> ---
>  lib/dpif-netdev.c | 319 --
>  1 file changed, 250 insertions(+), 69 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 51c888501bdf..920fef3ec572 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -99,9 +99,12 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>
>  /* Configuration parameters. */
>  enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
> -enum { MAX_METERS = 65536 };/* Maximum number of meters. */
> -enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
> -enum { N_METER_LOCKS = 64 };/* Maximum number of meters. */
> +
> +/* Maximum number of meters in the table. */
> +#define METER_ENTRY_MAX (20ULL)
> +/* Maximum number of bands / meter. */
> +#define METER_BAND_MAX  (8)
> +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10)
>
>  COVERAGE_DEFINE(datapath_drop_meter);
>  COVERAGE_DEFINE(datapath_drop_upcall_error);
> @@ -284,12 +287,26 @@ struct dp_meter {
>  uint16_t flags;
>  uint16_t n_bands;
>  uint32_t max_delta_t;
> +uint32_t id;
> +struct ovs_mutex lock;
>  uint64_t used;
>  uint64_t packet_count;
>  uint64_t byte_count;
>  struct dp_meter_band bands[];
>  };
>
> +struct dp_meter_instance {
> +uint32_t n_allocated;
> +/* Followed by struct dp_meter[n]; where n is the n_allocated. */
> +OVSRCU_TYPE(struct dp_meter *) dp_meters[];
> +};
> +
> +struct dp_meter_table {
> +OVSRCU_TYPE(struct dp_meter_instance *) meter_inst;
> +uint32_t n_used;
> +struct ovs_mutex lock;
> +};
> +
>  struct pmd_auto_lb {
>  bool auto_lb_requested; /* Auto load balancing requested by user. */
>  bool is_enabled;/* Current status of Auto load balancing. */
> @@ -330,8 +347,7 @@ struct dp_netdev {
>  atomic_uint32_t tx_flush_interval;
>
>  /* Meters. */
> -struct ovs_mutex meter_locks[N_METER_LOCKS];
> -struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> +struct dp_meter_table meter_tbl;
>
>  /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
>  OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> @@ -379,19 +395,6 @@ struct dp_netdev {
>  struct pmd_auto_lb pmd_alb;
>  };
>
> -static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> -OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -ovs_mutex_lock(>meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
> -OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -ovs_mutex_unlock(>meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -
>  static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev 
> *dp,
>  odp_port_t)
>  OVS_REQUIRES(dp->port_mutex);
> @@ -1524,6 +1527,9 @@ choose_port(struct dp_netdev *dp, const char *name)
>  return ODPP_NONE;
>  }
>
> +static void dp_netdev_meter_table_init(struct dp_meter_table *tbl);
> +static void dp_netdev_meter_table_destroy(struct dp_meter_table *tbl);
> +
>  static int
>  create_dp_netdev(const char *name, const struct dpif_class *class,
>   struct dp_netdev **dpp)
> @@ -1557,9 

[ovs-dev] [PATCH RESEND v2 3/3] tests/dpif-netdev: Update dpif-netdev meter testcase.

2021-02-24 Thread xiangxia . m . yue
From: Tonghao Zhang 

The buckets used was changed, and now dpif-netdev support
burst_size, change the testcase.

Signed-off-by: Tonghao Zhang 
---
 tests/dpif-netdev.at | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 2862a3c9b96d..3aa9c0cba518 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -282,7 +282,7 @@ OVS_VSWITCHD_START(
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps burst stats 
bands=type=drop rate=1 burst_size=1'])
-AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=2 kbps burst stats 
bands=type=drop rate=1 burst_size=2'])
+AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=2 kbps burst stats 
bands=type=drop rate=1 burst_size=1'])
 AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 'in_port=1 action=meter:1,7'])
 AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 'in_port=7 action=meter:2,1'])
 AT_CHECK([ovs-ofctl add-flow br1 'in_port=2 action=8'])
@@ -295,7 +295,7 @@ meter=1 pktps burst stats bands=
 type=drop rate=1 burst_size=1
 
 meter=2 kbps burst stats bands=
-type=drop rate=1 burst_size=2
+type=drop rate=1 burst_size=1
 ])
 
 ovs-appctl time/warp 5000
@@ -312,14 +312,14 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p8 
'in_port(8),packet_type(ns=0,id=0),
 sleep 1  # wait for forwarders process packets
 
 # Meter 1 is measuring packets, allowing one packet per second with
-# bursts of one packet, so 4 out of 5 packets should hit the drop
+# bursts of one packet, so 3 out of 5 packets should hit the drop
 # band.
-# Meter 2 is measuring kbps, with burst size 2 (== 2000 bits). 4 packets
+# Meter 2 is measuring kbps, with burst size 1 ( = 2000bits). 4 packets
 # (240 bytes == 1920 bits) pass, but the last packet should hit the drop band.
 AT_CHECK([ovs-ofctl -O OpenFlow13 meter-stats br0 | strip_timers], [0], [dnl
 OFPST_METER reply (OF1.3) (xid=0x2):
 meter:1 flow_count:1 packet_in_count:5 byte_in_count:300 duration:0.0s bands:
-0: packet_count:4 byte_count:240
+0: packet_count:3 byte_count:180
 
 meter:2 flow_count:1 packet_in_count:5 byte_in_count:300 duration:0.0s bands:
 0: packet_count:1 byte_count:60
@@ -343,13 +343,13 @@ sleep 1  # wait for forwarders process packets
 # Meter 1 is measuring packets, allowing one packet per second with
 # bursts of one packet, so all 5 of the new packets should hit the drop
 # band.
-# Meter 2 is measuring kbps, with burst size 2 (== 2000 bits). After 500ms
+# Meter 2 is measuring kbps, with burst size 1 (== 2000 bits). After 500ms
 # there should be space for 80 + 500 bits, so one new 60 byte (480 bit) packet
 # should pass, remaining 4 should hit the drop band.
 AT_CHECK([ovs-ofctl -O OpenFlow13 meter-stats br0 | strip_timers], [0], [dnl
 OFPST_METER reply (OF1.3) (xid=0x2):
 meter:1 flow_count:1 packet_in_count:10 byte_in_count:600 duration:0.0s bands:
-0: packet_count:9 byte_count:540
+0: packet_count:8 byte_count:480
 
 meter:2 flow_count:1 packet_in_count:10 byte_in_count:600 duration:0.0s bands:
 0: packet_count:5 byte_count:300
@@ -360,7 +360,7 @@ ovs-appctl time/warp 5000
 AT_CHECK([
 ovs-appctl coverage/read-counter datapath_drop_meter
 ], [0], [dnl
-14
+13
 ])
 
 AT_CHECK([cat ovs-vswitchd.log | filter_flow_install | 
strip_xout_keep_actions], [0], [dnl
-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RESEND v2 2/3] dpif-netdev: Add the burst size to buckets.

2021-02-24 Thread xiangxia . m . yue
From: Tonghao Zhang 

For now, the meter of the userspace datapath, don't include
the bucket burst size to buckets. This patch includes it now.

$ ovs-ofctl -O OpenFlow13 add-meter br0 \
'meter=1 pktps burst stats bands=type=drop rate=1 burst_size=2000'

Signed-off-by: Tonghao Zhang 
---
 lib/dpif-netdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 614f6fef6b77..94632b85b375 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6209,7 +6209,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct 
dp_packet_batch *packets_,
 
 band = >bands[m];
 /* Update band's bucket. */
-max_bucket_size = band->rate * 1000ULL;
+max_bucket_size = (band->burst_size + band->rate) * 1000ULL;
 band->bucket += (uint64_t)delta_t * band->rate;
 if (band->bucket > max_bucket_size) {
 band->bucket = max_bucket_size;
@@ -6332,7 +6332,8 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id 
meter_id,
 meter->bands[i].rate = config->bands[i].rate;
 meter->bands[i].burst_size = config->bands[i].burst_size;
 /* Start with a full bucket. */
-meter->bands[i].bucket = meter->bands[i].rate * 1000ULL;
+meter->bands[i].bucket =
+(meter->bands[i].burst_size + meter->bands[i].rate) * 1000ULL;
 
 /* Figure out max delta_t that is enough to fill any bucket. */
 band_max_delta_t
-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RESEND v2 1/3] dpif-netdev: Fix the meter buckets overflow.

2021-02-24 Thread xiangxia . m . yue
From: Tonghao Zhang 

When setting the meter rate to 4.3+Gbps, there is an overflow, the
meters don't work as expected.

$ ovs-ofctl -O OpenFlow13 add-meter br-int "meter=1 kbps stats bands=type=drop 
rate=4294968"

Before the patch, the buckets of meters was stored in its burst_size
of ofputil_meter_band. It was overflow when we set the rate to 4294968.
This patch don't change the public API and structure. This patch remove
the "up" from dp_meter_band, and introduce the type, rate to datapath's
meter bands. Then datapath don't depend upper layer.

Signed-off-by: Tonghao Zhang 
---
 lib/dpif-netdev.c | 42 --
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4381c618f1be..614f6fef6b77 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -279,8 +279,10 @@ static bool dpcls_lookup(struct dpcls *cls,
 ( 1 << OFPMBT13_DROP )
 
 struct dp_meter_band {
-struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
-uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
+uint16_t type;
+uint32_t rate;
+uint32_t burst_size;
+uint64_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
 uint64_t packet_count;
 uint64_t byte_count;
 };
@@ -6203,12 +6205,14 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct 
dp_packet_batch *packets_,
 /* Update all bands and find the one hit with the highest rate for each
  * packet (if any). */
 for (int m = 0; m < meter->n_bands; ++m) {
-band = >bands[m];
+uint64_t max_bucket_size;
 
+band = >bands[m];
 /* Update band's bucket. */
-band->bucket += delta_t * band->up.rate;
-if (band->bucket > band->up.burst_size) {
-band->bucket = band->up.burst_size;
+max_bucket_size = band->rate * 1000ULL;
+band->bucket += (uint64_t)delta_t * band->rate;
+if (band->bucket > max_bucket_size) {
+band->bucket = max_bucket_size;
 }
 
 /* Drain the bucket for all the packets, if possible. */
@@ -6226,8 +6230,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct 
dp_packet_batch *packets_,
  * (Only one band will be fired by a packet, and that
  * can be different for each packet.) */
 for (int i = band_exceeded_pkt; i < cnt; i++) {
-if (band->up.rate > exceeded_rate[i]) {
-exceeded_rate[i] = band->up.rate;
+if (band->rate > exceeded_rate[i]) {
+exceeded_rate[i] = band->rate;
 exceeded_band[i] = m;
 }
 }
@@ -6246,8 +6250,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct 
dp_packet_batch *packets_,
 /* Update the exceeding band for the exceeding packet.
  * (Only one band will be fired by a packet, and that
  * can be different for each packet.) */
-if (band->up.rate > exceeded_rate[i]) {
-exceeded_rate[i] = band->up.rate;
+if (band->rate > exceeded_rate[i]) {
+exceeded_rate[i] = band->rate;
 exceeded_band[i] = m;
 }
 }
@@ -6324,21 +6328,15 @@ dpif_netdev_meter_set(struct dpif *dpif, 
ofproto_meter_id meter_id,
 for (i = 0; i < config->n_bands; ++i) {
 uint32_t band_max_delta_t;
 
-/* Set burst size to a workable value if none specified. */
-if (config->bands[i].burst_size == 0) {
-config->bands[i].burst_size = config->bands[i].rate;
-}
-
-meter->bands[i].up = config->bands[i];
-/* Convert burst size to the bucket units: */
-/* pkts => 1/1000 packets, kilobits => bits. */
-meter->bands[i].up.burst_size *= 1000;
-/* Initialize bucket to empty. */
-meter->bands[i].bucket = 0;
+meter->bands[i].type = config->bands[i].type;
+meter->bands[i].rate = config->bands[i].rate;
+meter->bands[i].burst_size = config->bands[i].burst_size;
+/* Start with a full bucket. */
+meter->bands[i].bucket = meter->bands[i].rate * 1000ULL;
 
 /* Figure out max delta_t that is enough to fill any bucket. */
 band_max_delta_t
-= meter->bands[i].up.burst_size / meter->bands[i].up.rate;
+= meter->bands[i].bucket / meter->bands[i].rate;
 if (band_max_delta_t > meter->max_delta_t) {
 meter->max_delta_t = band_max_delta_t;
 }
-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RESEND v2 0/3] dpif-netdev meter overflow fixes

2021-02-24 Thread xiangxia . m . yue
From: Tonghao Zhang 

[resend with cover letter]

Patch 1 fixes the meter overflow, with that change, patch 2
adds burst size to buckets which more accurate. Patch 3
change testcase because the way of computing buckets changed.

v2:
change the testcase.

v1:
split patches from:
http://patchwork.ozlabs.org/project/openvswitch/patch/1588244439-58766-2-git-send-email-xiangxia.m@gmail.com/
http://patchwork.ozlabs.org/project/openvswitch/patch/1588244439-58766-3-git-send-email-xiangxia.m@gmail.com/
http://patchwork.ozlabs.org/project/openvswitch/patch/1588244439-58766-4-git-send-email-xiangxia.m@gmail.com/
http://patchwork.ozlabs.org/project/openvswitch/patch/1588244439-58766-5-git-send-email-xiangxia.m@gmail.com/

Tonghao Zhang (3):
  dpif-netdev: Fix the meter buckets overflow.
  dpif-netdev: Add the burst size to buckets.
  tests/dpif-netdev: Update dpif-netdev meter testcase.

 lib/dpif-netdev.c| 43 +--
 tests/dpif-netdev.at | 16 
 2 files changed, 29 insertions(+), 30 deletions(-)

-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] ovn-controller refactor doc (LONG)

2021-02-24 Thread Dumitru Ceara

On 2/11/21 3:22 PM, Mark Michelson wrote:

On 2/11/21 7:47 AM, Numan Siddique wrote:

On Thu, Feb 11, 2021 at 2:49 PM Han Zhou  wrote:


On Tue, Jan 26, 2021 at 1:04 PM Mark Michelson  
wrote:


Warning: Very long email ahead



Hi Mark,

Thanks for writing this up. Please see my comments (also very long :))
below.


And thanks for taking the time to read and respond.


Hi Mark, Numan, Han,

Thanks for the discussion until now!

I'll try to share some thoughts too.






ovn-controller has seen numerous drastic improvements over the past
several years. The focus has been mostly on performance and adding new
features. This, however, has resulted in a code base that has become
difficult to work with. The problems it has can be summed up as 
follows:
* It is not always obvious how certain pieces of code interact with 
each

other.
* The architecture comes across as monolithic. That is, the
ovn-controller application does not have individual components or 
layers

that have isolated duties.
* Southbound database IDL structures are used directly throughout the
code, rather than structures that may better represent the network
makeup or that take advantage of types and other benefits of C.
* There are a lot of special-cases. In other words, circumstances can
result in different code paths running.

These have led to problems, such as:
* Unit testing is difficult.
* System tests can be flaky. We might see some developers run tests
successfully 90% of the time, while other devs run the same tests with
90% failure.
* Attempting to replace a section of code with something new is 
difficult.

* Modifying the code can result in unexpected errors in what seems at
first glance to be an unrelated section.
* The incremental engine sometimes encounters situations that require
full recomputes of all southbound data. Many of these situations could
be avoided if the focus of the incremental engine were narrowed.

To fix these issues, we suggest the following ideas:
* Isolate database IDL usage to a data translation layer. That is,
database-specific structures from the IDL will be translated into local
structures. Any lower layers would act upon the local structures only,
never seeing the database data directly.
    * This allows lower layers to be more easily unit tested.
    * This allows changing database technologies more easily.
    * This potentially allows for more natural representations of data.


I think another implication is that such a layer would enable further 
optimization in ovn-controller.


For example, the current IDL change tracking doesn't provide information 
about old vs new values of specific columns in a table.  One consequence 
of that is that with logical datapath groups enabled, when a datapath 
group is updated (e.g., a datapath is added), we are forced to remove 
the resulting OF flows for all datapaths and readd them for the new 
contents of the 'dp_group->datapaths' column.


If the data translation layer would provide more fine grain information 
about changes that happened to a record this could be easily optimized 
and only OF flows for deleted/updated/new datapaths in a dp_group would 
have to be recomputed.


In other words, we are not restricted to the relations and limited 
types

that the databases use.


This sounds reasonable, but we should also keep in mind that an 
abstraction

layer would introduce some cost. I think it would be worth the effort &
cost if we really want to change the database technologies.


Yes, I agree. If I recall how DDLog works, then a similar translation 
layer would have to be written in order to use DDLog, since we would 
need to translate between OVSDB data and DDLog relations. It means that 
switching to DDlog would not require locking into a specific DB.





* Isolate the incremental engine to determine only on which data is
relevant to be processed by lower layers. The lower layers should then
take whatever data is relevant and perform any actions that should be
performed (e.g. claiming/releasing port bindings, writing openflow
flows, etc.)
    * This reduces most special-casing of code in lower layers.
    * This makes it easier to reason about what effect the incremental
engine will have on what will be processed.
    * This reduces the overall stretch of the incremental engine on the
rest of the code.


I think this was the initial design principle of the I-P engine. It 
should
only represent data dependency and invokes the change handlers (if 
this is
what the *lower layer* means here) to react to what's changed 
accordingly.

The difficult part is the change handler implementation: it needs to
understand the relationship between all the inputs and take correct 
actions

when any of the input changes, examine the change and reconcile by
processing the other related data.
Due to this complexity, there were only simple scenarios supported 
with I-P
initially. While adding more change handlers for more scenarios to 
utilize

I-P, the complexity 

Re: [ovs-dev] [PATCH v5 ovn] controller: introduce stats counters for ovn-controller incremental processing

2021-02-24 Thread Ilya Maximets
On 2/23/21 3:52 PM, Lorenzo Bianconi wrote:
> Introduce inc-engine/show-stats ovs-appctl command in order to dump
> ovn-controller incremental processing engine statistics. So far for each
> node a counter for run, abort and engine_handler have been added.
> Counters are incremented when the node move to "updated" state.
> In order to dump I-P stats we can can use the following commands:

Hi.  Thanks for working on this!
Few style/doc comments inline.

Best regards, Ilya Maximets.

> 
> $ovs-appctl -t ovn-controller inc-engine/show-stats
> SB_address_set
> recompute 1   abort   0   compute 0
> addr_sets
> recompute 2   abort   1   compute 0
> SB_port_group
> recompute 0   abort   0   compute 0
> port_groups
> recompute 2   abort   0   compute 0
> ofctrl_is_connected
> recompute 1   abort   0   compute 0
> OVS_open_vswitch
> recompute 0   abort   0   compute 0
> OVS_bridge
> recompute 0   abort   0   compute 0
> OVS_qos
> recompute 0   abort   0   compute 0
> SB_chassis
> recompute 1   abort   0   compute 0
> 
> 
> flow_output
> recompute 2   abort   0   compute 34
> 
> Moreover introduce the inc-engine/clear-stats command to reset engine
> statistics
> 
> $ovs-appctl -t ovn-controller inc-engine/clear-stats

These commends should be documented in controller/ovn-controller.8.xml
and, probably, mentioned in a NEWS file.

> 
> Signed-off-by: Lorenzo Bianconi 
> ---
> Changes since v4:
> - rename handlers in inc-engine/show-stats and inc-engine/clear-stats
> - move all the code in lib/inc-proc-eng.{c,h}
> - instad of run and change_handler, track node recompute and node compute
> Changes since v3:
> - drop engine_set_note_update_from_run/engine_set_note_update_from_handler
>   macros and move stats code in lib/inc-proc-eng.c
> - fix commit log
> Changes since v2:
> - introduce inc-engine/stats and inc-engine/stats-clear commands and drop
>   COVERAGE_* dependency
> Changes since v1:
> - drop handler counters and add global abort counter
> - improve documentation and naming scheme
> - introduce engine_set_node_updated utility macro
> ---
>  lib/inc-proc-eng.c | 41 +
>  lib/inc-proc-eng.h | 10 ++
>  2 files changed, 51 insertions(+)
> 
> diff --git a/lib/inc-proc-eng.c b/lib/inc-proc-eng.c
> index 916dbbe39..6afe692bb 100644
> --- a/lib/inc-proc-eng.c
> +++ b/lib/inc-proc-eng.c
> @@ -27,6 +27,7 @@
>  #include "openvswitch/hmap.h"
>  #include "openvswitch/vlog.h"
>  #include "inc-proc-eng.h"
> +#include "unixctl.h"
>  
>  VLOG_DEFINE_THIS_MODULE(inc_proc_eng);
>  
> @@ -102,6 +103,37 @@ engine_get_nodes(struct engine_node *node, size_t 
> *n_count)
>  return engine_topo_sort(node, NULL, n_count, _size);
>  }
>  
> +static void
> +engine_clear_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +   const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> +{
> +for (size_t i = 0; i < engine_n_nodes; i++) {
> +struct engine_node *node = engine_nodes[i];
> +
> +memset(>stats, 0, sizeof(node->stats));

Please, don't parenthesize the argument of a sizeof.

> +}
> +unixctl_command_reply(conn, NULL);
> +}
> +
> +static void
> +engine_dump_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +  const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> +{
> +struct ds dump = DS_EMPTY_INITIALIZER;
> +
> +for (size_t i = 0; i < engine_n_nodes; i++) {
> +struct engine_node *node = engine_nodes[i];
> +
> +ds_put_format(, "%s\n", node->name);
> +ds_put_format(, "\trecompute\t%lu", node->stats.recompute);
> +ds_put_format(, "\tcompute\t%lu", node->stats.compute);
> +ds_put_format(, "\tabort\t%lu\n", node->stats.abort);

Please, don't use tabs in the output.  OVS and OVN (as derived from OVS) uses
spaces for indentation including command outputs.  For the numbers there
will be fixed-size fields to make them aligned.


> +}
> +unixctl_command_reply(conn, ds_cstr());
> +
> +ds_destroy();
> +}
> +
>  void
>  engine_init(struct engine_node *node, struct engine_arg *arg)
>  {
> @@ -115,6 +147,11 @@ engine_init(struct engine_node *node, struct engine_arg 
> *arg)
>  engine_nodes[i]->data = NULL;
>  }
>  }
> +
> +unixctl_command_register("inc-engine/show-stats", "", 0, 0,
> + engine_dump_stats, NULL);
> +unixctl_command_register("inc-engine/clear-stats", "", 0, 0,
> + engine_clear_stats, NULL);
>  }
>  
>  void
> @@ -283,11 +320,13 @@ engine_recompute(struct engine_node *node, bool forced, 
> bool allowed)
>  if (!allowed) {
>  VLOG_DBG("node: %s, recompute aborted", node->name);
>  engine_set_node_state(node, EN_ABORTED);
> +node->stats.abort++;
>

Re: [ovs-dev] [PATCH] netdev-offload-tc: Flush rules on all chains before attach ingress block

2021-02-24 Thread Simon Horman
On Tue, Feb 23, 2021 at 09:53:10AM +0200, Roi Dayan wrote:
> Hi,
> 
> Any comments or thoughts on this patch?
> 
> Thanks,
> Roi

Thanks, and sorry for the extended delay.

This looks good to me. And as there has been no other activity
around this patch I've taken the liberty of pushing it to master.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V2 13/14] netdev-offload-dpdk: Support vports flows offload

2021-02-24 Thread Sriharsha Basavapatna via dev
On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
>
> Vports are virtual, OVS only logical devices, so rte_flows cannot be
> applied as is on them. Instead, apply the rules the physical port from
> which the packet has arrived, provided by orig_in_port field.
>
> Signed-off-by: Eli Britstein 
> Reviewed-by: Gaetan Rivet 
> ---
>  lib/netdev-offload-dpdk.c | 169 ++
>  1 file changed, 137 insertions(+), 32 deletions(-)
>
> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> index f6e668bff..ad47d717f 100644
> --- a/lib/netdev-offload-dpdk.c
> +++ b/lib/netdev-offload-dpdk.c
> @@ -62,6 +62,7 @@ struct ufid_to_rte_flow_data {
>  struct rte_flow *rte_flow;
>  bool actions_offloaded;
>  struct dpif_flow_stats stats;
> +struct netdev *physdev;
>  };
>
>  /* Find rte_flow with @ufid. */
> @@ -87,7 +88,8 @@ ufid_to_rte_flow_data_find(const ovs_u128 *ufid, bool warn)
>
>  static inline struct ufid_to_rte_flow_data *
>  ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct netdev *netdev,
> -   struct rte_flow *rte_flow, bool actions_offloaded)
> +   struct netdev *vport, struct rte_flow *rte_flow,
> +   bool actions_offloaded)
>  {
>  size_t hash = hash_bytes(ufid, sizeof *ufid, 0);
>  struct ufid_to_rte_flow_data *data = xzalloc(sizeof *data);
> @@ -105,7 +107,8 @@ ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct 
> netdev *netdev,
>  }
>
>  data->ufid = *ufid;
> -data->netdev = netdev_ref(netdev);
> +data->netdev = vport ? netdev_ref(vport) : netdev_ref(netdev);
> +data->physdev = netdev_ref(netdev);

For non-tunnel offloads, we end up getting two references to the same
'netdev'; can we avoid this ? That is, get a reference to physdev only
for the vport case.
>  data->rte_flow = rte_flow;
>  data->actions_offloaded = actions_offloaded;
>
> @@ -122,6 +125,7 @@ ufid_to_rte_flow_disassociate(struct 
> ufid_to_rte_flow_data *data)
>  cmap_remove(_to_rte_flow,
>  CONST_CAST(struct cmap_node *, >node), hash);
>  netdev_close(data->netdev);
> +netdev_close(data->physdev);

Similar comment, release reference to physdev only if we got a
reference earlier (i.e., physdev should be non-null only when netdev
is a vport).
>  ovsrcu_postpone(free, data);
>  }
>
> @@ -134,6 +138,8 @@ struct flow_patterns {
>  struct rte_flow_item *items;
>  int cnt;
>  int current_max;
> +uint32_t num_of_tnl_items;

change to --> num_pmd_tnl_items
> +struct ds s_tnl;
>  };
>
>  struct flow_actions {
> @@ -154,16 +160,20 @@ struct flow_actions {
>  static void
>  dump_flow_attr(struct ds *s, struct ds *s_extra,
> const struct rte_flow_attr *attr,
> +   struct flow_patterns *flow_patterns,
> struct flow_actions *flow_actions)
>  {
>  if (flow_actions->num_of_tnl_actions) {
>  ds_clone(s_extra, _actions->s_tnl);
> +} else if (flow_patterns->num_of_tnl_items) {
> +ds_clone(s_extra, _patterns->s_tnl);
>  }
> -ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s",
> +ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s%s",
>attr->ingress  ? "ingress " : "",
>attr->egress   ? "egress " : "", attr->priority, 
> attr->group,
>attr->transfer ? "transfer " : "",
> -  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : "");
> +  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : "",
> +  flow_patterns->num_of_tnl_items ? "tunnel_match 1 " : "");
>  }
>
>  /* Adds one pattern item 'field' with the 'mask' to dynamic string 's' using
> @@ -177,9 +187,18 @@ dump_flow_attr(struct ds *s, struct ds *s_extra,
>  }
>
>  static void
> -dump_flow_pattern(struct ds *s, const struct rte_flow_item *item)
> +dump_flow_pattern(struct ds *s,
> +  struct flow_patterns *flow_patterns,
> +  int pattern_index)
>  {
> -if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
> +const struct rte_flow_item *item = _patterns->items[pattern_index];
> +
> +if (item->type == RTE_FLOW_ITEM_TYPE_END) {
> +ds_put_cstr(s, "end ");
> +} else if (flow_patterns->num_of_tnl_items &&
> +   pattern_index < flow_patterns->num_of_tnl_items) {
> +return;
> +} else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>  const struct rte_flow_item_eth *eth_spec = item->spec;
>  const struct rte_flow_item_eth *eth_mask = item->mask;
>
> @@ -569,19 +588,19 @@ dump_flow_action(struct ds *s, struct ds *s_extra,
>  static struct ds *
>  dump_flow(struct ds *s, struct ds *s_extra,
>const struct rte_flow_attr *attr,
> -  const struct rte_flow_item *items,
> +  struct flow_patterns *flow_patterns,
>struct flow_actions *flow_actions)
>  {
>  int 

Re: [ovs-dev] [PATCH V2 11/14] netdev-offload-dpdk: Refactor offload rule creation

2021-02-24 Thread Eli Britstein



On 2/24/2021 11:55 AM, Sriharsha Basavapatna wrote:

On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:

Refactor offload rule creation as a pre-step towards tunnel matching
that depend on the netdev.

Signed-off-by: Eli Britstein 
Reviewed-by: Gaetan Rivet 
---
  lib/netdev-offload-dpdk.c | 106 --
  1 file changed, 45 insertions(+), 61 deletions(-)

diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
index 493cc9159..f6e668bff 100644
--- a/lib/netdev-offload-dpdk.c
+++ b/lib/netdev-offload-dpdk.c
@@ -1009,30 +1009,6 @@ add_flow_mark_rss_actions(struct flow_actions *actions,
  add_flow_action(actions, RTE_FLOW_ACTION_TYPE_END, NULL);
  }

-static struct rte_flow *
-netdev_offload_dpdk_mark_rss(struct flow_patterns *patterns,
- struct netdev *netdev,
- uint32_t flow_mark)
-{
-struct flow_actions actions = { .actions = NULL, .cnt = 0 };
-const struct rte_flow_attr flow_attr = {
-.group = 0,
-.priority = 0,
-.ingress = 1,
-.egress = 0
-};
-struct rte_flow_error error;
-struct rte_flow *flow;
-
-add_flow_mark_rss_actions(, flow_mark, netdev);
-
-flow = netdev_offload_dpdk_flow_create(netdev, _attr, patterns->items,
-   , );
-
-free_flow_actions();
-return flow;
-}
-
  static void
  add_count_action(struct flow_actions *actions)
  {
@@ -1509,27 +1485,48 @@ parse_flow_actions(struct netdev *netdev,
  return 0;
  }

-static struct rte_flow *
-netdev_offload_dpdk_actions(struct netdev *netdev,
-struct flow_patterns *patterns,
-struct nlattr *nl_actions,
-size_t actions_len)
+static struct ufid_to_rte_flow_data *
+create_netdev_offload(struct netdev *netdev,
+  const ovs_u128 *ufid,
+  struct flow_patterns *flow_patterns,
+  struct flow_actions *flow_actions,
+  bool enable_full,
+  uint32_t flow_mark)
  {
-const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
-struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1, };
+struct flow_actions rss_actions = { .actions = NULL, .cnt = 0 };
+struct rte_flow_item *items = flow_patterns->items;
+struct ufid_to_rte_flow_data *flow_data = NULL;
+bool actions_offloaded = true;
  struct rte_flow *flow = NULL;
  struct rte_flow_error error;
-int ret;

-ret = parse_flow_actions(netdev, , nl_actions, actions_len);
-if (ret) {
-goto out;
+if (enable_full) {
+flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
+   flow_actions, );
  }
-flow = netdev_offload_dpdk_flow_create(netdev, _attr, patterns->items,
-   , );
-out:
-free_flow_actions();
-return flow;
+
+if (!flow) {
+/* If we failed to offload the rule actions fallback to MARK+RSS
+ * actions.
+ */

A  debug message might be useful here, when we fallback to mark/rss action ?
We can, but this is just a refactor commit, and this fallback is not 
new. Add it anyway?

+actions_offloaded = false;
+flow_attr.transfer = 0;
+add_flow_mark_rss_actions(_actions, flow_mark, netdev);
+flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
+   _actions, );
+}
+
+if (flow) {
+flow_data = ufid_to_rte_flow_associate(ufid, netdev, flow,
+   actions_offloaded);
+VLOG_DBG("%s: installed flow %p by ufid "UUID_FMT,
+ netdev_get_name(netdev), flow,
+ UUID_ARGS((struct uuid *) ufid));
+}
+
+free_flow_actions(_actions);

This free is needed only when we fallback to mark/rss offload, not otherwise.

OK.



+return flow_data;
  }

  static struct ufid_to_rte_flow_data *
@@ -1541,9 +1538,9 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
   struct offload_info *info)
  {
  struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
-struct ufid_to_rte_flow_data *flows_data = NULL;
-bool actions_offloaded = true;
-struct rte_flow *flow;
+struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+struct ufid_to_rte_flow_data *flow_data = NULL;
+int err;

  if (parse_flow_match(, match)) {
  VLOG_DBG_RL(, "%s: matches of ufid "UUID_FMT" are not supported",
@@ -1551,28 +1548,15 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
  goto out;
  }

-flow = netdev_offload_dpdk_actions(netdev, , nl_actions,
-   actions_len);
-if (!flow) {
-  

Re: [ovs-dev] [PATCH v5 ovn] controller: introduce stats counters for ovn-controller incremental processing

2021-02-24 Thread Lorenzo Bianconi
> On 23/02/2021 14:52, Lorenzo Bianconi wrote:
> > Introduce inc-engine/show-stats ovs-appctl command in order to dump
> > ovn-controller incremental processing engine statistics. So far for each
> > node a counter for run, abort and engine_handler have been added.
> > Counters are incremented when the node move to "updated" state.
> > In order to dump I-P stats we can can use the following commands:
> > 
> > $ovs-appctl -t ovn-controller inc-engine/show-stats
> > SB_address_set
> > recompute 1   abort   0   compute 0
> > addr_sets
> > recompute 2   abort   1   compute 0
> > SB_port_group
> > recompute 0   abort   0   compute 0
> > port_groups
> > recompute 2   abort   0   compute 0
> > ofctrl_is_connected
> > recompute 1   abort   0   compute 0
> > OVS_open_vswitch
> > recompute 0   abort   0   compute 0
> > OVS_bridge
> > recompute 0   abort   0   compute 0
> > OVS_qos
> > recompute 0   abort   0   compute 0
> > SB_chassis
> > recompute 1   abort   0   compute 0
> > 
> > 
> > flow_output
> > recompute 2   abort   0   compute 34
> > 
> > Moreover introduce the inc-engine/clear-stats command to reset engine
> > statistics
> > 
> > $ovs-appctl -t ovn-controller inc-engine/clear-stats
> > 
> > Signed-off-by: Lorenzo Bianconi 
> > ---
> > Changes since v4:
> > - rename handlers in inc-engine/show-stats and inc-engine/clear-stats
> > - move all the code in lib/inc-proc-eng.{c,h}
> > - instad of run and change_handler, track node recompute and node compute
> > Changes since v3:
> > - drop engine_set_note_update_from_run/engine_set_note_update_from_handler
> >   macros and move stats code in lib/inc-proc-eng.c
> > - fix commit log
> > Changes since v2:
> > - introduce inc-engine/stats and inc-engine/stats-clear commands and drop
> >   COVERAGE_* dependency
> > Changes since v1:
> > - drop handler counters and add global abort counter
> > - improve documentation and naming scheme
> > - introduce engine_set_node_updated utility macro
> > ---
> >  lib/inc-proc-eng.c | 41 +
> >  lib/inc-proc-eng.h | 10 ++
> >  2 files changed, 51 insertions(+)
> > 
> > diff --git a/lib/inc-proc-eng.c b/lib/inc-proc-eng.c
> > index 916dbbe39..6afe692bb 100644
> > --- a/lib/inc-proc-eng.c
> > +++ b/lib/inc-proc-eng.c
> > @@ -27,6 +27,7 @@
> >  #include "openvswitch/hmap.h"
> >  #include "openvswitch/vlog.h"
> >  #include "inc-proc-eng.h"
> > +#include "unixctl.h"
> >  
> >  VLOG_DEFINE_THIS_MODULE(inc_proc_eng);
> >  
> > @@ -102,6 +103,37 @@ engine_get_nodes(struct engine_node *node, size_t 
> > *n_count)
> >  return engine_topo_sort(node, NULL, n_count, _size);
> >  }
> >  
> > +static void
> > +engine_clear_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> > +   const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> > +{
> > +for (size_t i = 0; i < engine_n_nodes; i++) {
> > +struct engine_node *node = engine_nodes[i];
> > +
> > +memset(>stats, 0, sizeof(node->stats));
> > +}
> > +unixctl_command_reply(conn, NULL);
> > +}
> > +
> > +static void
> > +engine_dump_stats(struct unixctl_conn *conn, int argc OVS_UNUSED,
> > +  const char *argv[] OVS_UNUSED, void *arg OVS_UNUSED)
> > +{
> > +struct ds dump = DS_EMPTY_INITIALIZER;
> > +
> > +for (size_t i = 0; i < engine_n_nodes; i++) {
> > +struct engine_node *node = engine_nodes[i];
> > +
> > +ds_put_format(, "%s\n", node->name);
> > +ds_put_format(, "\trecompute\t%lu", node->stats.recompute);
> > +ds_put_format(, "\tcompute\t%lu", node->stats.compute);
> > +ds_put_format(, "\tabort\t%lu\n", node->stats.abort);
> > +}
> > +unixctl_command_reply(conn, ds_cstr());
> > +
> > +ds_destroy();
> > +}
> > +
> >  void
> >  engine_init(struct engine_node *node, struct engine_arg *arg)
> >  {
> > @@ -115,6 +147,11 @@ engine_init(struct engine_node *node, struct 
> > engine_arg *arg)
> >  engine_nodes[i]->data = NULL;
> >  }
> >  }
> > +
> > +unixctl_command_register("inc-engine/show-stats", "", 0, 0,
> > + engine_dump_stats, NULL);
> > +unixctl_command_register("inc-engine/clear-stats", "", 0, 0,
> > + engine_clear_stats, NULL);
> >  }
> >  
> >  void
> > @@ -283,11 +320,13 @@ engine_recompute(struct engine_node *node, bool 
> > forced, bool allowed)
> >  if (!allowed) {
> >  VLOG_DBG("node: %s, recompute aborted", node->name);
> >  engine_set_node_state(node, EN_ABORTED);
> > +node->stats.abort++;
> >  return;
> 
> node->run() itself could cause an abort. You should move this counter
> into engine_run() and check the state of each node after
> engine_node_run() has been invoked. In this way, you 

Re: [ovs-dev] [PATCH V2 11/14] netdev-offload-dpdk: Refactor offload rule creation

2021-02-24 Thread Sriharsha Basavapatna via dev
On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:
>
> Refactor offload rule creation as a pre-step towards tunnel matching
> that depend on the netdev.
>
> Signed-off-by: Eli Britstein 
> Reviewed-by: Gaetan Rivet 
> ---
>  lib/netdev-offload-dpdk.c | 106 --
>  1 file changed, 45 insertions(+), 61 deletions(-)
>
> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> index 493cc9159..f6e668bff 100644
> --- a/lib/netdev-offload-dpdk.c
> +++ b/lib/netdev-offload-dpdk.c
> @@ -1009,30 +1009,6 @@ add_flow_mark_rss_actions(struct flow_actions *actions,
>  add_flow_action(actions, RTE_FLOW_ACTION_TYPE_END, NULL);
>  }
>
> -static struct rte_flow *
> -netdev_offload_dpdk_mark_rss(struct flow_patterns *patterns,
> - struct netdev *netdev,
> - uint32_t flow_mark)
> -{
> -struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> -const struct rte_flow_attr flow_attr = {
> -.group = 0,
> -.priority = 0,
> -.ingress = 1,
> -.egress = 0
> -};
> -struct rte_flow_error error;
> -struct rte_flow *flow;
> -
> -add_flow_mark_rss_actions(, flow_mark, netdev);
> -
> -flow = netdev_offload_dpdk_flow_create(netdev, _attr, 
> patterns->items,
> -   , );
> -
> -free_flow_actions();
> -return flow;
> -}
> -
>  static void
>  add_count_action(struct flow_actions *actions)
>  {
> @@ -1509,27 +1485,48 @@ parse_flow_actions(struct netdev *netdev,
>  return 0;
>  }
>
> -static struct rte_flow *
> -netdev_offload_dpdk_actions(struct netdev *netdev,
> -struct flow_patterns *patterns,
> -struct nlattr *nl_actions,
> -size_t actions_len)
> +static struct ufid_to_rte_flow_data *
> +create_netdev_offload(struct netdev *netdev,
> +  const ovs_u128 *ufid,
> +  struct flow_patterns *flow_patterns,
> +  struct flow_actions *flow_actions,
> +  bool enable_full,
> +  uint32_t flow_mark)
>  {
> -const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
> -struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> +struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1, };
> +struct flow_actions rss_actions = { .actions = NULL, .cnt = 0 };
> +struct rte_flow_item *items = flow_patterns->items;
> +struct ufid_to_rte_flow_data *flow_data = NULL;
> +bool actions_offloaded = true;
>  struct rte_flow *flow = NULL;
>  struct rte_flow_error error;
> -int ret;
>
> -ret = parse_flow_actions(netdev, , nl_actions, actions_len);
> -if (ret) {
> -goto out;
> +if (enable_full) {
> +flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
> +   flow_actions, );
>  }
> -flow = netdev_offload_dpdk_flow_create(netdev, _attr, 
> patterns->items,
> -   , );
> -out:
> -free_flow_actions();
> -return flow;
> +
> +if (!flow) {
> +/* If we failed to offload the rule actions fallback to MARK+RSS
> + * actions.
> + */
A  debug message might be useful here, when we fallback to mark/rss action ?
> +actions_offloaded = false;
> +flow_attr.transfer = 0;
> +add_flow_mark_rss_actions(_actions, flow_mark, netdev);
> +flow = netdev_offload_dpdk_flow_create(netdev, _attr, items,
> +   _actions, );
> +}
> +
> +if (flow) {
> +flow_data = ufid_to_rte_flow_associate(ufid, netdev, flow,
> +   actions_offloaded);
> +VLOG_DBG("%s: installed flow %p by ufid "UUID_FMT,
> + netdev_get_name(netdev), flow,
> + UUID_ARGS((struct uuid *) ufid));
> +}
> +
> +free_flow_actions(_actions);
This free is needed only when we fallback to mark/rss offload, not otherwise.

> +return flow_data;
>  }
>
>  static struct ufid_to_rte_flow_data *
> @@ -1541,9 +1538,9 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
>   struct offload_info *info)
>  {
>  struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
> -struct ufid_to_rte_flow_data *flows_data = NULL;
> -bool actions_offloaded = true;
> -struct rte_flow *flow;
> +struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> +struct ufid_to_rte_flow_data *flow_data = NULL;
> +int err;
>
>  if (parse_flow_match(, match)) {
>  VLOG_DBG_RL(, "%s: matches of ufid "UUID_FMT" are not supported",
> @@ -1551,28 +1548,15 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
>  goto out;
>  }
>
> -flow = netdev_offload_dpdk_actions(netdev, , nl_actions,
> - 

Re: [ovs-dev] [PATCH ovn 1/3] Properly handle hairpin traffic for VIPs with shared backends.

2021-02-24 Thread Numan Siddique
On Wed, Feb 24, 2021 at 2:43 PM Numan Siddique  wrote:
>
> On Wed, Feb 24, 2021 at 2:37 PM Numan Siddique  wrote:
> >
> > On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:
> > >
> > > If two load balancer VIPs share the same backend, both sets of hairpin
> > > reply learn() flows should be generated.  In order to ensure that,
> > > also match on the original destination IP and port tuple.  These are
> > > now stored in OVS registers by ovn-northd in stage ls-in-stateful.
> > >
> > > An alternative solution would be to add an additional match on
> > > ct_nw_dst() and ct_tp_dst() in the hairpin detection flows but it's
> > > better to avoid that because these matches are usually not offloadable
> > > to hardware.
> > >
> > > To ensure backwards compatibility though, if ovn-controller detects
> > > that ovn-northd doesn't store the original destination tuple
> > > information in OVS registers, ovn-controller falls back to using
> > > ct_nw_dst() and ct_tp_dst().
> > >
> > > Reported-by: Tim Rozet 
> > > Reported-at: https://bugzilla.redhat.com/1931599
> > > Fixes: 022ea339c8e2 ("lflow: Use learn() action to generate LB hairpin 
> > > reply flows.")
> > > Signed-off-by: Dumitru Ceara 
> >
> > Hi Dumitru,
> >
> > Thanks for the fix.
> >
> > A couple of minor nits below.
> > With those addressed.
> >
> > Acked-by: Numan Siddique 
> >
>
> I would also suggest to enhance the test in ovn.at to pause ovn-northd,
> clear the lb option - 'hairpin_orig_tuple' and make sure that flows
> have the match with the ct_* fields.

My bad. You already do it. Please forget this comment.

Numan

>
> Thanks
> Numan
>
> > Numan
> >
> > > ---
> > >  controller/lflow.c   |   56 +-
> > >  include/ovn/logical-fields.h |6 +
> > >  lib/lb.c |3
> > >  lib/lb.h |3
> > >  northd/ovn-northd.8.xml  |   21 ++
> > >  northd/ovn-northd.c  |   93 +-
> > >  ovn-sb.xml   |6 +
> > >  tests/ofproto-macros.at  |2
> > >  tests/ovn-northd.at  |   28 ++-
> > >  tests/ovn.at |  377 
> > > +++---
> > >  tests/system-ovn.at  |   40 +++-
> > >  11 files changed, 486 insertions(+), 149 deletions(-)
> > >
> > > diff --git a/controller/lflow.c b/controller/lflow.c
> > > index 8468cb4..104fbeb 100644
> > > --- a/controller/lflow.c
> > > +++ b/controller/lflow.c
> > > @@ -1228,6 +1228,12 @@ add_lb_vip_hairpin_reply_action(struct in6_addr 
> > > *vip6, ovs_be32 vip,
> > >  ofpact_finish_LEARN(ofpacts, );
> > >  }
> > >
> > > +/* Adds flows to detect hairpin sessions.
> > > + *
> > > + * For backwards compatibilty with older ovn-northd versions, uses
> > > + * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
> > > + * original destination tuple stored by ovn-northd.
> > > + */
> > >  static void
> > >  add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
> > >   struct ovn_lb_vip *lb_vip,
> > > @@ -1243,30 +1249,58 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb 
> > > *lb,
> > >  put_load(, sizeof value, MFF_LOG_FLAGS,
> > >   MLF_LOOKUP_LB_HAIRPIN_BIT, 1, );
> > >
> > > +/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires 
> > > matching
> > > + * on ct_state first.
> > > + */
> > > +if (!lb->hairpin_orig_tuple) {
> > > +uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
> > > +match_set_ct_state_masked(_match, ct_state, ct_state);
> > > +}
> > > +
> > >  if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
> > >  ovs_be32 bip4 = in6_addr_get_mapped_ipv4(_backend->ip);
> > > -ovs_be32 vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
> > > +ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
> > > +ovs_be32 snat_vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
> > >  ? lb->hairpin_snat_ips.ipv4_addrs[0].addr
> > > -: in6_addr_get_mapped_ipv4(_vip->vip);
> > > +: vip4;
> > >
> > >  match_set_dl_type(_match, htons(ETH_TYPE_IP));
> > >  match_set_nw_src(_match, bip4);
> > >  match_set_nw_dst(_match, bip4);
> > >
> > > -add_lb_vip_hairpin_reply_action(NULL, vip4, lb_proto,
> > > +if (!lb->hairpin_orig_tuple) {
> > > +match_set_ct_nw_dst(_match, vip4);
> > > +} else {
> > > +match_set_reg(_match,
> > > +  MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
> > > +  ntohl(vip4));
> > > +}
> > > +
> > > +add_lb_vip_hairpin_reply_action(NULL, snat_vip4, lb_proto,
> > >  lb_backend->port,
> > >  lb->slb->header_.uuid.parts[0],
> > >  );
> > >  } else {
> > >  struct in6_addr *bip6 = _backend->ip;
> > > -struct in6_addr *vip6 = 

Re: [ovs-dev] [PATCH ovn 3/3] northd: Avoid matching on ct.dnat flags for load balancers.

2021-02-24 Thread Numan Siddique
On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:
>
> Matching on ct.dnat creates openflows that often are not offloadable
> to hardware.  ovn-northd uses ct.dnat only for load balancer hairpin
> traffic handling and it turns out we don't really need to match on
> ct.dnat.
>
> Signed-off-by: Dumitru Ceara 

Acked-by: Numan Siddique 

Can you please add tests in ovn-northd for this ? This would make sure
ovn-northd-ddlog (once we have the ddlog patches)
is not out of sync with northd.

Thanks
Numan

> ---
>  northd/ovn-northd.8.xml |   32 +++--
>  northd/ovn-northd.c |   52 
> +--
>  2 files changed, 49 insertions(+), 35 deletions(-)
>
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index deffe8c..a16937a 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -813,19 +813,12 @@
>
>  If the logical switch has load balancer(s) configured, then a
>  priority-100 flow is added with the match
> -ip  ct.trk ct.dnat to check if the
> +ip  ct.trk to check if the
>  packet needs to be hairpinned (if after load balancing the 
> destination
> -IP matches the source IP) or not by executing the action
> -reg0[6] = chk_lb_hairpin(); and advances the packet to
> -the next table.
> -  
> -
> -  
> -If the logical switch has load balancer(s) configured, then a
> -priority-90 flow is added with the match ip to check if
> -the packet is a reply for a hairpinned connection or not by executing
> -the action reg0[6] = chk_lb_hairpin_reply(); and 
> advances
> -the packet to the next table.
> +IP matches the source IP) or not by executing the actions
> +reg0[6] = chk_lb_hairpin(); and
> +reg0[12] = chk_lb_hairpin_reply(); and advances the 
> packet
> +to the next table.
>
>
>
> @@ -838,16 +831,25 @@
>
>   If the logical switch has load balancer(s) configured, then a
>   priority-100 flow is added with the match
> - ip  (ct.new || ct.est)  ct.trk 
> - ct.dnat  reg0[6] == 1 which hairpins the traffic by
> + ip  ct.new  ct.trk 
> + reg0[6] == 1 which hairpins the traffic by
>   NATting source IP to the load balancer VIP by executing the action
>   ct_snat_to_vip and advances the packet to the next 
> table.
>
>
>
>   If the logical switch has load balancer(s) configured, then a
> + priority-100 flow is added with the match
> + ip  ct.est  ct.trk 
> + reg0[6] == 1 which hairpins the traffic by
> + NATting source IP to the load balancer VIP by executing the action
> + ct_snat and advances the packet to the next table.
> +  
> +
> +  
> + If the logical switch has load balancer(s) configured, then a
>   priority-90 flow is added with the match
> - ip  reg0[6] == 1 which matches on the replies
> + ip  reg0[12] == 1 which matches on the 
> replies
>   of hairpinned traffic (i.e., destination IP is VIP,
>   source IP is the backend IP and source L4 port is backend port for 
> L4
>   load balancers) and executes ct_snat and advances the
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index 18e4cac..f66bdea 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -227,6 +227,7 @@ enum ovn_stage {
>  #define REGBIT_ACL_HINT_DROP  "reg0[9]"
>  #define REGBIT_ACL_HINT_BLOCK "reg0[10]"
>  #define REGBIT_LKUP_FDB   "reg0[11]"
> +#define REGBIT_HAIRPIN_REPLY  "reg0[12]"
>
>  #define REG_ORIG_DIP_IPV4 "reg1"
>  #define REG_ORIG_DIP_IPV6 "xxreg1"
> @@ -266,7 +267,8 @@ enum ovn_stage {
>   *
>   * Logical Switch pipeline:
>   * 
> ++--+---+--+
> - * | R0 | REGBIT_{CONNTRACK/DHCP/DNS/HAIRPIN}  |   | 
>  |
> + * | R0 | REGBIT_{CONNTRACK/DHCP/DNS}  |   | 
>  |
> + * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   | X | 
>  |
>   * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} | X | 
>  |
>   * ++--+ X | 
>  |
>   * | R1 | ORIG_DIP_IPV4 (>= IN_STATEFUL)   | R | 
>  |
> @@ -6036,39 +6038,49 @@ build_lb_hairpin(struct ovn_datapath *od, struct hmap 
> *lflows)
>  ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 0, "1", "next;");
>
>  if (od->has_lb_vip) {
> -/* Check if the packet needs to be hairpinned. */
> -ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_HAIRPIN, 100,
> -"ip && ct.trk && ct.dnat",
> -REGBIT_HAIRPIN " = chk_lb_hairpin(); next;",
> +/* Check if the packet needs 

Re: [ovs-dev] [PATCH ovn 2/3] lflow: Avoid matching on conntrack original tuple if possible.

2021-02-24 Thread Numan Siddique
On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:
>
> Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() creates openflows
> that often are not offloadable to hardware.  This was used for
> detecting load balancer hairpin sessions.
>
> However, it can be avoided if ovn-northd stores the original
> destination tuple in OVS registers.  For backwards compatibility,
> during upgrade, fall back to matching on the conntrack original tuple.
>
> Signed-off-by: Dumitru Ceara 

Acked-by: Numan Siddique 

It will be great if you can enhance the test to pause northd, clear
the lb option
and make sure ovn-controller programs the flow in the old way.

Thanks
Numan

> ---
>  controller/lflow.c |   50 +
>  tests/ovn.at   |   90 
> ++--
>  2 files changed, 88 insertions(+), 52 deletions(-)
>
> diff --git a/controller/lflow.c b/controller/lflow.c
> index 104fbeb..e84bf20 100644
> --- a/controller/lflow.c
> +++ b/controller/lflow.c
> @@ -1340,6 +1340,12 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
>  ofpbuf_uninit();
>  }
>
> +/* Adds flows to perform SNAT for hairpin sessions.
> + *
> + * For backwards compatibilty with older ovn-northd versions, uses
> + * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
> + * original destination tuple stored by ovn-northd.
> + */
>  static void
>  add_lb_ct_snat_vip_flows(struct ovn_controller_lb *lb,
>   struct ovn_lb_vip *lb_vip,
> @@ -1382,20 +1388,50 @@ add_lb_ct_snat_vip_flows(struct ovn_controller_lb *lb,
>  ofpact_finish(, >ofpact);
>
>  struct match match = MATCH_CATCHALL_INITIALIZER;
> +
> +/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires matching
> + * on ct_state first.
> + */
> +if (!lb->hairpin_orig_tuple) {
> +uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
> +match_set_ct_state_masked(, ct_state, ct_state);
> +}
> +
>  if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
> +ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
> +
>  match_set_dl_type(, htons(ETH_TYPE_IP));
> -match_set_ct_nw_dst(, in6_addr_get_mapped_ipv4(_vip->vip));
> +
> +if (!lb->hairpin_orig_tuple) {
> +match_set_ct_nw_dst(, vip4);
> +} else {
> +match_set_reg(, MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
> +  ntohl(vip4));
> +}
>  } else {
>  match_set_dl_type(, htons(ETH_TYPE_IPV6));
> -match_set_ct_ipv6_dst(, _vip->vip);
> +
> +if (!lb->hairpin_orig_tuple) {
> +match_set_ct_ipv6_dst(, _vip->vip);
> +} else {
> +ovs_be128 vip6_value;
> +
> +memcpy(_value, _vip->vip, sizeof vip6_value);
> +match_set_xxreg(, MFF_LOG_LB_ORIG_DIP_IPV6,
> +ntoh128(vip6_value));
> +}
>  }
>
>  match_set_nw_proto(, lb_proto);
> -match_set_ct_nw_proto(, lb_proto);
> -match_set_ct_tp_dst(, htons(lb_vip->vip_port));
> -
> -uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
> -match_set_ct_state_masked(, ct_state, ct_state);
> +if (lb_vip->vip_port) {
> +if (!lb->hairpin_orig_tuple) {
> +match_set_ct_nw_proto(, lb_proto);
> +match_set_ct_tp_dst(, htons(lb_vip->vip_port));
> +} else {
> +match_set_reg_masked(, MFF_LOG_LB_ORIG_TP_DPORT - MFF_REG0,
> + lb_vip->vip_port, UINT16_MAX);
> +}
> +}
>
>  for (size_t i = 0; i < lb->slb->n_datapaths; i++) {
>  match_set_metadata(,
> diff --git a/tests/ovn.at b/tests/ovn.at
> index b061f88..b465784 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -23700,7 +23700,7 @@ NXST_FLOW reply (xid=0x8):
>  ])
>
>  AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=70 | ofctl_strip_all | 
> grep -v NXST], [0], [dnl
> - table=70, 
> priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.88,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
>  actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
> + table=70, priority=100,tcp,reg1=0x58585858,reg2=0x1f90/0x,metadata=0x1 
> actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
>  ])
>
>  AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=68 | grep -v NXST], [1], 
> [dnl
> @@ -23729,8 +23729,8 @@ NXST_FLOW reply (xid=0x8):
>  ])
>
>  AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=70 | ofctl_strip_all | 
> grep -v NXST], [0], [dnl
> - table=70, 
> priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.88,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
>  actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.88))
> - table=70, 
> priority=100,ct_state=+trk+dnat,ct_nw_dst=88.88.88.90,ct_nw_proto=6,ct_tp_dst=8080,tcp,metadata=0x1
>  actions=ct(commit,zone=NXM_NX_REG12[[0..15]],nat(src=88.88.88.90))
> + table=70, priority=100,tcp,reg1=0x58585858,reg2=0x1f90/0x,metadata=0x1 
> 

Re: [ovs-dev] [PATCH ovn 1/3] Properly handle hairpin traffic for VIPs with shared backends.

2021-02-24 Thread Numan Siddique
On Wed, Feb 24, 2021 at 2:37 PM Numan Siddique  wrote:
>
> On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:
> >
> > If two load balancer VIPs share the same backend, both sets of hairpin
> > reply learn() flows should be generated.  In order to ensure that,
> > also match on the original destination IP and port tuple.  These are
> > now stored in OVS registers by ovn-northd in stage ls-in-stateful.
> >
> > An alternative solution would be to add an additional match on
> > ct_nw_dst() and ct_tp_dst() in the hairpin detection flows but it's
> > better to avoid that because these matches are usually not offloadable
> > to hardware.
> >
> > To ensure backwards compatibility though, if ovn-controller detects
> > that ovn-northd doesn't store the original destination tuple
> > information in OVS registers, ovn-controller falls back to using
> > ct_nw_dst() and ct_tp_dst().
> >
> > Reported-by: Tim Rozet 
> > Reported-at: https://bugzilla.redhat.com/1931599
> > Fixes: 022ea339c8e2 ("lflow: Use learn() action to generate LB hairpin 
> > reply flows.")
> > Signed-off-by: Dumitru Ceara 
>
> Hi Dumitru,
>
> Thanks for the fix.
>
> A couple of minor nits below.
> With those addressed.
>
> Acked-by: Numan Siddique 
>

I would also suggest to enhance the test in ovn.at to pause ovn-northd,
clear the lb option - 'hairpin_orig_tuple' and make sure that flows
have the match with the ct_* fields.

Thanks
Numan

> Numan
>
> > ---
> >  controller/lflow.c   |   56 +-
> >  include/ovn/logical-fields.h |6 +
> >  lib/lb.c |3
> >  lib/lb.h |3
> >  northd/ovn-northd.8.xml  |   21 ++
> >  northd/ovn-northd.c  |   93 +-
> >  ovn-sb.xml   |6 +
> >  tests/ofproto-macros.at  |2
> >  tests/ovn-northd.at  |   28 ++-
> >  tests/ovn.at |  377 
> > +++---
> >  tests/system-ovn.at  |   40 +++-
> >  11 files changed, 486 insertions(+), 149 deletions(-)
> >
> > diff --git a/controller/lflow.c b/controller/lflow.c
> > index 8468cb4..104fbeb 100644
> > --- a/controller/lflow.c
> > +++ b/controller/lflow.c
> > @@ -1228,6 +1228,12 @@ add_lb_vip_hairpin_reply_action(struct in6_addr 
> > *vip6, ovs_be32 vip,
> >  ofpact_finish_LEARN(ofpacts, );
> >  }
> >
> > +/* Adds flows to detect hairpin sessions.
> > + *
> > + * For backwards compatibilty with older ovn-northd versions, uses
> > + * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
> > + * original destination tuple stored by ovn-northd.
> > + */
> >  static void
> >  add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
> >   struct ovn_lb_vip *lb_vip,
> > @@ -1243,30 +1249,58 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb 
> > *lb,
> >  put_load(, sizeof value, MFF_LOG_FLAGS,
> >   MLF_LOOKUP_LB_HAIRPIN_BIT, 1, );
> >
> > +/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires matching
> > + * on ct_state first.
> > + */
> > +if (!lb->hairpin_orig_tuple) {
> > +uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
> > +match_set_ct_state_masked(_match, ct_state, ct_state);
> > +}
> > +
> >  if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
> >  ovs_be32 bip4 = in6_addr_get_mapped_ipv4(_backend->ip);
> > -ovs_be32 vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
> > +ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
> > +ovs_be32 snat_vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
> >  ? lb->hairpin_snat_ips.ipv4_addrs[0].addr
> > -: in6_addr_get_mapped_ipv4(_vip->vip);
> > +: vip4;
> >
> >  match_set_dl_type(_match, htons(ETH_TYPE_IP));
> >  match_set_nw_src(_match, bip4);
> >  match_set_nw_dst(_match, bip4);
> >
> > -add_lb_vip_hairpin_reply_action(NULL, vip4, lb_proto,
> > +if (!lb->hairpin_orig_tuple) {
> > +match_set_ct_nw_dst(_match, vip4);
> > +} else {
> > +match_set_reg(_match,
> > +  MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
> > +  ntohl(vip4));
> > +}
> > +
> > +add_lb_vip_hairpin_reply_action(NULL, snat_vip4, lb_proto,
> >  lb_backend->port,
> >  lb->slb->header_.uuid.parts[0],
> >  );
> >  } else {
> >  struct in6_addr *bip6 = _backend->ip;
> > -struct in6_addr *vip6 = lb->hairpin_snat_ips.n_ipv6_addrs
> > -? >hairpin_snat_ips.ipv6_addrs[0].addr
> > -: _vip->vip;
> > +struct in6_addr *snat_vip6 =
> > +lb->hairpin_snat_ips.n_ipv6_addrs
> > +? >hairpin_snat_ips.ipv6_addrs[0].addr
> > +: _vip->vip;
> >  match_set_dl_type(_match, 

Re: [ovs-dev] [PATCH ovn 1/3] Properly handle hairpin traffic for VIPs with shared backends.

2021-02-24 Thread Numan Siddique
On Tue, Feb 23, 2021 at 6:50 PM Dumitru Ceara  wrote:
>
> If two load balancer VIPs share the same backend, both sets of hairpin
> reply learn() flows should be generated.  In order to ensure that,
> also match on the original destination IP and port tuple.  These are
> now stored in OVS registers by ovn-northd in stage ls-in-stateful.
>
> An alternative solution would be to add an additional match on
> ct_nw_dst() and ct_tp_dst() in the hairpin detection flows but it's
> better to avoid that because these matches are usually not offloadable
> to hardware.
>
> To ensure backwards compatibility though, if ovn-controller detects
> that ovn-northd doesn't store the original destination tuple
> information in OVS registers, ovn-controller falls back to using
> ct_nw_dst() and ct_tp_dst().
>
> Reported-by: Tim Rozet 
> Reported-at: https://bugzilla.redhat.com/1931599
> Fixes: 022ea339c8e2 ("lflow: Use learn() action to generate LB hairpin reply 
> flows.")
> Signed-off-by: Dumitru Ceara 

Hi Dumitru,

Thanks for the fix.

A couple of minor nits below.
With those addressed.

Acked-by: Numan Siddique 

Numan

> ---
>  controller/lflow.c   |   56 +-
>  include/ovn/logical-fields.h |6 +
>  lib/lb.c |3
>  lib/lb.h |3
>  northd/ovn-northd.8.xml  |   21 ++
>  northd/ovn-northd.c  |   93 +-
>  ovn-sb.xml   |6 +
>  tests/ofproto-macros.at  |2
>  tests/ovn-northd.at  |   28 ++-
>  tests/ovn.at |  377 
> +++---
>  tests/system-ovn.at  |   40 +++-
>  11 files changed, 486 insertions(+), 149 deletions(-)
>
> diff --git a/controller/lflow.c b/controller/lflow.c
> index 8468cb4..104fbeb 100644
> --- a/controller/lflow.c
> +++ b/controller/lflow.c
> @@ -1228,6 +1228,12 @@ add_lb_vip_hairpin_reply_action(struct in6_addr *vip6, 
> ovs_be32 vip,
>  ofpact_finish_LEARN(ofpacts, );
>  }
>
> +/* Adds flows to detect hairpin sessions.
> + *
> + * For backwards compatibilty with older ovn-northd versions, uses
> + * ct_nw_dst(), ct_ipv6_dst(), ct_tp_dst(), otherwise uses the
> + * original destination tuple stored by ovn-northd.
> + */
>  static void
>  add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
>   struct ovn_lb_vip *lb_vip,
> @@ -1243,30 +1249,58 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
>  put_load(, sizeof value, MFF_LOG_FLAGS,
>   MLF_LOOKUP_LB_HAIRPIN_BIT, 1, );
>
> +/* Matching on ct_nw_dst()/ct_ipv6_dst()/ct_tp_dst() requires matching
> + * on ct_state first.
> + */
> +if (!lb->hairpin_orig_tuple) {
> +uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_DST_NAT;
> +match_set_ct_state_masked(_match, ct_state, ct_state);
> +}
> +
>  if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
>  ovs_be32 bip4 = in6_addr_get_mapped_ipv4(_backend->ip);
> -ovs_be32 vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
> +ovs_be32 vip4 = in6_addr_get_mapped_ipv4(_vip->vip);
> +ovs_be32 snat_vip4 = lb->hairpin_snat_ips.n_ipv4_addrs
>  ? lb->hairpin_snat_ips.ipv4_addrs[0].addr
> -: in6_addr_get_mapped_ipv4(_vip->vip);
> +: vip4;
>
>  match_set_dl_type(_match, htons(ETH_TYPE_IP));
>  match_set_nw_src(_match, bip4);
>  match_set_nw_dst(_match, bip4);
>
> -add_lb_vip_hairpin_reply_action(NULL, vip4, lb_proto,
> +if (!lb->hairpin_orig_tuple) {
> +match_set_ct_nw_dst(_match, vip4);
> +} else {
> +match_set_reg(_match,
> +  MFF_LOG_LB_ORIG_DIP_IPV4 - MFF_LOG_REG0,
> +  ntohl(vip4));
> +}
> +
> +add_lb_vip_hairpin_reply_action(NULL, snat_vip4, lb_proto,
>  lb_backend->port,
>  lb->slb->header_.uuid.parts[0],
>  );
>  } else {
>  struct in6_addr *bip6 = _backend->ip;
> -struct in6_addr *vip6 = lb->hairpin_snat_ips.n_ipv6_addrs
> -? >hairpin_snat_ips.ipv6_addrs[0].addr
> -: _vip->vip;
> +struct in6_addr *snat_vip6 =
> +lb->hairpin_snat_ips.n_ipv6_addrs
> +? >hairpin_snat_ips.ipv6_addrs[0].addr
> +: _vip->vip;
>  match_set_dl_type(_match, htons(ETH_TYPE_IPV6));
>  match_set_ipv6_src(_match, bip6);
>  match_set_ipv6_dst(_match, bip6);
>
> -add_lb_vip_hairpin_reply_action(vip6, 0, lb_proto,
> +if (!lb->hairpin_orig_tuple) {
> +match_set_ct_ipv6_dst(_match, _vip->vip);
> +} else {
> +ovs_be128 vip6_value;
> +
> +memcpy(_value, _vip->vip, sizeof vip6_value);
> +match_set_xxreg(_match, MFF_LOG_LB_ORIG_DIP_IPV6,
> +  

Re: [ovs-dev] [PATCH V2 10/14] netdev-offload-dpdk: Support tunnel pop action

2021-02-24 Thread Eli Britstein



On 2/24/2021 9:52 AM, Sriharsha Basavapatna wrote:

On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein  wrote:

Support tunnel pop action.

Signed-off-by: Eli Britstein 
Reviewed-by: Gaetan Rivet 
---
  Documentation/howto/dpdk.rst |   1 +
  NEWS |   1 +
  lib/netdev-offload-dpdk.c| 173 ---
  3 files changed, 160 insertions(+), 15 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index f0d45e47b..4918d80f3 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -398,6 +398,7 @@ Supported actions for hardware offload are:
  - VLAN Push/Pop (push_vlan/pop_vlan).
  - Modification of IPv6 (set_field:->ipv6_src/ipv6_dst/mod_nw_ttl).
  - Clone/output (tnl_push and output) for encapsulating over a tunnel.
+- Tunnel pop, for changing from a physical port to a vport.

  Further Reading
  ---
diff --git a/NEWS b/NEWS
index a7bffce97..6850d5621 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,7 @@ v2.15.0 - xx xxx 
 - DPDK:
   * Removed support for vhost-user dequeue zero-copy.
   * Add support for DPDK 20.11.
+ * Add hardware offload support for tunnel pop action (experimental).
 - Userspace datapath:
   * Add the 'pmd' option to "ovs-appctl dpctl/dump-flows", which
 restricts a flow dump to a single PMD thread if set.
diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
index 78f866080..493cc9159 100644
--- a/lib/netdev-offload-dpdk.c
+++ b/lib/netdev-offload-dpdk.c
@@ -140,15 +140,30 @@ struct flow_actions {
  struct rte_flow_action *actions;
  int cnt;
  int current_max;
+struct netdev *tnl_netdev;
+/* tnl_actions is the opaque array of actions returned by the PMD. */
+struct rte_flow_action *tnl_actions;

Why is this an opaque array ? Since it is struct rte_flow_action, OVS
knows the type and members. Is it opaque because the value of
rte_flow_action_type member is unknown to OVS ? Is it a private action
type and if so how does the PMD ensure that it doesn't conflict with
standard action types ?


True it is not used by OVS, but that's not why it opaque. Although it is 
struct rte_flow_action array, the PMD may use its own private actions, 
not defined in rte_flow.h, thus not known to the application.


The details of this array is under the PMD's responsibility.




+uint32_t num_of_tnl_actions;
+/* tnl_actions_pos is where the tunnel actions starts within the 'actions'
+ * field.
+ */
+int tnl_actions_pos;

Names should indicate that they are private or pmd specific ?

tnl_actions --> tnl_private_actions or tnl_pmd_actions
num_of_tnl_actions --> num_private_tnl_actions or num_pmd_tnl_actions
tnl_actions_pos --> tnl_private_actions_pos or tnl_pmd_actions_pos

OK. _pmd_



+struct ds s_tnl;
  };

  static void
-dump_flow_attr(struct ds *s, const struct rte_flow_attr *attr)
+dump_flow_attr(struct ds *s, struct ds *s_extra,
+   const struct rte_flow_attr *attr,
+   struct flow_actions *flow_actions)
  {
-ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s",
+if (flow_actions->num_of_tnl_actions) {
+ds_clone(s_extra, _actions->s_tnl);
+}
+ds_put_format(s, "%s%spriority %"PRIu32" group %"PRIu32" %s%s",
attr->ingress  ? "ingress " : "",
attr->egress   ? "egress " : "", attr->priority, 
attr->group,
-  attr->transfer ? "transfer " : "");
+  attr->transfer ? "transfer " : "",
+  flow_actions->num_of_tnl_actions ? "tunnel_set 1 " : "");
  }

  /* Adds one pattern item 'field' with the 'mask' to dynamic string 's' using
@@ -395,9 +410,19 @@ dump_vxlan_encap(struct ds *s, const struct rte_flow_item 
*items)

  static void
  dump_flow_action(struct ds *s, struct ds *s_extra,
- const struct rte_flow_action *actions)
+ struct flow_actions *flow_actions, int act_index)
  {
-if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
+const struct rte_flow_action *actions = _actions->actions[act_index];
+
+if (actions->type == RTE_FLOW_ACTION_TYPE_END) {
+ds_put_cstr(s, "end");
+} else if (flow_actions->num_of_tnl_actions &&
+   act_index >= flow_actions->tnl_actions_pos &&
+   act_index < flow_actions->tnl_actions_pos +
+   flow_actions->num_of_tnl_actions) {
+/* Opaque PMD tunnel actions is skipped. */

Wouldn't it be useful to at least print the value of PMD action types ?
The only way we can print them as raw numbers. I see little added value 
for this on one hand, but on the other hand it will defect the testpmd 
format print, so I think better to avoid.



+return;
+} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
  const struct rte_flow_action_mark *mark = actions->conf;

  ds_put_cstr(s, "mark ");
@@ -528,6 +553,14 @@