Re: [ovs-dev] [PATCH V2 3/4] tc: Add header rewrite using tc pedit action

Paul Blakey Mon, 25 Sep 2017 06:32:17 -0700


On 18/09/2017 18:01, Simon Horman wrote:

On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:

From: Paul Blakey <pa...@mellanox.com>

To be later used to implement ovs action set offloading.

Signed-off-by: Paul Blakey <pa...@mellanox.com>
Reviewed-by: Roi Dayan <r...@mellanox.com>
---
  lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
  lib/tc.h |  16 +++
  2 files changed, 385 insertions(+), 3 deletions(-)

diff --git a/lib/tc.c b/lib/tc.c
index c9cada2..743b2ee 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -21,8 +21,10 @@
  #include <errno.h>
  #include <linux/if_ether.h>
  #include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_csum.h>
  #include <linux/tc_act/tc_gact.h>
  #include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_pedit.h>
  #include <linux/tc_act/tc_tunnel_key.h>
  #include <linux/tc_act/tc_vlan.h>
  #include <linux/gen_stats.h>
@@ -33,11 +35,14 @@
  #include "netlink-socket.h"
  #include "netlink.h"
  #include "openvswitch/ofpbuf.h"
+#include "openvswitch/util.h"
  #include "openvswitch/vlog.h"
  #include "packets.h"
  #include "timeval.h"
  #include "unaligned.h"

+#define MAX_PEDIT_OFFSETS 8


Why 8?

We don't expect anything more right now (ipv6 src/dst rewrite requires 8pedits iirc). I can't think of a larger use case, maybe ipv6 + macs ifthat's makes sens. do you suggest we increase it? to what?

+
  VLOG_DEFINE_THIS_MODULE(tc);

static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);

@@ -50,6 +55,82 @@ enum tc_offload_policy {

static enum tc_offload_policy tc_policy = TC_POLICY_NONE;+struct tc_pedit_key_ex {

+    enum pedit_header_type htype;
+    enum pedit_cmd cmd;
+};
+
+struct flower_key_to_pedit {
+    enum pedit_header_type htype;
+    int flower_offset;
+    int offset;
+    int size;
+};
+
+static struct flower_key_to_pedit flower_pedit_map[] = {
+    {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        12,
+        offsetof(struct tc_flower_key, ipv4.ipv4_src),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        16,
+        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        8,
+        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        8,
+        offsetof(struct tc_flower_key, ipv6.ipv6_src),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        24,
+        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        6,
+        offsetof(struct tc_flower_key, src_mac),
+        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        0,
+        offsetof(struct tc_flower_key, dst_mac),
+        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        12,
+        offsetof(struct tc_flower_key, eth_type),
+        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        0,
+        offsetof(struct tc_flower_key, tcp_src),
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        2,
+        offsetof(struct tc_flower_key, tcp_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        0,
+        offsetof(struct tc_flower_key, udp_src),
+        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        2,
+        offsetof(struct tc_flower_key, udp_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
+    },
+};
+
  struct tcmsg *
  tc_make_request(int ifindex, int type, unsigned int flags,
                  struct ofpbuf *request)
@@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower 
*flower) {
      }
  }

+static const struct nl_policy pedit_policy[] = {

+            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
+                                     .min_len = sizeof(struct tc_pedit),
+                                     .optional = false, },
+            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
+                                      .optional = false, },
+};
+
+static int
+nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
+{
+    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
+    const struct tc_pedit *pe;
+    const struct tc_pedit_key *keys;
+    const struct nlattr *nla, *keys_ex, *ex_type;
+    const void *keys_attr;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+    size_t keys_ex_size, left;
+    int type, i = 0;
+
+    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
+                         ARRAY_SIZE(pedit_policy))) {
+        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
+        return EPROTO;
+    }
+
+    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
+    keys = pe->keys;
+    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
+    keys_ex = nl_attr_get(keys_attr);
+    keys_ex_size = nl_attr_get_size(keys_attr);
+
+    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
+        if (i >= pe->nkeys) {
+            break;
+        }
+
+        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
+            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
+            type = nl_attr_get_u16(ex_type);
+
+            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
+                struct flower_key_to_pedit *m = &flower_pedit_map[j];
+                int flower_off = m->flower_offset;
+                int sz = m->size;
+                int mf = m->offset;
+
+                if (m->htype != type) {
+                   continue;
+                }
+
+                /* check overlap between current pedit key, which is always
+                 * 4 bytes (range [off, off + 3]), and a map entry in
+                 * flower_pedit_map (range [mf, mf + sz - 1]) */
+                if ((keys->off >= mf && keys->off < mf + sz)
+                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
+                    int diff = flower_off + (keys->off - mf);
+                    uint32_t *dst = (void *) (rewrite_key + diff);
+                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
+                    uint32_t mask = ~(keys->mask);
+                    uint32_t zero_bits;
+
+                    if (keys->off < mf) {
+                        zero_bits = 8 * (mf - keys->off);
+                        mask &= UINT32_MAX << zero_bits;
+                    } else if (keys->off + 4 > mf + m->size) {
+                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
+                        mask &= UINT32_MAX >> zero_bits;
+                    }
+
+                    *dst_m |= mask;
+                    *dst |= keys->val & mask;
+                }
+            }


If I understand the above correctly it is designed to make
pedit actions disjoint. If so, why is that necessary? >

It's not, as a single flower key rewrite can be split to multiple peditactions it finds the overlap between a flower key and a pedit action,if they do overlap it translates it to the correct offset and masks it out.

+        } else {
+            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
+                        nl_attr_type(nla));
+            return EOPNOTSUPP;
+        }


I think the code could exit early here as
nl_msg_put_flower_rewrite_pedits() does below.


Sorry, didn't understand. can you give an example?

+
+        keys++;
+        i++;
+    }
+
+    flower->rewrite.rewrite = true;
+
+    return 0;
+}
+
  static const struct nl_policy tunnel_key_policy[] = {
      [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
                                 .min_len = sizeof(struct tc_tunnel_key),
@@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct 
tc_flower *flower)
          nl_parse_act_vlan(act_options, flower);
      } else if (!strcmp(act_kind, "tunnel_key")) {
          nl_parse_act_tunnel_key(act_options, flower);
+    } else if (!strcmp(act_kind, "pedit")) {
+        nl_parse_act_pedit(act_options, flower);
+    } else if (!strcmp(act_kind, "csum")) {
+        /* not doing anything for now, ovs has an implicit csum recalculation
+         * with rewriting of packet headers (translating of pedit acts). */


I wonder if the absence of a csum action when needed (by TC)
should be treated as an error >


Do you mean validating csum flags from each protocol and such (like in put)?

      } else {
          VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
          return EINVAL;
@@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
  }

static void

+nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
+{
+    size_t offset;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        struct tc_csum parm = { .action = TC_ACT_PIPE,
+                                .update_flags = flags };
+
+        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
+nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
+                     struct tc_pedit_key_ex *ex)
+{
+    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));


Are there unnecessary () on the line above?

No, I'll remove them.

+    size_t offset, offset_keys_ex, offset_key;
+    int i;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        parm->action = TC_ACT_PIPE;
+
+        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
+        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
+        for (i = 0; i < parm->nkeys; i++, ex++) {
+            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
+            nl_msg_end_nested(request, offset_key);
+        }
+        nl_msg_end_nested(request, offset_keys_ex);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
  nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
  {
      size_t offset;
@@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct 
tc_cookie *ck) {
      }
  }

+/* Given flower, a key_to_pedit map entry, calculates the rest,

+ * where:
+ *
+ * mask, data - pointers of where read the first word of flower->key/mask.
+ * current_offset - which offset to use for the first pedit action.
+ * cnt - max pedits actions to use.
+ * first_word_mask/last_word_mask - the mask to use for the first/last read
+ * (as we read entire words). */
  static void
+calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
+             int *cur_offset, int *cnt, uint32_t *last_word_mask,
+             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
+{
+    int start_offset, max_offset, total_size;
+    int diff, right_zero_bits, left_zero_bits;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+
+    max_offset = m->offset + m->size;
+    start_offset = ROUND_DOWN(m->offset, 4);
+    diff = m->offset - start_offset;
+    total_size = max_offset - start_offset;
+    right_zero_bits = 8 * (4 - (max_offset % 4));
+    left_zero_bits = 8 * (m->offset - start_offset);
+
+    *cur_offset = start_offset;
+    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
+    *last_word_mask = UINT32_MAX >> right_zero_bits;
+    *first_word_mask = UINT32_MAX << left_zero_bits;
+    *data = (void *) (rewrite_key + m->flower_offset - diff);
+    *mask = (void *) (rewrite_mask + m->flower_offset - diff);


The type of *data and *mask is uint32_t *.
Why not cast to that type?


It's to avoid warning of increasing pointer alignment (-Wcast-align).

+}
+
+static inline void
+csum_update_flag(struct tc_flower *flower,
+                 enum pedit_header_type htype) {


I think the above two lines could be one.

+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {


A case statement might be nicer here.


Right, thanks.

+        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+    }
+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
+        if (flower->key.ip_proto == IPPROTO_TCP) {
+            flower->mask.ip_proto = UINT8_MAX;


What if the mask was not UINT8_MAX to start with?
Doesn't this create a different flow?

This is so the checksum will be strict (not setting/recalc udp checksumon non udp packets). It creates a more specific flow, a subset of theoriginal. This is fine as datapath is free to ignore a mask, which isthe same as a full mask we've set.

+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+        } else if (flower->key.ip_proto == IPPROTO_UDP) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+        } else if (flower->key.ip_proto == IPPROTO_ICMP
+                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
+        }
+    }
+}
+
+static int
+nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
+                                 struct tc_flower *flower)
+{
+    struct {
+        struct tc_pedit sel;
+        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
+        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
+    } sel = {
+        .sel = {
+            .nkeys = 0
+        }
+    };
+    int i, j;
+
+    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
+        struct flower_key_to_pedit *m = &flower_pedit_map[i];
+        struct tc_pedit_key *pedit_key = NULL;
+        struct tc_pedit_key_ex *pedit_key_ex = NULL;
+        uint32_t *mask, *data, first_word_mask, last_word_mask;
+        int cnt = 0, cur_offset = 0;
+
+        if (!m->size) {
+            continue;
+        }
+
+        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
+                     &first_word_mask, &mask, &data);
+
+        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
+            uint32_t mask_word = *mask;
+
+            if (j == 0) {
+                mask_word &= first_word_mask;
+            }
+            if (j == cnt - 1) {
+                mask_word &= last_word_mask;
+            }
+            if (!mask_word) {
+                continue;
+            }
+            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
+                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
+                             MAX_PEDIT_OFFSETS);
+                return EOPNOTSUPP;
+            }
+
+            pedit_key = &sel.keys[sel.sel.nkeys];
+            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
+            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
+            pedit_key_ex->htype = m->htype;
+            pedit_key->off = cur_offset;
+            pedit_key->mask = ~mask_word;
+            pedit_key->val = *data & mask_word;
+            sel.sel.nkeys++;
+            csum_update_flag(flower, m->htype);
+        }
+    }
+    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
+
+    return 0;
+}
+
+static int
  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
  {
      size_t offset;
@@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct 
tc_flower *flower)
      offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
      {
          uint16_t act_index = 1;
+        int error;

+ if (flower->rewrite.rewrite) {

+            act_offset = nl_msg_start_nested(request, act_index++);
+            error = nl_msg_put_flower_rewrite_pedits(request, flower);
+            if (error) {
+                return error;
+            }
+            nl_msg_end_nested(request, act_offset);
+
+            act_offset = nl_msg_start_nested(request, act_index++);
+            nl_msg_put_act_csum(request, flower->csum_update_flags);
+            nl_msg_end_nested(request, act_offset);
+        }
          if (flower->set.set) {
              act_offset = nl_msg_start_nested(request, act_index++);
              nl_msg_put_act_tunnel_key_set(request, flower->set.id,
@@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct 
tc_flower *flower)
          }
      }
      nl_msg_end_nested(request, offset);
+
+    return 0;
  }

static void

@@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct 
tc_flower *flower)
      nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
                              &flower->mask.member, sizeof flower->key.member)

-static void

+static int
  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
  {
+
      uint16_t host_eth_type = ntohs(flower->key.eth_type);
      bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
+    int err;
+
+    /* need to parse acts first as some acts require changing the matching */


This seems strange to me.


from the strict (normalized?) header checksum.

+    err  = nl_msg_put_flower_acts(request, flower);
+    if (err) {
+        return err;
+    }

if (is_vlan) {

          host_eth_type = ntohs(flower->key.encap_eth_type);
@@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct 
tc_flower *flower)
          nl_msg_put_flower_tunnel(request, flower);
      }

- nl_msg_put_flower_acts(request, flower);

+    return 0;
  }

int

@@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t 
handle,
      nl_msg_put_string(&request, TCA_KIND, "flower");
      basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
      {
-        nl_msg_put_flower_options(&request, flower);
+        error = nl_msg_put_flower_options(&request, flower);
+
+        if (error) {
+            ofpbuf_uninit(&request);
+            return error;
+        }
      }
      nl_msg_end_nested(&request, basic_offset);

diff --git a/lib/tc.h b/lib/tc.h

index 6c69b79..7876051 100644
--- a/lib/tc.h
+++ b/lib/tc.h
@@ -96,6 +96,7 @@ struct tc_flower_key {
      struct {
          ovs_be32 ipv4_src;
          ovs_be32 ipv4_dst;
+        uint8_t rewrite_ttl;
      } ipv4;
      struct {
          struct in6_addr ipv6_src;
@@ -120,6 +121,14 @@ struct tc_flower {
      uint64_t lastused;

struct {

+        bool rewrite;
+        struct tc_flower_key key;
+        struct tc_flower_key mask;
+    } rewrite;
+
+    uint32_t csum_update_flags;
+
+    struct {
          bool set;
          ovs_be64 id;
          ovs_be16 tp_src;
@@ -152,6 +161,13 @@ struct tc_flower {
      struct tc_cookie act_cookie;
  };

+/* assert that if we overflow with a masked write of uint32_t to the last byte

+ * of flower.rewrite we overflow inside struct flower.
+ * shouldn't happen unless someone moves rewrite to the end of flower */
+BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
+                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
+                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
+
  int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
                        struct tc_flower *flower);
  int tc_del_filter(int ifindex, int prio, int handle);
--
2.7.5

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH V2 3/4] tc: Add header rewrite using tc pedit action

Reply via email to