On 18/09/2017 18:01, Simon Horman wrote:
We don't expect anything more right now (ipv6 src/dst rewrite requires 8 pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if that's makes sens. do you suggest we increase it? to what?On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:From: Paul Blakey <pa...@mellanox.com> To be later used to implement ovs action set offloading. Signed-off-by: Paul Blakey <pa...@mellanox.com> Reviewed-by: Roi Dayan <r...@mellanox.com> --- lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- lib/tc.h | 16 +++ 2 files changed, 385 insertions(+), 3 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index c9cada2..743b2ee 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -21,8 +21,10 @@ #include <errno.h> #include <linux/if_ether.h> #include <linux/rtnetlink.h> +#include <linux/tc_act/tc_csum.h> #include <linux/tc_act/tc_gact.h> #include <linux/tc_act/tc_mirred.h> +#include <linux/tc_act/tc_pedit.h> #include <linux/tc_act/tc_tunnel_key.h> #include <linux/tc_act/tc_vlan.h> #include <linux/gen_stats.h> @@ -33,11 +35,14 @@ #include "netlink-socket.h" #include "netlink.h" #include "openvswitch/ofpbuf.h" +#include "openvswitch/util.h" #include "openvswitch/vlog.h" #include "packets.h" #include "timeval.h" #include "unaligned.h"+#define MAX_PEDIT_OFFSETS 8Why 8?
+ VLOG_DEFINE_THIS_MODULE(tc);static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);@@ -50,6 +55,82 @@ enum tc_offload_policy {static enum tc_offload_policy tc_policy = TC_POLICY_NONE; +struct tc_pedit_key_ex {+ enum pedit_header_type htype; + enum pedit_cmd cmd; +}; + +struct flower_key_to_pedit { + enum pedit_header_type htype; + int flower_offset; + int offset; + int size; +}; + +static struct flower_key_to_pedit flower_pedit_map[] = { + { + TCA_PEDIT_KEY_EX_HDR_TYPE_IP4, + 12, + offsetof(struct tc_flower_key, ipv4.ipv4_src), + MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_IP4, + 16, + offsetof(struct tc_flower_key, ipv4.ipv4_dst), + MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_IP4, + 8, + offsetof(struct tc_flower_key, ipv4.rewrite_ttl), + MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_IP6, + 8, + offsetof(struct tc_flower_key, ipv6.ipv6_src), + MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_IP6, + 24, + offsetof(struct tc_flower_key, ipv6.ipv6_dst), + MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_ETH, + 6, + offsetof(struct tc_flower_key, src_mac), + MEMBER_SIZEOF(struct tc_flower_key, src_mac) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_ETH, + 0, + offsetof(struct tc_flower_key, dst_mac), + MEMBER_SIZEOF(struct tc_flower_key, dst_mac) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_ETH, + 12, + offsetof(struct tc_flower_key, eth_type), + MEMBER_SIZEOF(struct tc_flower_key, eth_type) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_TCP, + 0, + offsetof(struct tc_flower_key, tcp_src), + MEMBER_SIZEOF(struct tc_flower_key, tcp_src) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_TCP, + 2, + offsetof(struct tc_flower_key, tcp_dst), + MEMBER_SIZEOF(struct tc_flower_key, tcp_dst) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_UDP, + 0, + offsetof(struct tc_flower_key, udp_src), + MEMBER_SIZEOF(struct tc_flower_key, udp_src) + }, { + TCA_PEDIT_KEY_EX_HDR_TYPE_UDP, + 2, + offsetof(struct tc_flower_key, udp_dst), + MEMBER_SIZEOF(struct tc_flower_key, udp_dst) + }, +}; + struct tcmsg * tc_make_request(int ifindex, int type, unsigned int flags, struct ofpbuf *request) @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) { } }+static const struct nl_policy pedit_policy[] = {+ [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC, + .min_len = sizeof(struct tc_pedit), + .optional = false, }, + [TCA_PEDIT_KEYS_EX] = { .type = NL_A_NESTED, + .optional = false, }, +}; + +static int +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) +{ + struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)]; + const struct tc_pedit *pe; + const struct tc_pedit_key *keys; + const struct nlattr *nla, *keys_ex, *ex_type; + const void *keys_attr; + char *rewrite_key = (void *) &flower->rewrite.key; + char *rewrite_mask = (void *) &flower->rewrite.mask; + size_t keys_ex_size, left; + int type, i = 0; + + if (!nl_parse_nested(options, pedit_policy, pe_attrs, + ARRAY_SIZE(pedit_policy))) { + VLOG_ERR_RL(&error_rl, "failed to parse pedit action options"); + return EPROTO; + } + + pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe); + keys = pe->keys; + keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX]; + keys_ex = nl_attr_get(keys_attr); + keys_ex_size = nl_attr_get_size(keys_attr); + + NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) { + if (i >= pe->nkeys) { + break; + } + + if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) { + ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE); + type = nl_attr_get_u16(ex_type); + + for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) { + struct flower_key_to_pedit *m = &flower_pedit_map[j]; + int flower_off = m->flower_offset; + int sz = m->size; + int mf = m->offset; + + if (m->htype != type) { + continue; + } + + /* check overlap between current pedit key, which is always + * 4 bytes (range [off, off + 3]), and a map entry in + * flower_pedit_map (range [mf, mf + sz - 1]) */ + if ((keys->off >= mf && keys->off < mf + sz) + || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) { + int diff = flower_off + (keys->off - mf); + uint32_t *dst = (void *) (rewrite_key + diff); + uint32_t *dst_m = (void *) (rewrite_mask + diff); + uint32_t mask = ~(keys->mask); + uint32_t zero_bits; + + if (keys->off < mf) { + zero_bits = 8 * (mf - keys->off); + mask &= UINT32_MAX << zero_bits; + } else if (keys->off + 4 > mf + m->size) { + zero_bits = 8 * (keys->off + 4 - mf - m->size); + mask &= UINT32_MAX >> zero_bits; + } + + *dst_m |= mask; + *dst |= keys->val & mask; + } + }If I understand the above correctly it is designed to make pedit actions disjoint. If so, why is that necessary? >
It's not, as a single flower key rewrite can be split to multiple pedit actions it finds the overlap between a flower key and a pedit action, if they do overlap it translates it to the correct offset and masks it out.
+ } else { + VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d", + nl_attr_type(nla)); + return EOPNOTSUPP; + }I think the code could exit early here as nl_msg_put_flower_rewrite_pedits() does below.
Sorry, didn't understand. can you give an example?
+ + keys++; + i++; + } + + flower->rewrite.rewrite = true; + + return 0; +} + static const struct nl_policy tunnel_key_policy[] = { [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC, .min_len = sizeof(struct tc_tunnel_key), @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower) nl_parse_act_vlan(act_options, flower); } else if (!strcmp(act_kind, "tunnel_key")) { nl_parse_act_tunnel_key(act_options, flower); + } else if (!strcmp(act_kind, "pedit")) { + nl_parse_act_pedit(act_options, flower); + } else if (!strcmp(act_kind, "csum")) { + /* not doing anything for now, ovs has an implicit csum recalculation + * with rewriting of packet headers (translating of pedit acts). */I wonder if the absence of a csum action when needed (by TC) should be treated as an error >
Do you mean validating csum flags from each protocol and such (like in put)?
} else { VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind); return EINVAL; @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy) }static void+nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags) +{ + size_t offset; + + nl_msg_put_string(request, TCA_ACT_KIND, "csum"); + offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS); + { + struct tc_csum parm = { .action = TC_ACT_PIPE, + .update_flags = flags }; + + nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm); + } + nl_msg_end_nested(request, offset); +} + +static void +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm, + struct tc_pedit_key_ex *ex) +{ + size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));Are there unnecessary () on the line above?
No, I'll remove them.
+ size_t offset, offset_keys_ex, offset_key; + int i; + + nl_msg_put_string(request, TCA_ACT_KIND, "pedit"); + offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS); + { + parm->action = TC_ACT_PIPE; + + nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize); + offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX); + for (i = 0; i < parm->nkeys; i++, ex++) { + offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX); + nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype); + nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd); + nl_msg_end_nested(request, offset_key); + } + nl_msg_end_nested(request, offset_keys_ex); + } + nl_msg_end_nested(request, offset); +} + +static void nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio) { size_t offset; @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) { } }+/* Given flower, a key_to_pedit map entry, calculates the rest,+ * where: + * + * mask, data - pointers of where read the first word of flower->key/mask. + * current_offset - which offset to use for the first pedit action. + * cnt - max pedits actions to use. + * first_word_mask/last_word_mask - the mask to use for the first/last read + * (as we read entire words). */ static void +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m, + int *cur_offset, int *cnt, uint32_t *last_word_mask, + uint32_t *first_word_mask, uint32_t **mask, uint32_t **data) +{ + int start_offset, max_offset, total_size; + int diff, right_zero_bits, left_zero_bits; + char *rewrite_key = (void *) &flower->rewrite.key; + char *rewrite_mask = (void *) &flower->rewrite.mask; + + max_offset = m->offset + m->size; + start_offset = ROUND_DOWN(m->offset, 4); + diff = m->offset - start_offset; + total_size = max_offset - start_offset; + right_zero_bits = 8 * (4 - (max_offset % 4)); + left_zero_bits = 8 * (m->offset - start_offset); + + *cur_offset = start_offset; + *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0); + *last_word_mask = UINT32_MAX >> right_zero_bits; + *first_word_mask = UINT32_MAX << left_zero_bits; + *data = (void *) (rewrite_key + m->flower_offset - diff); + *mask = (void *) (rewrite_mask + m->flower_offset - diff);The type of *data and *mask is uint32_t *. Why not cast to that type?
It's to avoid warning of increasing pointer alignment (-Wcast-align).
+} + +static inline void +csum_update_flag(struct tc_flower *flower, + enum pedit_header_type htype) {I think the above two lines could be one.+ if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {A case statement might be nicer here.
Right, thanks.
+ flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR; + } + if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 + || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 + || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP + || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) { + if (flower->key.ip_proto == IPPROTO_TCP) { + flower->mask.ip_proto = UINT8_MAX;What if the mask was not UINT8_MAX to start with? Doesn't this create a different flow?
This is so the checksum will be strict (not setting/recalc udp checksum on non udp packets). It creates a more specific flow, a subset of the original. This is fine as datapath is free to ignore a mask, which is the same as a full mask we've set.
+ flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + } else if (flower->key.ip_proto == IPPROTO_UDP) { + flower->mask.ip_proto = UINT8_MAX; + flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + } else if (flower->key.ip_proto == IPPROTO_ICMP + || flower->key.ip_proto == IPPROTO_ICMPV6) { + flower->mask.ip_proto = UINT8_MAX; + flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP; + } + } +} + +static int +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, + struct tc_flower *flower) +{ + struct { + struct tc_pedit sel; + struct tc_pedit_key keys[MAX_PEDIT_OFFSETS]; + struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS]; + } sel = { + .sel = { + .nkeys = 0 + } + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) { + struct flower_key_to_pedit *m = &flower_pedit_map[i]; + struct tc_pedit_key *pedit_key = NULL; + struct tc_pedit_key_ex *pedit_key_ex = NULL; + uint32_t *mask, *data, first_word_mask, last_word_mask; + int cnt = 0, cur_offset = 0; + + if (!m->size) { + continue; + } + + calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask, + &first_word_mask, &mask, &data); + + for (j = 0; j < cnt; j++, mask++, data++, cur_offset += 4) { + uint32_t mask_word = *mask; + + if (j == 0) { + mask_word &= first_word_mask; + } + if (j == cnt - 1) { + mask_word &= last_word_mask; + } + if (!mask_word) { + continue; + } + if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) { + VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d", + MAX_PEDIT_OFFSETS); + return EOPNOTSUPP; + } + + pedit_key = &sel.keys[sel.sel.nkeys]; + pedit_key_ex = &sel.keys_ex[sel.sel.nkeys]; + pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET; + pedit_key_ex->htype = m->htype; + pedit_key->off = cur_offset; + pedit_key->mask = ~mask_word; + pedit_key->val = *data & mask_word; + sel.sel.nkeys++; + csum_update_flag(flower, m->htype); + } + } + nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex); + + return 0; +} + +static int nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) { size_t offset; @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) offset = nl_msg_start_nested(request, TCA_FLOWER_ACT); { uint16_t act_index = 1; + int error;+ if (flower->rewrite.rewrite) {+ act_offset = nl_msg_start_nested(request, act_index++); + error = nl_msg_put_flower_rewrite_pedits(request, flower); + if (error) { + return error; + } + nl_msg_end_nested(request, act_offset); + + act_offset = nl_msg_start_nested(request, act_index++); + nl_msg_put_act_csum(request, flower->csum_update_flags); + nl_msg_end_nested(request, act_offset); + } if (flower->set.set) { act_offset = nl_msg_start_nested(request, act_index++); nl_msg_put_act_tunnel_key_set(request, flower->set.id, @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) } } nl_msg_end_nested(request, offset); + + return 0; }static void@@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower) nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \ &flower->mask.member, sizeof flower->key.member)-static void+static int nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower) { + uint16_t host_eth_type = ntohs(flower->key.eth_type); bool is_vlan = (host_eth_type == ETH_TYPE_VLAN); + int err; + + /* need to parse acts first as some acts require changing the matching */This seems strange to me.
from the strict (normalized?) header checksum.
+ err = nl_msg_put_flower_acts(request, flower); + if (err) { + return err; + }if (is_vlan) {host_eth_type = ntohs(flower->key.encap_eth_type); @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower) nl_msg_put_flower_tunnel(request, flower); }- nl_msg_put_flower_acts(request, flower);+ return 0; }int@@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle, nl_msg_put_string(&request, TCA_KIND, "flower"); basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); { - nl_msg_put_flower_options(&request, flower); + error = nl_msg_put_flower_options(&request, flower); + + if (error) { + ofpbuf_uninit(&request); + return error; + } } nl_msg_end_nested(&request, basic_offset);diff --git a/lib/tc.h b/lib/tc.hindex 6c69b79..7876051 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -96,6 +96,7 @@ struct tc_flower_key { struct { ovs_be32 ipv4_src; ovs_be32 ipv4_dst; + uint8_t rewrite_ttl; } ipv4; struct { struct in6_addr ipv6_src; @@ -120,6 +121,14 @@ struct tc_flower { uint64_t lastused;struct {+ bool rewrite; + struct tc_flower_key key; + struct tc_flower_key mask; + } rewrite; + + uint32_t csum_update_flags; + + struct { bool set; ovs_be64 id; ovs_be16 tp_src; @@ -152,6 +161,13 @@ struct tc_flower { struct tc_cookie act_cookie; };+/* assert that if we overflow with a masked write of uint32_t to the last byte+ * of flower.rewrite we overflow inside struct flower. + * shouldn't happen unless someone moves rewrite to the end of flower */ +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite) + + MEMBER_SIZEOF(struct tc_flower, rewrite) + + sizeof(uint32_t) - 2 < sizeof(struct tc_flower)); + int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle, struct tc_flower *flower); int tc_del_filter(int ifindex, int prio, int handle); -- 2.7.5
_______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev