On Tue, Aug 29, 2017 at 08:11:23AM +0000, Darrell Ball wrote:
> 
> On 8/22/17, 11:24 PM, "Yuanhan Liu" <y...@fridaylinux.org> wrote:
> 
> From: Finn Christensen <f...@napatech.com>
> 
> The basic yet major part of this patch is to translate the "match"
> to rte flow patterns. Then we create a rte flow with a MARK action.
> Afterwards, all pkts matching the flow will have the mark id in the
> mbuf.
> 
> For any unsupported flows, such as MPLS, -1 is returned, meaning the
> flow offload has failed and is then skipped.
> 
> Signed-off-by: Finn Christensen <f...@napatech.com>
> Signed-off-by: Yuanhan Liu <y...@fridaylinux.org>
> ---
>  lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 384 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 15fef1e..8089da8 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -58,6 +58,7 @@
>  #include "smap.h"
>  #include "sset.h"
>  #include "unaligned.h"
> +#include "uuid.h"
>  #include "timeval.h"
>  #include "unixctl.h"
> 
> @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
>  }
> 
> 
> +#define MAX_RTE_FLOW_ITEMS 100
> +#define MAX_RTE_FLOW_ACTIONS 100
> 
> I guess these are temporary

Yes, the hardcoded number is really hacky.

> Do we need to do a rte query during initialization ?

Query on what?
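For context: rte_flow does not expose a "how many pattern items does this
PMD support" query, so there is no obvious capability to probe at
initialization time. The closest thing is rte_flow_validate(), which checks
whether one concrete flow could be offloaded without actually creating it.
A minimal sketch of such a probe (assuming the DPDK 17.x signature, where
port_id is still a uint8_t; the helper name is illustrative, not from the
patch):

    #include <stdbool.h>
    #include <rte_flow.h>

    /* Ask the PMD whether it could offload this pattern/action set,
     * without actually instantiating the flow. */
    static bool
    flow_is_offloadable(uint8_t port_id,
                        const struct rte_flow_attr *attr,
                        const struct rte_flow_item *items,
                        const struct rte_flow_action *actions)
    {
        struct rte_flow_error error;

        return rte_flow_validate(port_id, attr, items, actions, &error) == 0;
    }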
> 
> +
> +static int
> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> +                                 const struct match *match,
> +                                 struct nlattr *nl_actions OVS_UNUSED,
> +                                 size_t actions_len,
> +                                 const ovs_u128 *ufid,
> +                                 struct offload_info *info)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    const struct rte_flow_attr flow_attr = {
> +        .group = 0,
> +        .priority = 0,
> +        .ingress = 1,
> +        .egress = 0
> +    };
> +    struct flow_patterns {
> +        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
> +        uint32_t cnt;
> +    } patterns;
> +    struct flow_actions {
> +        struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];
> +        uint32_t cnt;
> +    } actions;
> +    struct rte_flow *flow;
> +    struct rte_flow_error error;
> +
> +    uint8_t *ipv4_next_proto_mask = NULL;
> +
> +#define ADD_FLOW_PATTERN(type_, spec_, mask_) do { \
> +    if (patterns.cnt >= MAX_RTE_FLOW_ITEMS) { \
> +        VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS); \
> +        goto err; \
> +    } \
> +    patterns.items[patterns.cnt].type = type_; \
> +    patterns.items[patterns.cnt].spec = spec_; \
> +    patterns.items[patterns.cnt].mask = mask_; \
> +    patterns.items[patterns.cnt].last = NULL; \
> +    patterns.cnt++; \
> +} while (0)
> 
> static (inline) function maybe ?

Indeed. I'm not a big fan of macros like this. Let me turn it into a
function next time. I see no reason to make it inline (explicitly) though:
it's not in the data path. Moreover, it's likely to be inlined implicitly
by the compiler.

> 
> +
> +#define ADD_FLOW_ACTION(type_, conf_) do { \
> +    actions.actions[actions.cnt].type = type_; \
> +    actions.actions[actions.cnt].conf = conf_; \
> +    actions.cnt++; \
> +    ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \
> +} while (0)
> 
> static (inline) function maybe ?

Ditto.
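A sketch of the function form mentioned above (assuming the flow_patterns
struct is lifted out of the function so it can be passed around; the name
is illustrative, not from the patch):

    /* Bounds-checked append of one rte_flow pattern item. */
    static int
    add_flow_pattern(struct flow_patterns *patterns,
                     enum rte_flow_item_type type,
                     const void *spec, const void *mask)
    {
        if (patterns->cnt >= MAX_RTE_FLOW_ITEMS) {
            VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);
            return -1;
        }

        patterns->items[patterns->cnt].type = type;
        patterns->items[patterns->cnt].spec = spec;
        patterns->items[patterns->cnt].mask = mask;
        patterns->items[patterns->cnt].last = NULL;
        patterns->cnt++;

        return 0;
    }

Callers would then check the return value and goto err themselves, which
also removes the macro's hidden control flow.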
> 
> +
> +    patterns.cnt = 0;
> +    actions.cnt = 0;
> +
> +    /* Eth */
> +    struct rte_flow_item_eth eth_spec;
> +    struct rte_flow_item_eth eth_mask;
> +    memset(&eth_mask, 0, sizeof(eth_mask));
> +    if (match->wc.masks.dl_src.be16[0] ||
> +        match->wc.masks.dl_src.be16[1] ||
> +        match->wc.masks.dl_src.be16[2] ||
> +        match->wc.masks.dl_dst.be16[0] ||
> +        match->wc.masks.dl_dst.be16[1] ||
> +        match->wc.masks.dl_dst.be16[2]) {
> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
> +        eth_spec.type = match->flow.dl_type;
> +
> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst));
> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src));
> +        eth_mask.type = match->wc.masks.dl_type;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
> +    } else {
> +        /*
> +         * If a user specifies a flow (like a UDP flow) without L2 patterns,
> +         * OVS will at least set the dl_type. Normally, it's enough to
> +         * create an eth pattern just with it. Unluckily, some Intel NICs
> +         * (such as the XL710) don't support that. Below is a workaround,
> +         * which simply matches any L2 pkts.
> +         */
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
> +    }
> +
> +    /* VLAN */
> +    struct rte_flow_item_vlan vlan_spec;
> +    struct rte_flow_item_vlan vlan_mask;
> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
> +        vlan_spec.tci = match->flow.vlans[0].tci;
> +        vlan_mask.tci = match->wc.masks.vlans[0].tci;
> +
> +        /* match any protocols */
> +        vlan_mask.tpid = 0;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask);
> +    }
> +
> +    /* IP v4 */
> +    uint8_t proto = 0;
> +    struct rte_flow_item_ipv4 ipv4_spec;
> +    struct rte_flow_item_ipv4 ipv4_mask;
> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> +    if ((match->flow.dl_type == htons(ETH_TYPE_IP)) &&
> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> +         match->wc.masks.nw_proto)) {
> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
> +
> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask);
> +
> +        /* Save proto for L4 protocol setup */
> +        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
> +
> +        /* Remember proto mask address for later modification */
> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> +    }
> +
> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP &&
> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        VLOG_INFO("L4 protocol (%u) not supported", proto);
> +        goto err;
> +    }
> +
> +    struct rte_flow_item_udp udp_spec;
> +    struct rte_flow_item_udp udp_mask;
> +    memset(&udp_mask, 0, sizeof(udp_mask));
> +    if ((proto == IPPROTO_UDP) &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        udp_spec.hdr.src_port = match->flow.tp_src;
> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask);
> +
> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_sctp sctp_spec;
> +    struct rte_flow_item_sctp sctp_mask;
> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
> +    if ((proto == IPPROTO_SCTP) &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        sctp_spec.hdr.src_port = match->flow.tp_src;
> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask);
> +
> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_icmp icmp_spec;
> +    struct rte_flow_item_icmp icmp_mask;
> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
> +    if ((proto == IPPROTO_ICMP) &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
> +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
> +
> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
> +        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask);
> +
> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_tcp tcp_spec;
> +    struct rte_flow_item_tcp tcp_mask;
> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
> +    if ((proto == IPPROTO_TCP) &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
> +
> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
> +
> +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask);
> +
> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
> +
> +    struct rte_flow_action_mark mark;
> +    if (actions_len) {
> +        mark.id = info->flow_mark;
> +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);
> +    } else {
> +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);
> +        VLOG_INFO("no action given; drop pkts in hardware\n");
> +    }
> +    ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);
> +
> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
> +                           actions.actions, &error);
> +    if (!flow) {
> +        VLOG_ERR("rte flow create error: %u : message : %s\n",
> +                 error.type, error.message);
> +        goto err;
> +    }
> +    add_ufid_dpdk_flow_mapping(ufid, flow);
> +    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
> +              flow, UUID_ARGS((struct uuid *)ufid));
> +
> +    return 0;
> +
> +err:
> +    return -1;
> +}
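For reference, on the receive side the MARK action shows up in the mbuf as
the commit message describes. With the DPDK 17.x mbuf layout, a marked
packet carries PKT_RX_FDIR_ID in ol_flags and the mark id in
mbuf->hash.fdir.hi; a minimal sketch of recovering it (hypothetical helper,
not part of this patch):

    #include <stdbool.h>
    #include <rte_mbuf.h>

    /* Recover the flow mark installed via RTE_FLOW_ACTION_TYPE_MARK.
     * Returns true and stores the mark id if the packet was marked. */
    static bool
    dpdk_mbuf_flow_mark(const struct rte_mbuf *mbuf, uint32_t *mark)
    {
        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
            *mark = mbuf->hash.fdir.hi;
            return true;
        }
        return false;
    }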
> +
> +/*
> + * Validate for later rte flow offload creation. If any unsupported
> + * flows are specified, return -1.
> + */
> +static int
> +netdev_dpdk_validate_flow(const struct match *match)
> +{
> +    struct match match_zero_wc;
> +
> +    /* Create a wc-zeroed version of flow */
> +    match_init(&match_zero_wc, &match->flow, &match->wc);
> +
> +#define CHECK_NONZERO(addr, size) do { \
> +    uint8_t *padr = (uint8_t *)(addr); \
> +    int i; \
> +    for (i = 0; i < (size); i++) { \
> +        if (padr[i] != 0) { \
> +            goto err; \
> +        } \
> +    } \
> +} while (0)
> +
> +#define CHECK_NONZEROSIMPLE(var) do { \
> 
> CHECK_NONZERO ?

Yep, it's better (as it's shorter; also, SIMPLE is too vague).

> +    if ((var) != 0) { \
> +        goto err; \
> +    } \
> +} while (0)
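To make the rename exchange concrete, a sketch of what the two checks could
converge to (hypothetical v2 spelling, per the comments above):

    /* Fail if any byte in [addr, addr + size) is nonzero. */
    #define CHECK_NONZERO_BYTES(addr, size) do { \
        const uint8_t *padr = (const uint8_t *)(addr); \
        int i; \
        for (i = 0; i < (size); i++) { \
            if (padr[i] != 0) { \
                goto err; \
            } \
        } \
    } while (0)

    /* Fail if a single scalar field is nonzero. */
    #define CHECK_NONZERO(var) do { \
        if ((var) != 0) { \
            goto err; \
        } \
    } while (0)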
> +
> +    CHECK_NONZERO(&match_zero_wc.flow.tunnel,
> +                  sizeof(match_zero_wc.flow.tunnel));
> +    CHECK_NONZEROSIMPLE(match->wc.masks.metadata);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);
> +
> +    /* recirc id must be zero */
> +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);
> +
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);
> +    CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);
> +
> +    /* unsupported L2 */
> +    CHECK_NONZERO(&match->wc.masks.mpls_lse,
> +                  sizeof(match_zero_wc.flow.mpls_lse));
> +
> +    /* unsupported L3 */
> +    CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr));
> +    CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr));
> +    CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);
> +    CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr));
> +    CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr));
> +    CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr));
> 
> CHECK_NONZERO_BYTES ?

Again, better. Thanks!

> +
> +    /* If fragmented, then don't HW accelerate - for now */
> +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);
> +
> +    /* unsupported L4 */
> +    CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);
> +
> +    return 0;
> +
> +err:
> +    VLOG_INFO("Cannot HW accelerate this flow");
> +    return -1;
> +}
> +
> +static int
> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> +                             const ovs_u128 *ufid,
> +                             struct rte_flow *rte_flow)
> +{
> +    struct rte_flow_error error;
> +    int ret;
> +
> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> +    if (ret == 0) {
> +        del_ufid_dpdk_flow_mapping(ufid);
> +        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",
> +                  rte_flow, UUID_ARGS((struct uuid *)ufid));
> +    } else {
> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> +                 error.type, error.message);
> +    }
> +
> +    return ret;
> +}
> +
> +static int
> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> +                     struct nlattr *actions, size_t actions_len,
> +                     const ovs_u128 *ufid, struct offload_info *info,
> +                     struct dpif_flow_stats *stats OVS_UNUSED)
> +{
> +    struct rte_flow *rte_flow;
> +    int ret;
> +
> +    /*
> +     * If an old rte_flow exists, it means it's a flow modification.
> +     * Here destroy the old rte flow first before adding a new one.
> +     */
> +    rte_flow = get_rte_flow_by_ufid(ufid);
> +    if (rte_flow) {
> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                           ufid, rte_flow);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    ret = netdev_dpdk_validate_flow(match);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> +                                            actions_len, ufid, info);
> +}
> 
> Would you mind adding some comments for the NULL elements below? Like
> which method each one points to?

	--yliu

> +
> +#define DPDK_FLOW_OFFLOAD_API \
> +    NULL,                     \
> +    NULL,                     \
> +    NULL,                     \
> +    NULL,                     \
> +    netdev_dpdk_flow_put,     \
> +    NULL,                     \
> +    NULL,                     \
> +    NULL
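Since the question above asks what each NULL slot points to: the layout
appears to follow NO_OFFLOAD_API in lib/netdev-provider.h. A commented
sketch of the same macro, assuming that slot order (flow_flush through
init_flow_api):

    #define DPDK_FLOW_OFFLOAD_API                       \
        NULL,                   /* flow_flush */        \
        NULL,                   /* flow_dump_create */  \
        NULL,                   /* flow_dump_destroy */ \
        NULL,                   /* flow_dump_next */    \
        netdev_dpdk_flow_put,   /* flow_put */          \
        NULL,                   /* flow_get */          \
        NULL,                   /* flow_del */          \
        NULL                    /* init_flow_api */

Note that netdev_dpdk_destroy_rte_flow() exists in this patch but is not
wired up as flow_del here; presumably a later patch in the series does that.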
> +
> +
>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,   \
>                            SET_CONFIG, SET_TX_MULTIQ, SEND,   \
>                            GET_CARRIER, GET_STATS,            \
> @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
>      RXQ_RECV,                                                \
>      NULL,                       /* rx_wait */                \
>      NULL,                       /* rxq_drain */              \
> -    NO_OFFLOAD_API                                           \
> +    DPDK_FLOW_OFFLOAD_API                                    \
>  }
> 
>  static const struct netdev_class dpdk_class =
> --
> 2.7.4

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev