Thanks Joe for your feedback. I'm continuing this discussion on the v2 thread http://patchwork.ozlabs.org/patch/767007/ because this patch didn't show up on patchwork, due to my wrong git setting.
Antonio > -----Original Message----- > From: Joe Stringer [mailto:[email protected]] > Sent: Thursday, May 25, 2017 9:01 PM > To: Fischetti, Antonio <[email protected]> > Cc: ovs dev <[email protected]> > Subject: Re: [ovs-dev] [PATCH RFC] Conntrack: Avoid recirculation for > established connections. > > On 25 May 2017 at 04:00, <[email protected]> wrote: > > From: Antonio Fischetti <[email protected]> > > > > With conntrack enabled, packets get recirculated and this impacts > > the performance with thousands of active concurrent connections. > > > > This patch is aimed at avoiding recirculation for packets belonging to > > established connections in steady state. This is achieved by > > manipulating the megaflows and actions. In this case the actions of the > > megaflow of recirculated packet is initially updated with new 'CT_SKIP' > > action and later updated with the actions of the megaflow of the > > original packet(to handle zones). There after the EMC entry > > belonging to the original packet is updated to point to the 'megaflow' > of > > the recirculated packet there by avoiding recirculation. > > > > Signed-off-by: Antonio Fischetti <[email protected]> > > Signed-off-by: Bhanuprakash Bodireddy <[email protected]> > > Co-authored-by: Bhanuprakash Bodireddy > <[email protected]> > > --- > > Hi Antonio, Bhanuprakash, > > A few high level comments: > > I'm a bit confused about how this is logically supposed to satisfy the > API we have exposed at OpenFlow. Furthermore it concerns me that this > may only improve a specific micro-benchmark but then end up not > particularly useful as soon as you get real-world mixed traffic. The > implication of this would be that we make the code more complex, with > more cases where behaviour can differ, for little to no benefit. > > Recirculate is not only used for connection tracking, but also things > like MPLS. Here it seems that you're modifying the recirculate action > EMC fastpath execution to perform shortcut CT, which could plausibly > end up making conntrack state visible to the OpenFlow pipeline just > because someone did a pop_mpls(0x0800). > > Having "ct_state=+est" doesn't mean that conntrack action becomes a > no-op. Controllers may program (or re-program) a pipeline to later > change things like labels or mark for a connection, and in particular > could use the OVS_CT_ATTR_FORCE_COMMIT flag to change direction of a > connection and so on. This is explicitly provided to deal with > established connections. Or if the next ct action occurs in a > different zone, or modifies other ct_* fields, again it can't be > skipped. > > What does the test setup / flows look like? > > I picked on a couple of details further below. > > > - ~10% performance improvement is observed in our testing with UDP > streams. > > - Limited testing is done with RFC patch and the tests include hundreds > of > > concurrent active TCP connections. > > - This is very early implementation and we are posting here to get > initial > > feedback. > > - This RFC patch is implemented leveraging EMC, but optimization could > be > > extended to dpcls as well to handle higher flows. > > - No VXLAN testing is done with this patch. > > > > datapath/linux/compat/include/linux/openvswitch.h | 1 + > > lib/dpif-netdev.c | 148 > ++++++++++++++++++++-- > > lib/packets.h | 2 + > > 3 files changed, 143 insertions(+), 8 deletions(-) > > > > diff --git a/datapath/linux/compat/include/linux/openvswitch.h > b/datapath/linux/compat/include/linux/openvswitch.h > > index d22102e..6dc5674 100644 > > --- a/datapath/linux/compat/include/linux/openvswitch.h > > +++ b/datapath/linux/compat/include/linux/openvswitch.h > > @@ -762,6 +762,7 @@ enum ovs_ct_attr { > > OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ > > OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ > > OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ > > + OVS_CT_ATTR_SKIP, /* No argument, does not invoke CT. */ > > __OVS_CT_ATTR_MAX > > }; > > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index b50164b..fbbb42e 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -2101,7 +2101,8 @@ emc_probabilistic_insert(struct > dp_netdev_pmd_thread *pmd, > > } > > > > static inline struct dp_netdev_flow * > > -emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key) > > +emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key, > > + void **pfound_entry) > > { > > struct emc_entry *current_entry; > > > > @@ -2110,11 +2111,13 @@ emc_lookup(struct emc_cache *cache, const struct > netdev_flow_key *key) > > && emc_entry_alive(current_entry) > > && netdev_flow_key_equal_mf(¤t_entry->key, &key->mf)) > { > > > > + *pfound_entry = current_entry; > > /* We found the entry with the 'key->mf' miniflow */ > > return current_entry->flow; > > } > > } > > > > + *pfound_entry = NULL; > > return NULL; > > } > > > > @@ -4583,10 +4586,12 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, > > key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > > > > /* If EMC is disabled skip emc_lookup */ > > - flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key); > > + flow = (cur_min == 0) ? NULL: > > + emc_lookup(flow_cache, key, &packet- > >md.prev_emc_entry); > > if (OVS_LIKELY(flow)) { > > dp_netdev_queue_batches(packet, flow, &key->mf, batches, > > n_batches); > > + packet->md.prev_flow = flow; > > } else { > > /* Exact match cache missed. Group missed packets together > at > > * the beginning of the 'packets' array. */ > > @@ -4595,6 +4600,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, > > * must be returned to the caller. The next key should be > extracted > > * to 'keys[n_missed + 1]'. */ > > key = &keys[++n_missed]; > > + packet->md.prev_flow = NULL; > > } > > } > > > > @@ -4604,6 +4610,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, > > return dp_packet_batch_size(packets_); > > } > > > > +#define CT_PREPEND_ACTION_LEN 0x0C > > +#define CT_PREPEND_ACTION_SPARE_LEN 32 > > +#define CT_TOT_ACTIONS_LEN (CT_PREPEND_ACTION_LEN + > CT_PREPEND_ACTION_SPARE_LEN) > > static inline void > > handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, > > struct dp_packet *packet, > > @@ -4663,12 +4672,44 @@ handle_packet_upcall(struct dp_netdev_pmd_thread > *pmd, > > ovs_mutex_lock(&pmd->flow_mutex); > > netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); > > if (OVS_LIKELY(!netdev_flow)) { > > - netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, > > - add_actions->data, > > - add_actions->size); > > + /* Recirculated packets that belong to established > connections > > + * are treated differently. A place-holder is prepended to > the > > + * list of actions. */ > > + if (!(packet->md.ct_state & CS_INVALID) && > > + (packet->md.ct_state & CS_TRACKED) && > > + (packet->md.ct_state & CS_ESTABLISHED) && > > + (packet->md.recirc_id)) { > > Just because the value of ct_state in the packet metadata of this > particular packet happens to have those particular state flags doesn't > mean that the OpenFlow pipeline is enforcing this constraint for all > packets hitting this flow. You would have to ensure that the mask in > the flow is also ensuring that all future packets hitting the new flow > will also have that particular ct_state. > > > + > > + /* Prepend an OVS_CT_ATTR_SKIP to the list of actions > inside > > + * add_actions. It does not really invoke the > ConnTrack module, > > + * it's a Ct action that works as a place-holder. It > will be > > + * overwritten inside emc_patch_established_conn() with > > + * the proper ct(zone=X) action. */ > > + struct dp_netdev_actions *new_actions; > > + uint8_t tmp_action[CT_TOT_ACTIONS_LEN] = { > > + 0x0C, 0x00, 0x0C, 0x00, > > + 0x06, 0x00, 0x09, 0x00, > > + 0x00, 0x00, 0x00, 0x00, > > + /* will append here add_actions. */ > > + }; > > + memcpy(&tmp_action[CT_PREPEND_ACTION_LEN], > > + add_actions->data, > > + add_actions->size); > > + new_actions = dp_netdev_actions_create( > > + (const struct nlattr *)tmp_action, > > + CT_PREPEND_ACTION_LEN + add_actions->size); > > Hmm. In some ways I think this would be cleaner to be implemented in > the xlate code, around compose_conntrack_action().. but that code is > currently pretty much dataplane-oblivious and this suggestion is a > userspace-specific optimization. > > > + netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, > > + new_actions->actions, > > + new_actions->size); > > + } else { > > + netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, > > + add_actions->data, > > + add_actions->size); > > + } > > } > > ovs_mutex_unlock(&pmd->flow_mutex); > > - emc_probabilistic_insert(pmd, key, netdev_flow); > > + /* For this RFC, probabilistic emc insertion is disabled. */ > > + emc_insert(&pmd->flow_cache, key, netdev_flow); > > } > > } > > > > @@ -4761,7 +4802,8 @@ fast_path_processing(struct dp_netdev_pmd_thread > *pmd, > > > > flow = dp_netdev_flow_cast(rules[i]); > > > > - emc_probabilistic_insert(pmd, &keys[i], flow); > > + /* For testing purposes the emc_insert is restored here. */ > > + emc_insert(&pmd->flow_cache, &keys[i], flow); > > dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, > n_batches); > > } > > > > @@ -4998,6 +5040,78 @@ dp_execute_userspace_action(struct > dp_netdev_pmd_thread *pmd, > > } > > } > > > > +/* Search into EMC to retrieve the entry related to the original packet > > + * and the entry related to the recirculated packet. > > + * If both EMC entries are alive, then: > > + * - The flow actions of the recirculated packet are updated with > > + * 'ct(zone=N)' as retrieved from the actions of the original flow. > > + * - The EMC orig entry flow is updated with the flow pointer of > recirc entry. > > + */ > > +static inline void > > +emc_patch_established_conn(struct dp_netdev_pmd_thread *pmd, > > + struct dp_packet *packet, long long now OVS_UNUSED) > > +{ > > + struct emc_cache *cache = &pmd->flow_cache; > > + struct netdev_flow_key key; > > + struct emc_entry *orig_entry; /* EMC entry hit by the original > packet. */ > > + struct emc_entry *recirc_entry; /* EMC entry for recirculated > packet. */ > > + uint32_t old_hash; > > + > > + if (!packet->md.prev_emc_entry) { > > + return; > > + } > > + > > + orig_entry = packet->md.prev_emc_entry; > > + if (!emc_entry_alive(orig_entry)) { > > + return; > > + } > > + > > + /* Check that the original EMC entry was not overwritten. */ > > + struct dp_netdev_flow *orig_flow = orig_entry->flow; > > + if (packet->md.prev_flow && (packet->md.prev_flow != orig_flow)) { > > + return; > > + } > > + > > + /* TODO as we are calling miniflow_extract now, can we avoid to > invoke > > + * it again when we'll classify this recirculated packet? */ > > + miniflow_extract(packet, &key.mf); > > + key.len = 0; /* Not computed yet. */ > > + old_hash = dp_packet_get_rss_hash(packet); > > + key.hash = dpif_netdev_packet_get_rss_hash(packet, NULL); > > + dp_packet_set_rss_hash(packet, old_hash); > > + > > + EMC_FOR_EACH_POS_WITH_HASH(cache, recirc_entry, key.hash) { > > + if (recirc_entry->key.hash == key.hash > > + && emc_entry_alive(recirc_entry) > > + && netdev_flow_key_equal_mf(&recirc_entry->key, &key.mf)) { > > + > > + if (orig_entry->flow == recirc_entry->flow) { > > + /* Nothing to do, already patched by a packet of this > > + * same batch. */ > > + return; > > + } > > + /* We found the entry related to the recirculated packet. > > + * Retrieve the actions for orig and recirc entries. * */ > > + struct dp_netdev_actions * orig_actions = > > + dp_netdev_flow_get_actions(orig_entry->flow); > > + struct dp_netdev_actions * recirc_actions = > > + dp_netdev_flow_get_actions(recirc_entry->flow); > > + > > + /* The orig entry action contains a CT action with the zone > info. */ > > + struct nlattr *p = &orig_actions->actions[0]; > > + > > + /* Overwrite the CT Skip action of recirc entry with > ct(zone=N). */ > > + memcpy(recirc_actions->actions, p, p->nla_len); > > + > > + /* Update orig EMC entry. */ > > + orig_entry->flow = recirc_entry->flow; > > + dp_netdev_flow_ref(orig_entry->flow); > > + > > + return; > > + } > > + } > > +} > > + > > static void > > dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, > > const struct nlattr *a, bool may_steal) > > @@ -5024,7 +5138,6 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > } else { > > tx_qid = pmd->static_tx_qid; > > } > > - > > netdev_send(p->port->netdev, tx_qid, packets_, may_steal, > > dynamic_txqs); > > return; > > Unrelated style change. > > > @@ -5145,10 +5258,21 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > struct dp_packet *packet; > > DP_PACKET_BATCH_FOR_EACH (packet, packets_) { > > packet->md.recirc_id = nl_attr_get_u32(a); > > + > > + if (!(packet->md.ct_state & CS_INVALID) && > > + (packet->md.ct_state & CS_TRACKED) && > > + (packet->md.ct_state & CS_ESTABLISHED)) { > > + (*depth)++; > > + emc_patch_established_conn(pmd, packet, now); > > + (*depth)--; > > + } > > } > > > > (*depth)++; > > dp_netdev_recirculate(pmd, packets_); > > + DP_PACKET_BATCH_FOR_EACH (packet, packets_) { > > + packet->md.recirc_id = 0; > > + } > > (*depth)--; > > > > return; > > Can you give some more detail about why resetting the recirc_id is > correct here? Could/should this be applied as an independent change > regardless of the rest of this series? > > > @@ -5166,6 +5290,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > const char *helper = NULL; > > const uint32_t *setmark = NULL; > > const struct ovs_key_ct_labels *setlabel = NULL; > > + bool skip_ct = false; > > > > NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), > > nl_attr_get_size(a)) { > > @@ -5194,6 +5319,9 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > /* Silently ignored, as userspace datapath does not > generate > > * netlink events. */ > > break; > > + case OVS_CT_ATTR_SKIP: > > + skip_ct = true; > > + break; > > case OVS_CT_ATTR_NAT: > > case OVS_CT_ATTR_UNSPEC: > > case __OVS_CT_ATTR_MAX: > > @@ -5201,6 +5329,10 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > } > > } > > > > + if (OVS_UNLIKELY(skip_ct)) { > > + break; > > + } > > + > > conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, > force, > > commit, zone, setmark, setlabel, helper); > > break; > > diff --git a/lib/packets.h b/lib/packets.h > > index 7dbf6dd..2c46a15 100644 > > --- a/lib/packets.h > > +++ b/lib/packets.h > > @@ -112,6 +112,8 @@ struct pkt_metadata { > > struct flow_tnl tunnel; /* Encapsulating tunnel parameters. > Note that > > * if 'ip_dst' == 0, the rest of the > fields may > > * be uninitialized. */ > > + void *prev_emc_entry; /* EMC entry that this packet matched. > */ > > + void *prev_flow; /* Flow pointed by the matching EMC > entry. */ > > }; > > > > static inline void > > -- > > 2.4.11 > > > > _______________________________________________ > > dev mailing list > > [email protected] > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
