[ovs-dev] [PATCH v1 10/10] system-dpdk.at: Add DPIF test for ipv4 vxlan packet types.
This patch adds a test-case for DPIF for vxlan inner paket handling. 9: OVS-DPDK - Dpif Vxlan_decap The pcap added along with it contains all the traffic patterns in combination of vxlan tunnel. Signed-off-by: Kumar Amber --- tests/automake.mk | 1 + tests/pcap/dpif_vxlan.pcap | Bin 0 -> 488 bytes tests/system-dpdk.at | 76 + 3 files changed, 77 insertions(+) create mode 100644 tests/pcap/dpif_vxlan.pcap diff --git a/tests/automake.mk b/tests/automake.mk index 8a9151f81..aa5006e64 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -145,6 +145,7 @@ $(srcdir)/tests/fuzz-regression-list.at: tests/automake.mk EXTRA_DIST += $(MFEX_AUTOVALIDATOR_TESTS) MFEX_AUTOVALIDATOR_TESTS = \ + tests/pcap/dpif_vxlan.pcap \ tests/pcap/mfex_test.pcap \ tests/mfex_fuzzy.py diff --git a/tests/pcap/dpif_vxlan.pcap b/tests/pcap/dpif_vxlan.pcap new file mode 100644 index ..902f1b7aa898f409320495c90b59d812d8ade07d GIT binary patch literal 488 zcmca|c+)~A1{MYw`2U}Qff2~L{LLZp>kU4J1RxuP1(b|kLdzMHI%bA;a4@(sFa&{= zIS3Z@F)}f;05Pl3R-vs7c3%(jFfcGPFu!0>3QcqAn8_e0IMb!P5ojn_mkdh1Wg}ur9WuV{rRK7y^LCgFyh30z-Q# x!mlwvQ4q$k+Yf5@Bow=?%1&dmn>c@F0PO)`47;MBcD13{https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v1 07/10] mfex-study: Modify study func to select outer and inner mfex funcs.
The Mfex study function is split into outer and inner to allow for independent selection and studying of packets in outer and inner flows to different ISA optimized Mfexs. Signed-off-by: Kumar Amber --- lib/dpif-netdev-extract-study.c | 126 +--- 1 file changed, 83 insertions(+), 43 deletions(-) diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c index 71354cc4c..03d97c64e 100644 --- a/lib/dpif-netdev-extract-study.c +++ b/lib/dpif-netdev-extract-study.c @@ -30,7 +30,9 @@ static atomic_uint32_t mfex_study_pkts_count = MFEX_MAX_PKT_COUNT; /* Struct to hold miniflow study stats. */ struct study_stats { uint32_t pkt_count; +uint32_t pkt_inner_count; uint32_t impl_hitcount[MFEX_IMPL_MAX]; +uint32_t impl_inner_hitcount[MFEX_IMPL_MAX]; }; /* Define per thread data to hold the study stats. */ @@ -67,6 +69,58 @@ mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count, const char *name) return -EINVAL; } + +static inline void +mfex_reset_stats(uint32_t *impls_hitcount, uint32_t *pkt_cnt) { +/* Reset stats so that study function can be called again + * for next traffic type and optimal function ptr can be + * chosen. + */ +memset(impls_hitcount, 0, sizeof(uint32_t) * MFEX_IMPL_MAX); +*pkt_cnt = 0; +} + +static inline void +mfex_study_select_best_impls(struct dpif_miniflow_extract_impl *mfex_funcs, + uint32_t pkt_cnt, uint32_t *impls_arr, + atomic_uintptr_t *pmd_func, char *name) +{ + +uint32_t best_func_index = MFEX_IMPL_START_IDX; +uint32_t max_hits = 0; + +for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) { +if (impls_arr[i] > max_hits) { +max_hits = impls_arr[i]; +best_func_index = i; +} +} + +/* If 50% of the packets hit, enable the function. 
*/ +if (max_hits >= (mfex_study_pkts_count / 2)) { +atomic_store_relaxed(pmd_func, +(uintptr_t) mfex_funcs[best_func_index].extract_func); +VLOG_INFO("MFEX %s study chose impl %s: (hits %u/%u pkts)", + name, mfex_funcs[best_func_index].name, max_hits, + pkt_cnt); +} else { +/* Set the implementation to null for default miniflow. */ +atomic_store_relaxed(pmd_func, +(uintptr_t) mfex_funcs[MFEX_IMPL_SCALAR].extract_func); +VLOG_INFO("Not enough packets matched (%u/%u), disabling" + " optimized MFEX.", max_hits, pkt_cnt); +} + +/* In debug mode show stats for all the counters. */ +if (VLOG_IS_DBG_ENABLED()) { +for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) { +VLOG_DBG("MFEX study results for implementation %s:" + " (hits %u/%u pkts)", mfex_funcs[i].name, + impls_arr[i], pkt_cnt); +} +} +} + uint32_t mfex_study_traffic(struct dp_packet_batch *packets, struct netdev_flow_key *keys, @@ -76,10 +130,12 @@ mfex_study_traffic(struct dp_packet_batch *packets, { uint32_t hitmask = 0; uint32_t mask = 0; +uint32_t study_cnt_pkts; struct dp_netdev_pmd_thread *pmd = pmd_handle; struct dpif_miniflow_extract_impl *miniflow_funcs; struct study_stats *stats = mfex_study_get_study_stats_ptr(); miniflow_funcs = dpif_mfex_impl_info_get(); +atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts); /* Run traffic optimized miniflow_extract to collect the hitmask * to be compared after certain packets have been hit to choose @@ -93,7 +149,11 @@ mfex_study_traffic(struct dp_packet_batch *packets, hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size, in_port, pmd_handle, md_is_valid); -stats->impl_hitcount[i] += count_1bits(hitmask); +if (!md_is_valid) { +stats->impl_hitcount[i] += count_1bits(hitmask); +} else { +stats->impl_inner_hitcount[i] += count_1bits(hitmask); +} /* If traffic is not classified then we dont overwrite the keys * array in minfiflow implementations so its safe to create a @@ -102,54 +162,34 @@ mfex_study_traffic(struct dp_packet_batch *packets, 
mask |= hitmask; } -stats->pkt_count += dp_packet_batch_size(packets); - /* Choose the best implementation after a minimum packets have been * processed. */ -uint32_t study_cnt_pkts; -atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts); - -if (stats->pkt_count >= study_cnt_pkts) { -uint32_t best_func_index = MFEX_IMPL_START_IDX; -uint32_t max_hits = 0; -for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) { -if (stats->impl_hitcount[i] > max_hits) { -max_hits = stats->impl_hitcoun
[ovs-dev] [PATCH v1 05/10] dpif-mfex: Modify set/get mfex commands to include inner.
The set command in MFEX is changed as to allow the user to select different optimized mfex ISA for processing Inner packets in case of tunneling. $ ovs-appctl dpif-netdev/miniflow-parser-set -pmd 3 study 1024 -inner The get command is modified to indcitate both inner and Outer MFEXs in use. Signed-off-by: Kumar Amber Signed-off-by: Cian Ferriter Co-authored-by: Cian Ferriter --- Documentation/topics/dpdk/bridge.rst | 19 +-- lib/dpif-netdev-private-extract.c| 24 +++- lib/dpif-netdev-private-extract.h| 8 +++- lib/dpif-netdev-private-thread.h | 3 +++ lib/dpif-netdev.c| 21 ++--- 5 files changed, 64 insertions(+), 11 deletions(-) diff --git a/Documentation/topics/dpdk/bridge.rst b/Documentation/topics/dpdk/bridge.rst index ceee91015..7a442369d 100644 --- a/Documentation/topics/dpdk/bridge.rst +++ b/Documentation/topics/dpdk/bridge.rst @@ -293,13 +293,15 @@ command also shows whether the CPU supports each implementation:: An implementation can be selected manually by the following command:: $ ovs-appctl dpif-netdev/miniflow-parser-set [-pmd core_id] name \ - [study_cnt] + [study_cnt] [-recirc] -The above command has two optional parameters: ``study_cnt`` and ``core_id``. -The ``core_id`` sets a particular packet parsing function to a specific -PMD thread on the core. The third parameter ``study_cnt``, which is specific -to ``study`` and ignored by other implementations, means how many packets -are needed to choose the best implementation. +The above command has three optional parameters: ``study_cnt``, ``core_id`` +and ``-inner``. The ``core_id`` sets a particular packet parsing function +to a specific PMD thread on the core. The third parameter ``study_cnt``, +which is specific to ``study`` and ignored by other implementations, means +how many packets are needed to choose the best implementation. The fourth +parameter ``-recirc`` acts like flag which indicates to MFEX to use optimized +MFEX inner for processing tunneled inner packets. 
Also user can select the ``study`` implementation which studies the traffic for a specific number of packets by applying all available implementations of @@ -321,3 +323,8 @@ following command:: ``scalar`` can be selected on core ``3`` by the following command:: $ ovs-appctl dpif-netdev/miniflow-parser-set -pmd 3 scalar + +``study`` can be selected with packet count and explicit PMD selection along +with the ``recirc`` by following command:: + +$ ovs-appctl dpif-netdev/miniflow-parser-set -pmd 3 study 1024 -recirc diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c index 4b2f12015..c70f1fca9 100644 --- a/lib/dpif-netdev-private-extract.c +++ b/lib/dpif-netdev-private-extract.c @@ -33,6 +33,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract); /* Variable to hold the default MFEX implementation. */ static ATOMIC(miniflow_extract_func) default_mfex_func; +/* Variable to hold the default MFEX inner implementation. */ +static ATOMIC(miniflow_extract_func) default_mfex_inner_func; + /* Implementations of available extract options and * the implementations are always in order of preference. 
*/ @@ -141,16 +144,31 @@ dp_mfex_impl_get_default(void) return return_func; } +miniflow_extract_func +dp_mfex_inner_impl_get_default(void) +{ +miniflow_extract_func return_func; +atomic_uintptr_t *mfex_func = (void *)&default_mfex_inner_func; + +atomic_read_relaxed(mfex_func, (uintptr_t *) &return_func); + +return return_func; +} + int -dp_mfex_impl_set_default_by_name(const char *name) +dp_mfex_impl_set_default_by_name(const char *name, bool mfex_inner) { miniflow_extract_func new_default; atomic_uintptr_t *mfex_func = (void *)&default_mfex_func; +atomic_uintptr_t *mfex_inner_func = (void *)&default_mfex_inner_func; int err = dp_mfex_impl_get_by_name(name, &new_default); if (!err) { atomic_store_relaxed(mfex_func, (uintptr_t) new_default); +if (mfex_inner) { +atomic_store_relaxed(mfex_inner_func, (uintptr_t) new_default); +} } return err; @@ -178,6 +196,10 @@ dp_mfex_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list, if (pmd->miniflow_extract_opt == mfex_impls[i].extract_func) { ds_put_format(reply, "%u,", pmd->core_id); } +if (pmd->miniflow_extract_inner_opt == +mfex_impls[i].extract_func) { +ds_put_format(reply, "%u,", pmd->core_id); +} } ds_chomp(reply, ','); diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h index f9a757ba4..14365219e 100644 --- a/lib/dpif-netdev-private-extract.h +++ b/lib/dpif-netdev-private-extract.h @@ -128,8 +128,12 @@ dp_mfex_impl_get_by_name(const char *name, miniflow_extract_func *out_func); * ove
[ovs-dev] [PATCH v1 09/10] dpif-avx512: Add mfex inner support to avx512 dpif.
Signed-off-by: Kumar Amber --- lib/dpif-netdev-avx512.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index 5cf1264f8..439d5e09b 100644 --- a/lib/dpif-netdev-avx512.c +++ b/lib/dpif-netdev-avx512.c @@ -161,7 +161,14 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, uint32_t mf_mask = 0; miniflow_extract_func mfex_func; atomic_read_relaxed(&pmd->miniflow_extract_opt, &mfex_func); -if (mfex_func) { + +miniflow_extract_func mfex_inner_func; +atomic_read_relaxed(&pmd->miniflow_extract_inner_opt, &mfex_inner_func); + +if (md_is_valid && mfex_inner_func) { +mf_mask = mfex_inner_func(packets, keys, batch_size, in_port, pmd, + md_is_valid); +} else if (!md_is_valid && mfex_func) { mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd, md_is_valid); } -- 2.25.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v1 06/10] dpif-mfex: Change mfex fn pointer prototype to include md_is_valid.
The md_is_valid parameter is passed from DPIF to MFEX to allow mfex functions to detect the tunneling and decide the processing of Inner packets in static predictable branches. Signed-off-by: Kumar Amber --- lib/dpif-netdev-avx512.c | 3 ++- lib/dpif-netdev-extract-avx512.c | 9 + lib/dpif-netdev-extract-study.c | 6 -- lib/dpif-netdev-private-extract.c | 6 -- lib/dpif-netdev-private-extract.h | 13 - 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index fa8773331..5cf1264f8 100644 --- a/lib/dpif-netdev-avx512.c +++ b/lib/dpif-netdev-avx512.c @@ -162,7 +162,8 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, miniflow_extract_func mfex_func; atomic_read_relaxed(&pmd->miniflow_extract_opt, &mfex_func); if (mfex_func) { -mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd); +mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd, +md_is_valid); } uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1; diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index c1c1fefb6..02056b731 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -480,6 +480,7 @@ mfex_avx512_process(struct dp_packet_batch *packets, uint32_t keys_size OVS_UNUSED, odp_port_t in_port, void *pmd_handle OVS_UNUSED, +bool md_is_valid OVS_UNUSED, const enum MFEX_PROFILES profile_id, const uint32_t use_vbmi) { @@ -634,10 +635,10 @@ __attribute__((__target__("avx512vbmi"))) \ mfex_avx512_vbmi_##name(struct dp_packet_batch *packets,\ struct netdev_flow_key *keys, uint32_t keys_size,\ odp_port_t in_port, struct dp_netdev_pmd_thread \ -*pmd_handle)\ +*pmd_handle, bool md_is_valid) \ { \ return mfex_avx512_process(packets, keys, keys_size, in_port, \ - pmd_handle, profile, 1); \ + pmd_handle, md_is_valid, profile, 1);\ } \ \ uint32_t\ @@ -646,10 +647,10 @@ __attribute__((__target__("avx512vl"))) \ mfex_avx512_##name(struct dp_packet_batch *packets, \ struct 
netdev_flow_key *keys, uint32_t keys_size,\ odp_port_t in_port, struct dp_netdev_pmd_thread \ - *pmd_handle) \ + *pmd_handle, bool md_is_valid) \ { \ return mfex_avx512_process(packets, keys, keys_size, in_port, \ - pmd_handle, profile, 0); \ + pmd_handle, md_is_valid, profile, 0);\ } /* Each profile gets a single declare here, which specializes the function diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c index 69077c844..71354cc4c 100644 --- a/lib/dpif-netdev-extract-study.c +++ b/lib/dpif-netdev-extract-study.c @@ -71,7 +71,8 @@ uint32_t mfex_study_traffic(struct dp_packet_batch *packets, struct netdev_flow_key *keys, uint32_t keys_size, odp_port_t in_port, - struct dp_netdev_pmd_thread *pmd_handle) + struct dp_netdev_pmd_thread *pmd_handle, + bool md_is_valid) { uint32_t hitmask = 0; uint32_t mask = 0; @@ -90,7 +91,8 @@ mfex_study_traffic(struct dp_packet_batch *packets, } hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size, - in_port, pmd_handle); + in_port, pmd_handle, + md_is_valid); stats->impl_hitcount[i] += count_1bits(hitmask); /* If traffic is not classified then we dont overwrite the keys diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c index c70f1fca9..b26603a57 100644 --- a/lib/dpif-netdev-private-extract.c +++ b/lib/dpif-netdev-private-extract.c @@ -251,7 +251,8 @@ uint32_t dpif_miniflow_extract_autovalidator(struct dp_packet_batch *packets, struct netdev_flow_key *keys,
[ovs-dev] [PATCH v1 08/10] mfex-avx512: Add support for tunnel packets in avx512 mfex.
This patch adds the necessary support to avx512 mfex to support handling of tunnel packet type. Signed-off-by: Kumar Amber --- lib/dpif-netdev-extract-avx512.c | 45 +-- lib/dpif-netdev-private-extract.c | 5 +++- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 02056b731..69bd2d7e2 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -508,6 +508,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, DP_PACKET_BATCH_FOR_EACH (i, packet, packets) { /* If the packet is smaller than the probe size, skip it. */ const uint32_t size = dp_packet_size(packet); +const struct pkt_metadata *md = &packet->md; +bool tunnel_present = flow_tnl_dst_is_set(&md->tunnel); if (size < dp_pkt_min_size) { continue; } @@ -554,7 +556,17 @@ mfex_avx512_process(struct dp_packet_batch *packets, } __m512i v_blk0_strip = _mm512_and_si512(v_blk0, v_strp); -_mm512_storeu_si512(&blocks[2], v_blk0_strip); + +/* Handle inner meta-data if valid. */ +if (tunnel_present) { +__m512i v_tun = _mm512_loadu_si512(&md->tunnel); +_mm512_storeu_si512(&blocks[0], v_tun); +_mm512_storeu_si512(&blocks[11], v_blk0_strip); +blocks[9] = md->dp_hash | +((uint64_t) odp_to_u32(md->in_port.odp_port) << 32); +} else { +_mm512_storeu_si512(&blocks[2], v_blk0_strip); +} /* Perform "post-processing" per profile, handling details not easily * handled in the above generic AVX512 code. Examples include TCP flag @@ -566,8 +578,6 @@ mfex_avx512_process(struct dp_packet_batch *packets, break; case PROFILE_ETH_VLAN_IPV4_TCP: { -mfex_vlan_pcp(pkt[14], &keys[i].buf[4]); - uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN; struct ip_header *nh = (void *)&pkt[VLAN_ETH_HEADER_LEN]; if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4, @@ -577,25 +587,41 @@ mfex_avx512_process(struct dp_packet_batch *packets, /* Process TCP flags, and store to blocks. 
*/ const struct tcp_header *tcp = (void *)&pkt[38]; -mfex_handle_tcp_flags(tcp, &blocks[7]); +if (!tunnel_present) { +mfex_vlan_pcp(pkt[14], &keys[i].buf[4]); +mfex_handle_tcp_flags(tcp, &blocks[7]); +} else { +mfex_vlan_pcp(pkt[14], &keys[i].buf[13]); +mfex_handle_tcp_flags(tcp, &blocks[16]); +mf->map.bits[0] = 0x38a001ff; +} } break; case PROFILE_ETH_VLAN_IPV4_UDP: { -mfex_vlan_pcp(pkt[14], &keys[i].buf[4]); - uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN; struct ip_header *nh = (void *)&pkt[VLAN_ETH_HEADER_LEN]; if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4, UDP_HEADER_LEN)) { continue; } + +if (!tunnel_present) { +mfex_vlan_pcp(pkt[14], &keys[i].buf[4]); +} else { +mf->map.bits[0] = 0x38a001ff; +mfex_vlan_pcp(pkt[14], &keys[i].buf[13]); +} } break; case PROFILE_ETH_IPV4_TCP: { /* Process TCP flags, and store to blocks. */ const struct tcp_header *tcp = (void *)&pkt[34]; -mfex_handle_tcp_flags(tcp, &blocks[6]); - +if (!tunnel_present) { +mfex_handle_tcp_flags(tcp, &blocks[6]); +} else { +mfex_handle_tcp_flags(tcp, &blocks[15]); +mf->map.bits[0] = 0x18a001ff; +} /* Handle dynamic l2_pad_size. */ uint32_t size_from_ipv4 = size - sizeof(struct eth_header); struct ip_header *nh = (void *)&pkt[sizeof(struct eth_header)]; @@ -614,6 +640,9 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } +if (tunnel_present) { +mf->map.bits[0] = 0x18a001ff; +} } break; default: break; diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c index b26603a57..362463d67 100644 --- a/lib/dpif-netdev-private-extract.c +++ b/lib/dpif-netdev-private-extract.c @@ -273,7 +273,10 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch *packets, /* Run scalar miniflow_extract to get default result. */ DP_PACKET_BATCH_FOR_EACH (i, packet,
[ovs-dev] [PATCH v1 04/10] dpif-netdev-avx512: Add inner packet handling to dpif.
This patch adds the necessary changes required to support tunnel packet types in avx512 dpif. Signed-off-by: Kumar Amber Signed-off-by: Cian Ferriter Co-authored-by: Cian Ferriter --- lib/dpif-netdev-avx512.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index ef672adcf..fa8773331 100644 --- a/lib/dpif-netdev-avx512.c +++ b/lib/dpif-netdev-avx512.c @@ -80,7 +80,7 @@ dp_netdev_input_avx512_probe(void) static inline int32_t ALWAYS_INLINE dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets, - bool md_is_valid OVS_UNUSED, odp_port_t in_port) + bool md_is_valid, odp_port_t in_port) { /* Allocate DPIF userdata. */ if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) { @@ -92,6 +92,7 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, struct netdev_flow_key *keys = ud->keys; struct netdev_flow_key **key_ptrs = ud->key_ptrs; struct pkt_flow_meta *pkt_meta = ud->pkt_meta; +const uint32_t recirc_depth = *recirc_depth_get(); /* The AVX512 DPIF implementation handles rules in a way that is optimized * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is @@ -179,7 +180,9 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, /* Get packet pointer from bitmask and packet md. */ struct dp_packet *packet = packets->packets[i]; -pkt_metadata_init(&packet->md, in_port); +if (!md_is_valid) { +pkt_metadata_init(&packet->md, in_port); +} struct dp_netdev_flow *f = NULL; struct netdev_flow_key *key = &keys[i]; @@ -191,7 +194,7 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, bool mfex_hit = !!(mf_mask & (1 << i)); /* Check for a partial hardware offload match. */ -if (hwol_enabled) { +if (hwol_enabled && recirc_depth == 0) { if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &f))) { /* Packet restoration failed and it was dropped, do not * continue processing. 
*/ @@ -224,7 +227,9 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf); key->len = netdev_flow_key_size(miniflow_n_values(&key->mf)); -key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf); +key->hash = (md_is_valid == false) +? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf) +: dpif_netdev_packet_get_rss_hash(packet, &key->mf); if (emc_enabled) { f = emc_lookup(&cache->emc_cache, key); @@ -262,7 +267,8 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, * dpcls_rules[] array. */ if (dpcls_key_idx > 0) { -struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); +odp_port_t port_no = packets->packets[0]->md.in_port.odp_port; +struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, port_no); if (OVS_UNLIKELY(!cls)) { return -1; } @@ -318,7 +324,9 @@ dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, /* At this point we don't return error anymore, so commit stats here. */ uint32_t mfex_hit_cnt = __builtin_popcountll(mf_mask); -pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size); +pmd_perf_update_counter(&pmd->perf_stats, +md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV, +batch_size); pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, phwol_hits); pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT, mfex_hit_cnt); -- 2.25.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v1 03/10] dpif-netdev: Add function pointer for dpif re-circulate.
The patch adds and re-uses the dpif set command to set the function pointers to be used to switch between different inner dpifs. Signed-off-by: Kumar Amber Signed-off-by: Cian Ferriter Co-authored-by: Cian Ferriter --- lib/dpif-netdev-private-dpif.c | 53 +++- lib/dpif-netdev-private-dpif.h | 14 + lib/dpif-netdev-private-thread.h | 3 ++ lib/dpif-netdev.c| 22 +++-- 4 files changed, 83 insertions(+), 9 deletions(-) diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c index afce6947a..70281203b 100644 --- a/lib/dpif-netdev-private-dpif.c +++ b/lib/dpif-netdev-private-dpif.c @@ -37,18 +37,21 @@ enum dpif_netdev_impl_info_idx { static struct dpif_netdev_impl_info_t dpif_impls[] = { /* The default scalar C code implementation. */ [DPIF_NETDEV_IMPL_SCALAR] = { .input_func = dp_netdev_input, + .recirc_func = dp_netdev_recirculate, .probe = NULL, .name = "dpif_scalar", }, #if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__) /* Only available on x86_64 bit builds with SSE 4.2 used for OVS core. */ [DPIF_NETDEV_IMPL_AVX512] = { .input_func = dp_netdev_input_avx512, + .recirc_func = dp_netdev_input_avx512_recirc, .probe = dp_netdev_input_avx512_probe, .name = "dpif_avx512", }, #endif }; static dp_netdev_input_func default_dpif_func; +static dp_netdev_recirc_func default_dpif_recirc_func; dp_netdev_input_func dp_netdev_impl_get_default(void) @@ -79,6 +82,35 @@ dp_netdev_impl_get_default(void) return default_dpif_func; } +dp_netdev_recirc_func +dp_netdev_recirc_impl_get_default(void) +{ +/* For the first call, this will be NULL. Compute the compile time default. + */ +if (!default_dpif_recirc_func) { +int dpif_idx = DPIF_NETDEV_IMPL_SCALAR; + +/* Configure-time overriding to run test suite on all implementations. */ +#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__) +#ifdef DPIF_AVX512_DEFAULT +dp_netdev_input_func_probe probe; + +/* Check if the compiled default is compatible. 
*/ +probe = dpif_impls[DPIF_NETDEV_IMPL_AVX512].probe; +if (!probe || !probe()) { +dpif_idx = DPIF_NETDEV_IMPL_AVX512; +} +#endif +#endif + +VLOG_INFO("Default re-circulate DPIF implementation is %s.\n", + dpif_impls[dpif_idx].name); +default_dpif_recirc_func = dpif_impls[dpif_idx].recirc_func; +} + +return default_dpif_recirc_func; +} + void dp_netdev_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list, size_t n) @@ -114,10 +146,12 @@ dp_netdev_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list, * returns the function pointer to the one requested by "name". */ static int32_t -dp_netdev_impl_get_by_name(const char *name, dp_netdev_input_func *out_func) +dp_netdev_impl_get_by_name(const char *name, dp_netdev_input_func *dpif_func, + dp_netdev_recirc_func *dpif_recirc_func) { ovs_assert(name); -ovs_assert(out_func); +ovs_assert(dpif_func); +ovs_assert(dpif_recirc_func); uint32_t i; @@ -127,11 +161,13 @@ dp_netdev_impl_get_by_name(const char *name, dp_netdev_input_func *out_func) if (dpif_impls[i].probe) { int probe_err = dpif_impls[i].probe(); if (probe_err) { -*out_func = NULL; +*dpif_func = NULL; +*dpif_recirc_func = NULL; return probe_err; } } -*out_func = dpif_impls[i].input_func; +*dpif_func = dpif_impls[i].input_func; +*dpif_recirc_func = dpif_impls[i].recirc_func; return 0; } } @@ -142,12 +178,15 @@ dp_netdev_impl_get_by_name(const char *name, dp_netdev_input_func *out_func) int32_t dp_netdev_impl_set_default_by_name(const char *name) { -dp_netdev_input_func new_default; +dp_netdev_input_func new_dpif_default; +dp_netdev_recirc_func new_dpif_recirc_default; -int32_t err = dp_netdev_impl_get_by_name(name, &new_default); +int32_t err = dp_netdev_impl_get_by_name(name, &new_dpif_default, + &new_dpif_recirc_default); if (!err) { -default_dpif_func = new_default; +default_dpif_func = new_dpif_default; +default_dpif_recirc_func = new_dpif_recirc_default; } return err; diff --git a/lib/dpif-netdev-private-dpif.h 
b/lib/dpif-netdev-private-dpif.h index 0d18b748e..dcd5273f2 100644 --- a/lib/dpif-netdev-private-dpif.h +++ b/lib/dpif-netdev-private-dpif.h @@ -36,6 +36,12 @@ typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets, odp_port_t port_no); +/* Typedef f
[ovs-dev] [PATCH v1 01/10] dpif-netdev: Refactor recirc data allocation and hash fn.
The patch removes static allocation of recirc_depth parameter and moves the hash function to dpcls to allow for re-use by avx512 dpif and others. Signed-off-by: Kumar Amber Signed-off-by: Cian Ferriter Co-authored-by: Cian Ferriter --- lib/dpif-netdev-private-dpcls.h | 23 +++ lib/dpif-netdev-private-dpif.c | 2 ++ lib/dpif-netdev-private-dpif.h | 9 + lib/dpif-netdev.c | 29 ++--- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/lib/dpif-netdev-private-dpcls.h b/lib/dpif-netdev-private-dpcls.h index 0d5da73c7..a86ea449b 100644 --- a/lib/dpif-netdev-private-dpcls.h +++ b/lib/dpif-netdev-private-dpcls.h @@ -25,6 +25,7 @@ #include "cmap.h" #include "openvswitch/thread.h" +#include "dpif-netdev-private-dpif.h" #ifdef __cplusplus extern "C" { @@ -124,6 +125,28 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, return hash; } +static inline uint32_t +dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, +const struct miniflow *mf) +{ +uint32_t hash; + +if (OVS_LIKELY(dp_packet_rss_valid(packet))) { +hash = dp_packet_get_rss_hash(packet); +} else { +hash = miniflow_hash_5tuple(mf, 0); +dp_packet_set_rss_hash(packet, hash); +} + +/* The RSS hash must account for the recirculation depth to avoid + * collisions in the exact match cache */ +uint32_t recirc_depth = *recirc_depth_get(); +if (OVS_UNLIKELY(recirc_depth)) { +hash = hash_finish(hash, recirc_depth); +} +return hash; +} + /* Allow other implementations to call dpcls_lookup() for subtable search. 
*/ bool dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[], diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c index 84d4ec156..4fe67a3c5 100644 --- a/lib/dpif-netdev-private-dpif.c +++ b/lib/dpif-netdev-private-dpif.c @@ -153,3 +153,5 @@ dp_netdev_impl_set_default_by_name(const char *name) return err; } + +DEFINE_EXTERN_PER_THREAD_DATA(recirc_depth, 0); diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h index 0da639c55..0e29a02db 100644 --- a/lib/dpif-netdev-private-dpif.h +++ b/lib/dpif-netdev-private-dpif.h @@ -18,6 +18,11 @@ #define DPIF_NETDEV_PRIVATE_DPIF_H 1 #include "openvswitch/types.h" +#include "ovs-thread.h" + + +#define MAX_RECIRC_DEPTH 6 +DECLARE_EXTERN_PER_THREAD_DATA(uint32_t, recirc_depth); /* Forward declarations to avoid including files. */ struct dp_netdev_pmd_thread; @@ -76,4 +81,8 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets, odp_port_t in_port); +int32_t +dp_netdev_recirculate(struct dp_netdev_pmd_thread *, + struct dp_packet_batch *); + #endif /* netdev-private.h */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 720818e30..829f1dedf 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -98,8 +98,6 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev); #define FLOW_DUMP_MAX_BATCH 50 /* Use per thread recirc_depth to prevent recirculation loop. */ -#define MAX_RECIRC_DEPTH 6 -DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) /* Use instant packet send by default. 
*/ #define DEFAULT_TX_FLUSH_INTERVAL 0 @@ -547,8 +545,6 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, const struct flow *flow, const struct nlattr *actions, size_t actions_len); -static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *, - struct dp_packet_batch *); static void dp_netdev_disable_upcall(struct dp_netdev *); static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); @@ -7789,28 +7785,6 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, actions, wc, put_actions, dp->upcall_aux); } -static inline uint32_t -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, -const struct miniflow *mf) -{ -uint32_t hash, recirc_depth; - -if (OVS_LIKELY(dp_packet_rss_valid(packet))) { -hash = dp_packet_get_rss_hash(packet); -} else { -hash = miniflow_hash_5tuple(mf, 0); -dp_packet_set_rss_hash(packet, hash); -} - -/* The RSS hash must account for the recirculation depth to avoid - * collisions in the exact match cache */ -recirc_depth = *recirc_depth_get_unsafe(); -if (OVS_UNLIKELY(recirc_depth)) { -hash = hash_finish(hash, recirc_depth); -} -return hash; -} - struct packet_batch_per_flow { unsigned int byte_count; uint16_t tcp_flags; @@ -8497,11 +8471,12 @@ dp_netdev_input(struct dp_netdev_pmd_thread *pmd, return 0; } -static void +int32
[ovs-dev] [PATCH v1 02/10] dpif-netdev-avx512: Refactor avx512 dpif and create new APIs.
This Patch creates new APIs for avx512 dpif. Signed-off-by: Kumar Amber Signed-off-by: Cian Ferriter Co-authored-by: Cian Ferriter --- lib/dpif-netdev-avx512.c | 32 +++- lib/dpif-netdev-private-dpif.c | 4 ++-- lib/dpif-netdev-private-dpif.h | 12 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index b7131ba3f..ef672adcf 100644 --- a/lib/dpif-netdev-avx512.c +++ b/lib/dpif-netdev-avx512.c @@ -59,8 +59,13 @@ struct dpif_userdata { struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST]; }; +static int32_t +dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + bool md_is_valid, odp_port_t in_port); + int32_t -dp_netdev_input_outer_avx512_probe(void) +dp_netdev_input_avx512_probe(void) { bool avx512f_available = cpu_has_isa(OVS_CPU_ISA_X86_AVX512F); bool bmi2_available = cpu_has_isa(OVS_CPU_ISA_X86_BMI2); @@ -72,10 +77,10 @@ dp_netdev_input_outer_avx512_probe(void) return 0; } -int32_t -dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, - struct dp_packet_batch *packets, - odp_port_t in_port) +static inline int32_t ALWAYS_INLINE +dp_netdev_input_avx512__(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + bool md_is_valid OVS_UNUSED, odp_port_t in_port) { /* Allocate DPIF userdata. 
*/ if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) { @@ -380,5 +385,22 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, return 0; } +int32_t +dp_netdev_input_avx512(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + odp_port_t in_port) +{ +int ret = dp_netdev_input_avx512__(pmd, packets, false, in_port); +return ret; +} + +int32_t +dp_netdev_input_avx512_recirc(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets) +{ +int ret = dp_netdev_input_avx512__(pmd, packets, true, 0); +return ret; +} + #endif #endif diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c index 4fe67a3c5..afce6947a 100644 --- a/lib/dpif-netdev-private-dpif.c +++ b/lib/dpif-netdev-private-dpif.c @@ -42,8 +42,8 @@ static struct dpif_netdev_impl_info_t dpif_impls[] = { #if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__) /* Only available on x86_64 bit builds with SSE 4.2 used for OVS core. */ -[DPIF_NETDEV_IMPL_AVX512] = { .input_func = dp_netdev_input_outer_avx512, - .probe = dp_netdev_input_outer_avx512_probe, +[DPIF_NETDEV_IMPL_AVX512] = { .input_func = dp_netdev_input_avx512, + .probe = dp_netdev_input_avx512_probe, .name = "dpif_avx512", }, #endif }; diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h index 0e29a02db..0d18b748e 100644 --- a/lib/dpif-netdev-private-dpif.h +++ b/lib/dpif-netdev-private-dpif.h @@ -74,12 +74,16 @@ dp_netdev_input(struct dp_netdev_pmd_thread *pmd, /* AVX512 enabled DPIF implementation and probe functions. 
*/ int32_t -dp_netdev_input_outer_avx512_probe(void); +dp_netdev_input_avx512_probe(void); int32_t -dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, - struct dp_packet_batch *packets, - odp_port_t in_port); +dp_netdev_input_avx512(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + odp_port_t in_port); + +int32_t +dp_netdev_input_avx512_recirc(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets); int32_t dp_netdev_recirculate(struct dp_netdev_pmd_thread *, -- 2.25.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v1 00/10] DPIF + MFEX Inner Vxlan AVX512 Opts
This series of patchsets introduces the optimizations for supporting Vxlan tunneled packets in DPIF and MFEX. Along with the optimizations, various tests and refactoring of the scalar path are done so it can be used across implementations without duplication. Over the tests we have observed an approximate 20~25% gain in performance over the scalar path. Kumar Amber (10): dpif-netdev: Refactor recirc data allocation and hash fn. dpif-netdev-avx512: Refactor avx512 dpif and create new APIs. dpif-netdev: Add function pointer for dpif re-circulate. dpif-netdev-avx512: Add inner packet handling to dpif. dpif-mfex: Modify set/get mfex commands to include inner. dpif-mfex: Change mfex fn pointer prototype to include md_is_valid. mfex-study: Modify study func to select outer and inner mfex funcs. mfex-avx512: Add support for tunnel packets in avx512 mfex. dpif-avx512: Add mfex inner support to avx512 dpif. system-dpdk.at: Add DPIF test for ipv4 vxlan packet types. Documentation/topics/dpdk/bridge.rst | 19 ++-- lib/dpif-netdev-avx512.c | 62 ++--- lib/dpif-netdev-extract-avx512.c | 54 --- lib/dpif-netdev-extract-study.c | 132 ++- lib/dpif-netdev-private-dpcls.h | 23 + lib/dpif-netdev-private-dpif.c | 59 ++-- lib/dpif-netdev-private-dpif.h | 35 ++- lib/dpif-netdev-private-extract.c| 35 ++- lib/dpif-netdev-private-extract.h| 21 +++-- lib/dpif-netdev-private-thread.h | 6 ++ lib/dpif-netdev.c| 72 --- tests/automake.mk| 1 + tests/pcap/dpif_vxlan.pcap | Bin 0 -> 488 bytes tests/system-dpdk.at | 76 +++ 14 files changed, 465 insertions(+), 130 deletions(-) create mode 100644 tests/pcap/dpif_vxlan.pcap -- 2.25.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] dp-packet: Allow DPDK packet resize.
Hi, do you have future plan to support mbuf chaining for dp-packet? as I see you've actually relax some check on dp-packet in this patch. thanks David Marchand 于2022年3月18日周五 23:34写道: > DPDK based dp-packets points to data buffers that can't be expanded > dynamically. > Their layout is as follows: > - a 128 bytes headroom chosen at DPDK build time (RTE_PKTMBUF_HEADROOM), > - a maximum size chosen at mempool creation, > > In some usecases though (like encapsulating with multiple tunnels), > a 128 bytes headroom is too short. > > Dynamically allocate buffers in DPDK memory and make use of DPDK > external buffers API (previously used for userspace TSO). > > Signed-off-by: David Marchand > --- > lib/dp-packet.c | 17 - > lib/netdev-dpdk.c | 47 +++ > lib/netdev-dpdk.h | 3 +++ > 3 files changed, 54 insertions(+), 13 deletions(-) > > diff --git a/lib/dp-packet.c b/lib/dp-packet.c > index 35c72542a2..07fa67b1a1 100644 > --- a/lib/dp-packet.c > +++ b/lib/dp-packet.c > @@ -250,8 +250,23 @@ dp_packet_resize(struct dp_packet *b, size_t > new_headroom, size_t new_tailroom) > new_allocated = new_headroom + dp_packet_size(b) + new_tailroom; > > switch (b->source) { > -case DPBUF_DPDK: > +case DPBUF_DPDK: { > +#ifdef DPDK_NETDEV > +uint32_t buf_len; > + > +buf_len = new_allocated; > +new_base = netdev_dpdk_extbuf_allocate(&buf_len); > +if (!new_base) { > +out_of_memory(); > +} > +ovs_assert(buf_len <= UINT16_MAX); > +dp_packet_copy__(b, new_base, new_headroom, new_tailroom); > +netdev_dpdk_extbuf_replace(b, new_base, buf_len); > +break; > +#else > OVS_NOT_REACHED(); > +#endif > +} > > case DPBUF_MALLOC: > if (new_headroom == dp_packet_headroom(b)) { > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index fbc3b42d84..47e16f22c5 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -2646,41 +2646,64 @@ out: > } > } > > +void * > +netdev_dpdk_extbuf_allocate(uint32_t *data_len) > +{ > +*data_len += sizeof(struct rte_mbuf_ext_shared_info) + > sizeof(uintptr_t); > 
+*data_len = RTE_ALIGN_CEIL(*data_len, sizeof(uintptr_t)); > +return rte_malloc(NULL, *data_len, RTE_CACHE_LINE_SIZE); > +} > + > static void > netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque) > { > rte_free(opaque); > } > > +void > +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t > data_len) > +{ > +struct rte_mbuf *pkt = (struct rte_mbuf *) b; > +struct rte_mbuf_ext_shared_info *shinfo; > +uint16_t buf_len = data_len; > + > +shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len, > +netdev_dpdk_extbuf_free, > +buf); > +ovs_assert(shinfo != NULL); > + > +if (RTE_MBUF_HAS_EXTBUF(pkt)) { > +rte_pktmbuf_detach_extbuf(pkt); > +} > +rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), > buf_len, > + shinfo); > +} > + > static struct rte_mbuf * > dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len) > { > uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len; > -struct rte_mbuf_ext_shared_info *shinfo = NULL; > +struct rte_mbuf_ext_shared_info *shinfo; > uint16_t buf_len; > void *buf; > > -total_len += sizeof *shinfo + sizeof(uintptr_t); > -total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t)); > - > +buf = netdev_dpdk_extbuf_allocate(&total_len); > +if (OVS_UNLIKELY(buf == NULL)) { > +VLOG_ERR("Failed to allocate memory using rte_malloc: %u", > total_len); > +return NULL; > +} > if (OVS_UNLIKELY(total_len > UINT16_MAX)) { > +netdev_dpdk_extbuf_free(NULL, buf); > VLOG_ERR("Can't copy packet: too big %u", total_len); > return NULL; > } > > buf_len = total_len; > -buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE); > -if (OVS_UNLIKELY(buf == NULL)) { > -VLOG_ERR("Failed to allocate memory using rte_malloc: %u", > buf_len); > -return NULL; > -} > - > -/* Initialize shinfo. 
*/ > shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len, > netdev_dpdk_extbuf_free, > buf); > if (OVS_UNLIKELY(shinfo == NULL)) { > -rte_free(buf); > +netdev_dpdk_extbuf_free(NULL, buf); > VLOG_ERR("Failed to initialize shared info for mbuf while " > "attempting to attach an external buffer."); > return NULL; > diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h > index 699be3fb41..95594f07fb 100644 > --- a/lib/netdev-dpdk.h > +++ b/lib/netdev-dpdk.h > @@ -31,6 +31,9 @@ struct netdev; > void netdev_dpdk_register(void); > void free_dpdk_buf(struct dp
[ovs-dev] [PATCH] python: idl: set cond_changed to false if last id is zero
After reconnection, cond_changed will be set to true; poll will then be called and never block, causing permanently high CPU load. Signed-off-by: Wentao Jia --- python/ovs/db/idl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 4ecdcaa19..166fa38e6 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -356,6 +356,9 @@ class Idl(object): flushing the local cached DB contents. """ ack_all = self.last_id == str(uuid.UUID(int=0)) +if ack_all: +self.cond_changed = False + for table in self.tables.values(): if ack_all: table.condition.request() -- 2.32.0 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v20 5/8] dpif-offload-netlink: Implement dpif-offload-provider API
On 2022-03-18 7:55 PM, Eelco Chaudron wrote: On 17 Mar 2022, at 2:01, Chris Mi wrote: On 2022-03-11 8:53 PM, Eelco Chaudron wrote: @@ -449,6 +462,7 @@ dpif_close(struct dpif *dpif) if (dpif) { struct registered_dpif_class *rc; +dpif_offload_close(dpif); ** Not sure I understand, but why are we destroying the offload dpif class here, it can be used by another dpif type. ** I guess this is all because your design has a 1:1 mapping? Guess it should be two dpif_types that could share the same offload class type. Now it is moved to dpif_netlink_close(). Except the 1:1 mapping comment which I think need Ilya's feedback, I have addressed your other comments. Thanks for your comments. The dpif-offload for dummy is not needed and removed. If needed, I can send v21. Thanks for taking care of the questions and fixing them in your sandbox. I would prefer for you to not send any more revisions until we have a clear answer from Ilya. Since Ilya didn't reply, I'll send a new version to reflect the latest change. Well, my goal was to not do any more reviews until Ilya would reply, as every review cycle takes up quite some time. OK, hopefully Ilya could review it soon. However I could not get v21 to apply cleanly, so I will hold off on any further reviews of this series until we have a clear direction from Ilya. The reason is because we thought patch "tc: Keep header rewrite actions order" should be merged first. So this series is rebased on that. -Chris //Eelco ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v3 5/6] conntrack: Use an atomic conn expiration value
Bleep bloop. Greetings Paolo Valerio, I am a robot and I have tried out your patch. Thanks for your contribution. I encountered some error that I wasn't expecting. See the details below. checkpatch: WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Paolo Valerio Lines checked: 138, Warnings: 1, Errors: 0 Please check this out. If you feel there has been an error, please email acon...@redhat.com Thanks, 0-day Robot ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v3 3/6] conntrack: Replaces nat_conn introducing key directionality.
Bleep bloop. Greetings Paolo Valerio, I am a robot and I have tried out your patch. Thanks for your contribution. I encountered some error that I wasn't expecting. See the details below. checkpatch: WARNING: Line is 81 characters long (recommended limit is 79) #1137 FILE: lib/conntrack.c:3277: &conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv6, Lines checked: 1162, Warnings: 1, Errors: 0 Please check this out. If you feel there has been an error, please email acon...@redhat.com Thanks, 0-day Robot ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v3 2/6] conntrack-tp: Use a cmap to store timeout policies
Bleep bloop. Greetings Paolo Valerio, I am a robot and I have tried out your patch. Thanks for your contribution. I encountered some error that I wasn't expecting. See the details below. checkpatch: WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Paolo Valerio Lines checked: 202, Warnings: 1, Errors: 0 Please check this out. If you feel there has been an error, please email acon...@redhat.com Thanks, 0-day Robot ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v3 1/6] conntrack: Use a cmap to store zone limits
Bleep bloop. Greetings Paolo Valerio, I am a robot and I have tried out your patch. Thanks for your contribution. I encountered some error that I wasn't expecting. See the details below. checkpatch: WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Paolo Valerio Lines checked: 272, Warnings: 1, Errors: 0 Please check this out. If you feel there has been an error, please email acon...@redhat.com Thanks, 0-day Robot ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH RFC v3 6/6] conntrack: Make ovs-appctl dpctl/ct-bkts work with multiple buckets
Without this patch "ovs-appctl dpctl/ct-bkts" produces the following output: Total Buckets: 1 Current Connections: 10246 +---+-+ | Buckets | Connections per Buckets | +---+-+ 0.. 7 | 10246 with this patch applied, the output becomes: Total Buckets: 1024 Current Connections: 95956 +---+-+ | Buckets | Connections per Buckets | +---+-+ 0.. 7 |87 100 90 91 92 92 101 83 8.. 15 |98 86 80 91 92 93 92 84 16.. 23 |82 114 103 90 94 80 95 96 ... 1000..1007 |98 88 106 100 91 99 89 81 1008..1015 |97 99 93 67 102 97 89 86 1016..1023 | 113 79 97 86 106 93 80 90 Signed-off-by: Paolo Valerio --- lib/conntrack.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 99cec60fa..b7990588f 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2796,7 +2796,7 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone, static void conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, - long long now) + long long now, unsigned int bkt) { const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key, *rev_key = &conn->key_node[CT_DIR_REV].key; @@ -2820,6 +2820,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, ovs_mutex_unlock(&conn->lock); entry->timeout = (expiration > 0) ? expiration / 1000 : 0; +entry->bkt = bkt; if (conn->alg) { /* Caller is responsible for freeing. */ @@ -2845,7 +2846,7 @@ conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump, } dump->ct = ct; -*ptot_bkts = 1; /* Need to clean up the callers. */ +*ptot_bkts = CONNTRACK_BUCKETS; return 0; } @@ -2877,7 +2878,7 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) if ((!dump->filter_zone || keyn->key.zone == dump->zone) && (keyn->key.dir == CT_DIR_FWD)) { -conn_to_ct_dpif_entry(conn, entry, now); +conn_to_ct_dpif_entry(conn, entry, now, dump->bucket); break; } } ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v3 5/6] conntrack: Use an atomic conn expiration value
From: Gaetan Rivet A lock is taken during conn_lookup() to check whether a connection is expired before returning it. This lock can have some contention. Even though this lock ensures a consistent sequence of writes, it does not imply a specific order. A ct_clean thread taking the lock first could read a value that would be updated immediately after by a PMD waiting on the same lock, just as well as the inverse order. As such, the expiration time can be stale anytime it is read. In this context, using an atomic will ensure the same guarantees for either writes or reads, i.e. writes are consistent and reads are not undefined behaviour. Reading an atomic is however less costly than taking and releasing a lock. Signed-off-by: Gaetan Rivet Reviewed-by: Eli Britstein Acked-by: William Tu Signed-off-by: Paolo Valerio --- lib/conntrack-private.h |2 +- lib/conntrack-tp.c |2 +- lib/conntrack.c | 27 +++ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index a89ff96fa..a7abe158a 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -110,7 +110,7 @@ struct conn { /* Mutable data. */ struct ovs_mutex lock; /* Guards all mutable fields. */ ovs_u128 label; -long long expiration; +atomic_llong expiration; uint32_t mark; int seq_skew; diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index 117810528..cdb3639de 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -257,7 +257,7 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn, ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, conn->tp_id, val); -conn->expiration = now + val * 1000; +atomic_store_relaxed(&conn->expiration, now + val * 1000); } /* ct_lock must be held. 
*/ diff --git a/lib/conntrack.c b/lib/conntrack.c index c7a96ea59..99cec60fa 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -102,6 +102,7 @@ static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, long long now); +static long long int conn_expiration(const struct conn *conn); static bool conn_expired(struct conn *, long long now); static void set_mark(struct dp_packet *, struct conn *, uint32_t val, uint32_t mask); @@ -579,7 +580,6 @@ conn_key_lookup__(struct conntrack *ct, unsigned bucket, conn_clean(ct, conn); continue; } - for (int i = CT_DIR_FWD; i < CT_DIR_MAX; i++) { if (!conn_key_cmp(&conn->key_node[i].key, key)) { found = true; @@ -1063,13 +1063,10 @@ un_nat_packet(struct dp_packet *pkt, const struct conn *conn, static void conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in, long long now, int seq_skew, bool seq_skew_dir) -OVS_NO_THREAD_SAFETY_ANALYSIS { struct conn *conn; -ovs_mutex_unlock(&conn_in->lock); -conn_lookup_gc(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL); -ovs_mutex_lock(&conn_in->lock); +conn_lookup_gc(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL); if (conn && seq_skew) { conn->seq_skew = seq_skew; conn->seq_skew_dir = seq_skew_dir; @@ -1661,9 +1658,7 @@ sweep_bucket(struct conntrack *ct, struct ct_bucket *bucket, } conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->key.dir]); -ovs_mutex_lock(&conn->lock); -expiration = conn->expiration; -ovs_mutex_unlock(&conn->lock); +expiration = conn_expiration(conn); if (now >= expiration) { conn_clean(ct, conn); @@ -2671,12 +2666,20 @@ conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt, return update_res; } +static long long int +conn_expiration(const struct conn *conn) +{ +long long int expiration; + +atomic_read_relaxed(&CONST_CAST(struct conn *, conn)->expiration, +&expiration); +return expiration; +} + static bool conn_expired(struct conn *conn, long 
long now) { -ovs_mutex_lock(&conn->lock); -bool expired = now >= conn->expiration ? true : false; -ovs_mutex_unlock(&conn->lock); +bool expired = now >= conn_expiration(conn) ? true : false; return expired; } @@ -2808,7 +2811,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, entry->mark = conn->mark; memcpy(&entry->labels, &conn->label, sizeof entry->labels); -long long expiration = conn->expiration - now; +long long expiration = conn_expiration(conn) - now; struct ct_l4_proto *class = l4_protos[key->nw_proto]; if (class->conn_get_protoinfo) { ___ dev mailing lis
[ovs-dev] [PATCH RFC v3 4/6] conntrack: Split single cmap to multiple buckets.
The purpose of this commit is to split the current way of storing the conn nodes. Before this patch the nodes were stored into a single cmap using ct->lock to avoid concurrent write access. With this commit a single connection can be stored into one or two (at most) CONNTRACK_BUCKETS available based on the outcome of the function hash_scale() on the key. Every bucket has its local lock that needs to be acquired every time a node has to be removed/inserted from/to the cmap. This means that, in case the hash of the CT_DIR_FWD key differs from the one of the CT_DIR_REV, we can end up having the reference of the two key nodes in different buckets, and consequently acquiring two locks (one per bucket). This approach may be handy in different ways, depending on the way the stale connection removal gets designed. The attempt of this patch is to remove the expiration lists, removing the stale entries mostly in two ways: - during the key lookup - when the sweeper task wakes up the first case is not very strict, as we remove only expired entries with the same hash. To increase its effectiveness, we should probably increase the number of buckets and replace the cmaps with other data structures like rcu lists. The sweeper task instead takes charge of the remaining stale entries removal. The heuristics used in the sweeper task are mostly an example, but could be modified to match any possible uncovered use case. 
Signed-off-by: Paolo Valerio --- RFC v2: - removed reference to conntrack_long_cleanup - removed cached_key_hash() - turned recursive locks into adaptive - removed redundant modulo - renamed buckets_{lock,unlock} to ct_buckets_{lock,unlock} - rearranged buckets_unlock conditionals - added zone limit eviction handling (in case the zone is full) during connection creation RFC v3: - fixed some of the long lines warning missed in v2 - removed OVS_{ACQUIRES,RELEASES} directives as they were an unintentional leftover of a previous attempt to keep the thread safe analisys in ct_buckets_{lock,unlock}. While at it, replaced them with OVS_NO_THREAD_SAFETY_ANALYSIS: https://clang.llvm.org/docs/ThreadSafetyAnalysis.html#conditional-locks --- lib/conntrack-private.h | 34 ++- lib/conntrack-tp.c | 42 --- lib/conntrack.c | 571 +-- lib/conntrack.h |2 tests/system-traffic.at |5 5 files changed, 427 insertions(+), 227 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index ea5ba3d9e..a89ff96fa 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -95,6 +95,7 @@ struct alg_exp_node { struct conn_key_node { struct conn_key key; +uint32_t key_hash; struct cmap_node cm_node; }; @@ -102,7 +103,6 @@ struct conn { /* Immutable data. */ struct conn_key_node key_node[CT_DIR_MAX]; struct conn_key parent_key; /* Only used for orig_tuple support. */ -struct ovs_list exp_node; uint16_t nat_action; char *alg; @@ -121,7 +121,9 @@ struct conn { /* Mutable data. */ bool seq_skew_dir; /* TCP sequence skew direction due to NATTing of FTP * control messages; true if reply direction. */ -bool cleaned; /* True if cleaned from expiry lists. */ +atomic_flag cleaned; /* True if the entry was stale and one of the + * cleaner (i.e. packet path or sweeper) took + * charge of it. */ /* Immutable data. */ bool alg_related; /* True if alg data connection. 
*/ @@ -192,10 +194,25 @@ enum ct_timeout { N_CT_TM }; -struct conntrack { -struct ovs_mutex ct_lock; /* Protects 2 following fields. */ +#define CONNTRACK_BUCKETS_SHIFT 10 +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) + +struct ct_bucket { +/* Protects 'conns'. In case of natted conns, there's a high + * chance that the forward and the reverse key stand in different + * buckets. buckets_lock() should be the preferred way to acquire + * these locks (unless otherwise needed), as it deals with the + * acquisition order. */ +struct ovs_mutex lock; +/* Contains the connections in the bucket, indexed by + * 'struct conn_key'. */ struct cmap conns OVS_GUARDED; -struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; +}; + +struct conntrack { +struct ct_bucket buckets[CONNTRACK_BUCKETS]; +unsigned int next_bucket; +struct ovs_mutex ct_lock; struct cmap zone_limits OVS_GUARDED; struct cmap timeout_policies OVS_GUARDED; uint32_t hash_basis; /* Salt for hashing a connection key. */ @@ -220,9 +237,10 @@ struct conntrack { }; /* Lock acquisition order: - *1. 'ct_lock' - *2. 'conn->lock' - *3. 'resources_lock' + *1. 'buckets[p1]->lock' + *2 'buckets[p2]->lock' (with p1 < p2) + *3. 'conn->lock' + *4. 'resources_lock' */ extern struct ct_l4_proto ct_proto_tcp; diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.
[ovs-dev] [PATCH v3 3/6] conntrack: Replaces nat_conn introducing key directionality.
From: Peng He Currently, when doing NAT, the userspace conntrack will use an extra conn for the two directions in a flow. However, each conn has actually the two keys for both orig and rev directions. This patch introduces a key_node[CT_DIR_MAX] member in the conn which consists of key and a cmap_node for hash lookup. Both keys can now be accessed in the following way: conn->key_node[CT_DIR_{FWD,REV}].key similarly to what Aaron Conole suggested. This patch avoids the extra allocation for nat_conn, and makes userspace code cleaner. Signed-off-by: Peng He Co-authored-by: Paolo Valerio Signed-off-by: Paolo Valerio Reviewed-by: Gaetan Rivet Acked-by: Aaron Conole --- This patch got posted separately: https://patchwork.ozlabs.org/project/openvswitch/patch/164665111586.3858280.2879577107463178457.st...@fed.void/ but it's been included as part of this series to ease the review process as the next patch partly depends on it. --- lib/conntrack-private.h | 20 +- lib/conntrack-tp.c |6 - lib/conntrack.c | 507 --- 3 files changed, 233 insertions(+), 300 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index 34c688821..ea5ba3d9e 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -48,9 +48,16 @@ struct ct_endpoint { * hashing in ct_endpoint_hash_add(). */ BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4); +enum key_dir { +CT_DIR_FWD = 0, +CT_DIR_REV, +CT_DIR_MAX, +}; + /* Changes to this structure need to be reflected in conn_key_hash() * and conn_key_cmp(). */ struct conn_key { +enum key_dir dir; struct ct_endpoint src; struct ct_endpoint dst; @@ -86,21 +93,19 @@ struct alg_exp_node { bool nat_rpl_dst; }; -enum OVS_PACKED_ENUM ct_conn_type { -CT_CONN_TYPE_DEFAULT, -CT_CONN_TYPE_UN_NAT, +struct conn_key_node { +struct conn_key key; +struct cmap_node cm_node; }; struct conn { /* Immutable data. 
*/ -struct conn_key key; -struct conn_key rev_key; +struct conn_key_node key_node[CT_DIR_MAX]; struct conn_key parent_key; /* Only used for orig_tuple support. */ struct ovs_list exp_node; -struct cmap_node cm_node; + uint16_t nat_action; char *alg; -struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */ /* Mutable data. */ struct ovs_mutex lock; /* Guards all mutable fields. */ @@ -120,7 +125,6 @@ struct conn { /* Immutable data. */ bool alg_related; /* True if alg data connection. */ -enum ct_conn_type conn_type; uint32_t tp_id; /* Timeout policy ID. */ }; diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index c2245038b..9ecb06978 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -282,7 +282,8 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn, ovs_mutex_lock(&conn->lock); VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d " "val=%u sec.", -ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); +ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, +conn->tp_id, val); conn_update_expiration__(ct, conn, tm, now, val); } @@ -313,7 +314,8 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn, } VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.", -ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); +ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, +conn->tp_id, val); conn_init_expiration__(ct, conn, tm, now, val); } diff --git a/lib/conntrack.c b/lib/conntrack.c index 13a1dd519..b5bab3cc1 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -96,7 +96,6 @@ static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt, uint32_t tp_id); static void delete_conn_cmn(struct conn *); static void delete_conn(struct conn *); -static void delete_conn_one(struct conn *conn); static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, @@ -110,8 +109,7 @@ static void set_label(struct dp_packet *, 
struct conn *, static void *clean_thread_main(void *f_); static bool -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, - struct conn *nat_conn, +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, const struct nat_action_info_t *nat_info); static uint8_t @@ -231,61 +229,6 @@ conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2) return 1; } -static void -ct_print_conn_info(const struct conn *c, const char *log_msg, - enum vlog_level vll, bool force, bool rl_on) -{ -#define CT_VLOG(RL_ON, LEVEL, ...)
[ovs-dev] [PATCH v3 2/6] conntrack-tp: Use a cmap to store timeout policies
From: Gaetan Rivet Multiple lookups are done to stored timeout policies, each time blocking the global 'ct_lock'. This is usually not necessary and it should be acceptable to get policy updates slightly delayed (by one RCU sync at most). Using a CMAP reduces multiple lock taking and releasing in the connection insertion path. Signed-off-by: Gaetan Rivet Reviewed-by: Eli Britstein Acked-by: William Tu Signed-off-by: Paolo Valerio --- lib/conntrack-private.h |2 +- lib/conntrack-tp.c | 54 ++- lib/conntrack.c |9 +--- lib/conntrack.h |2 +- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index d9461b811..34c688821 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -193,7 +193,7 @@ struct conntrack { struct cmap conns OVS_GUARDED; struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; struct cmap zone_limits OVS_GUARDED; -struct hmap timeout_policies OVS_GUARDED; +struct cmap timeout_policies OVS_GUARDED; uint32_t hash_basis; /* Salt for hashing a connection key. */ pthread_t clean_thread; /* Periodically cleans up connection tracker. */ struct latch clean_thread_exit; /* To destroy the 'clean_thread'. 
*/ diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index a586d3a8d..c2245038b 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -47,14 +47,15 @@ static unsigned int ct_dpif_netdev_tp_def[] = { }; static struct timeout_policy * -timeout_policy_lookup(struct conntrack *ct, int32_t tp_id) +timeout_policy_lookup_protected(struct conntrack *ct, int32_t tp_id) OVS_REQUIRES(ct->ct_lock) { struct timeout_policy *tp; uint32_t hash; hash = hash_int(tp_id, ct->hash_basis); -HMAP_FOR_EACH_IN_BUCKET (tp, node, hash, &ct->timeout_policies) { +CMAP_FOR_EACH_WITH_HASH_PROTECTED (tp, node, hash, + &ct->timeout_policies) { if (tp->policy.id == tp_id) { return tp; } @@ -62,20 +63,25 @@ timeout_policy_lookup(struct conntrack *ct, int32_t tp_id) return NULL; } -struct timeout_policy * -timeout_policy_get(struct conntrack *ct, int32_t tp_id) +static struct timeout_policy * +timeout_policy_lookup(struct conntrack *ct, int32_t tp_id) { struct timeout_policy *tp; +uint32_t hash; -ovs_mutex_lock(&ct->ct_lock); -tp = timeout_policy_lookup(ct, tp_id); -if (!tp) { -ovs_mutex_unlock(&ct->ct_lock); -return NULL; +hash = hash_int(tp_id, ct->hash_basis); +CMAP_FOR_EACH_WITH_HASH (tp, node, hash, &ct->timeout_policies) { +if (tp->policy.id == tp_id) { +return tp; +} } +return NULL; +} -ovs_mutex_unlock(&ct->ct_lock); -return tp; +struct timeout_policy * +timeout_policy_get(struct conntrack *ct, int32_t tp_id) +{ +return timeout_policy_lookup(ct, tp_id); } static void @@ -125,27 +131,30 @@ timeout_policy_create(struct conntrack *ct, init_default_tp(tp, tp_id); update_existing_tp(tp, new_tp); hash = hash_int(tp_id, ct->hash_basis); -hmap_insert(&ct->timeout_policies, &tp->node, hash); +cmap_insert(&ct->timeout_policies, &tp->node, hash); } static void timeout_policy_clean(struct conntrack *ct, struct timeout_policy *tp) OVS_REQUIRES(ct->ct_lock) { -hmap_remove(&ct->timeout_policies, &tp->node); -free(tp); +uint32_t hash = hash_int(tp->policy.id, ct->hash_basis); 
+cmap_remove(&ct->timeout_policies, &tp->node, hash); +ovsrcu_postpone(free, tp); } static int -timeout_policy_delete__(struct conntrack *ct, uint32_t tp_id) +timeout_policy_delete__(struct conntrack *ct, uint32_t tp_id, +bool warn_on_error) OVS_REQUIRES(ct->ct_lock) { +struct timeout_policy *tp; int err = 0; -struct timeout_policy *tp = timeout_policy_lookup(ct, tp_id); +tp = timeout_policy_lookup_protected(ct, tp_id); if (tp) { timeout_policy_clean(ct, tp); -} else { +} else if (warn_on_error) { VLOG_WARN_RL(&rl, "Failed to delete a non-existent timeout " "policy: id=%d", tp_id); err = ENOENT; @@ -159,7 +168,7 @@ timeout_policy_delete(struct conntrack *ct, uint32_t tp_id) int err; ovs_mutex_lock(&ct->ct_lock); -err = timeout_policy_delete__(ct, tp_id); +err = timeout_policy_delete__(ct, tp_id, true); ovs_mutex_unlock(&ct->ct_lock); return err; } @@ -170,7 +179,7 @@ timeout_policy_init(struct conntrack *ct) { struct timeout_policy tp; -hmap_init(&ct->timeout_policies); +cmap_init(&ct->timeout_policies); /* Create default timeout policy. */ memset(&tp, 0, sizeof tp); @@ -182,14 +191,11 @@ int timeout_policy_update(struct conntrack *ct, struct timeout_policy *new_tp) { -int err = 0; uint32_t tp_id = new
[ovs-dev] [PATCH v3 1/6] conntrack: Use a cmap to store zone limits
From: Gaetan Rivet Change the data structure from hmap to cmap for zone limits. As they are shared amongst multiple conntrack users, multiple readers want to check the current zone limit state before progressing in their processing. Using a CMAP allows doing lookups without taking the global 'ct_lock', thus reducing contention. Signed-off-by: Gaetan Rivet Reviewed-by: Eli Britstein Signed-off-by: Paolo Valerio --- lib/conntrack-private.h |2 + lib/conntrack.c | 70 --- lib/conntrack.h |2 + lib/dpif-netdev.c |5 ++- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index dfdf4e676..d9461b811 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -192,7 +192,7 @@ struct conntrack { struct ovs_mutex ct_lock; /* Protects 2 following fields. */ struct cmap conns OVS_GUARDED; struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; -struct hmap zone_limits OVS_GUARDED; +struct cmap zone_limits OVS_GUARDED; struct hmap timeout_policies OVS_GUARDED; uint32_t hash_basis; /* Salt for hashing a connection key. */ pthread_t clean_thread; /* Periodically cleans up connection tracker. 
*/ diff --git a/lib/conntrack.c b/lib/conntrack.c index 40690e5f0..bb605bac9 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -81,7 +81,7 @@ enum ct_alg_ctl_type { }; struct zone_limit { -struct hmap_node node; +struct cmap_node node; struct conntrack_zone_limit czl; }; @@ -311,7 +311,7 @@ conntrack_init(void) for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) { ovs_list_init(&ct->exp_lists[i]); } -hmap_init(&ct->zone_limits); +cmap_init(&ct->zone_limits); ct->zone_limit_seq = 0; timeout_policy_init(ct); ovs_mutex_unlock(&ct->ct_lock); @@ -346,12 +346,25 @@ zone_key_hash(int32_t zone, uint32_t basis) } static struct zone_limit * -zone_limit_lookup(struct conntrack *ct, int32_t zone) +zone_limit_lookup_protected(struct conntrack *ct, int32_t zone) OVS_REQUIRES(ct->ct_lock) { uint32_t hash = zone_key_hash(zone, ct->hash_basis); struct zone_limit *zl; -HMAP_FOR_EACH_IN_BUCKET (zl, node, hash, &ct->zone_limits) { +CMAP_FOR_EACH_WITH_HASH_PROTECTED (zl, node, hash, &ct->zone_limits) { +if (zl->czl.zone == zone) { +return zl; +} +} +return NULL; +} + +static struct zone_limit * +zone_limit_lookup(struct conntrack *ct, int32_t zone) +{ +uint32_t hash = zone_key_hash(zone, ct->hash_basis); +struct zone_limit *zl; +CMAP_FOR_EACH_WITH_HASH (zl, node, hash, &ct->zone_limits) { if (zl->czl.zone == zone) { return zl; } @@ -361,7 +374,6 @@ zone_limit_lookup(struct conntrack *ct, int32_t zone) static struct zone_limit * zone_limit_lookup_or_default(struct conntrack *ct, int32_t zone) -OVS_REQUIRES(ct->ct_lock) { struct zone_limit *zl = zone_limit_lookup(ct, zone); return zl ? 
zl : zone_limit_lookup(ct, DEFAULT_ZONE); @@ -370,13 +382,16 @@ zone_limit_lookup_or_default(struct conntrack *ct, int32_t zone) struct conntrack_zone_limit zone_limit_get(struct conntrack *ct, int32_t zone) { -ovs_mutex_lock(&ct->ct_lock); -struct conntrack_zone_limit czl = {DEFAULT_ZONE, 0, 0, 0}; +struct conntrack_zone_limit czl = { +.zone = DEFAULT_ZONE, +.limit = 0, +.count = ATOMIC_COUNT_INIT(0), +.zone_limit_seq = 0, +}; struct zone_limit *zl = zone_limit_lookup_or_default(ct, zone); if (zl) { czl = zl->czl; } -ovs_mutex_unlock(&ct->ct_lock); return czl; } @@ -384,13 +399,19 @@ static int zone_limit_create(struct conntrack *ct, int32_t zone, uint32_t limit) OVS_REQUIRES(ct->ct_lock) { +struct zone_limit *zl = zone_limit_lookup_protected(ct, zone); + +if (zl) { +return 0; +} + if (zone >= DEFAULT_ZONE && zone <= MAX_ZONE) { -struct zone_limit *zl = xzalloc(sizeof *zl); +zl = xzalloc(sizeof *zl); zl->czl.limit = limit; zl->czl.zone = zone; zl->czl.zone_limit_seq = ct->zone_limit_seq++; uint32_t hash = zone_key_hash(zone, ct->hash_basis); -hmap_insert(&ct->zone_limits, &zl->node, hash); +cmap_insert(&ct->zone_limits, &zl->node, hash); return 0; } else { return EINVAL; @@ -401,13 +422,14 @@ int zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit) { int err = 0; -ovs_mutex_lock(&ct->ct_lock); struct zone_limit *zl = zone_limit_lookup(ct, zone); if (zl) { zl->czl.limit = limit; VLOG_INFO("Changed zone limit of %u for zone %d", limit, zone); } else { +ovs_mutex_lock(&ct->ct_lock); err = zone_limit_create(ct, zone, limit); +ovs_mutex_unlock(&ct->ct_lock); if (!err) { VLOG_INFO("Crea
[ovs-dev] [PATCH RFC v3 0/6] conntrack: Introduce buckets and reduce contention.
This series aims to share the work done so far, and to start a discussion of a slightly different approach in terms of the way we handle the connections. It's not considered ready as further work and extensive tests are needed. These are some performance numbers obtained in the following way: ./tests/ovstest test-conntrack benchmark $i 16777216 32 1 $i baseline ct-scale ct-bucket 2 16480 ms 3527 ms3180 ms 4 37375 ms 4997 ms4477 ms 8 63239 ms 9692 ms8954 ms 16 131583 ms 19303 ms 18062 ms Both ct-scale and ct-bucket introduce a noticeable improvement in terms of performance. It's worth mentioning that the ct-scale, keeping the exp_lists, is more efficient during the sweeping. There are two main logical changes that the series tries to introduce. The first one is the reintroduction of buckets, the second one is the removal of the expiration lists. The first two patches are a prereq and were picked (untouched) from this series (by Gaetan): https://patchwork.ozlabs.org/project/openvswitch/list/?series=249027&state=* The third patch is a follow up of the following: https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/ Some logic has changed, but the overall idea remained the same. Additional details in the patch description. The fourth patch introduces the buckets and removes the expiration lists. The buckets got reintroduced as cmaps instead of hmaps, trying to narrow down the critical sections as well. An alternative approach may involve the replacement of cmaps with linked lists (without increasing the number of locks), increasing the number of buckets (it could involve the ability to change the buckets number when the user changes the connection limit). This could improve the stale conns eviction during the processing of the packet, but this is currently out of scope for this series (can be discussed, though). 
The insertion, namely the creation of the connection, can probably be improved (or at least an improvement can be evaluated), as, in case of nat, it acquires the bucket locks calling nat_get_unique_tuple_lock(). This is needed because we may need to know the reverse tuple before locking in order to acquire the locks in the right way to avoid ABBA deadlocks. Further details about locking may be found in patch #4 description. There's a known limitation, and it is related mostly to zone limit. Keeping the stale entries leads to a mismatch between the current number of per zone connections and the actual number of valid connections. This means that if we have a small zone limit number, we may wait a whole "next_wakeup" time before the entries get cleaned up (assuming we have candidates for removal). To avoid this, a zone clean up from the packet path has been added, but alternative approaches such as triggering a per zone cleanup when a threshold of connections has been reached (storing the connections per bucket and per zone during the creation) may still be viable. Expiration lists have been removed for the sake of evaluation, but they could be included and duplicated among buckets. In this way we could distribute the contention (as a matter of fact reducing the number of connections per list) that affects the write access to a single data structure during every connection update. Patch #6 reintegrates the command below with multiple buckets: ovs-appctl dpctl/ct-bkts A couple of minor patches have been kept out of the series, for the time being, as they strongly depend on #4. Gaetan Rivet (3): conntrack: Use a cmap to store zone limits conntrack-tp: Use a cmap to store timeout policies conntrack: Use an atomic conn expiration value Paolo Valerio (2): conntrack: Split single cmap to multiple buckets. conntrack: Make ovs-appctl dpctl/ct-bkts work with multiple buckets Peng He (1): conntrack: Replaces nat_conn introducing key directionality. 
lib/conntrack-private.h | 60 ++- lib/conntrack-tp.c | 102 ++-- lib/conntrack.c | 1061 +++ lib/conntrack.h |6 +- lib/dpif-netdev.c |5 +- tests/system-traffic.at |5 +- 6 files changed, 706 insertions(+), 533 deletions(-) ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] conntrack: Support packets/bytes stats
Hello Yifeng, thanks for the patch. Yifeng Sun writes: > Userspace conntrack doesn't support conntrack stats for packets and > bytes. This patch implements it. > > Signed-off-by: Yifeng Sun > --- > lib/conntrack-private.h | 9 + > lib/conntrack.c | 28 > tests/system-common-macros.at | 2 +- > tests/system-traffic.at | 30 ++ > 4 files changed, 68 insertions(+), 1 deletion(-) > > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > index dfdf4e676..7f21d3772 100644 > --- a/lib/conntrack-private.h > +++ b/lib/conntrack-private.h > @@ -91,6 +91,11 @@ enum OVS_PACKED_ENUM ct_conn_type { > CT_CONN_TYPE_UN_NAT, > }; > > +struct conn_counter { > +atomic_uint64_t packets; > +atomic_uint64_t bytes; > +}; > + > struct conn { > /* Immutable data. */ > struct conn_key key; > @@ -123,6 +128,10 @@ struct conn { > enum ct_conn_type conn_type; > > uint32_t tp_id; /* Timeout policy ID. */ > + > +/* Counters. */ > +struct conn_counter counters_orig; > +struct conn_counter counters_reply; > }; > > enum ct_update_res { > diff --git a/lib/conntrack.c b/lib/conntrack.c > index 33a1a9295..177154cd8 100644 > --- a/lib/conntrack.c > +++ b/lib/conntrack.c > @@ -1245,6 +1245,21 @@ conn_update_state_alg(struct conntrack *ct, struct > dp_packet *pkt, > return false; > } > > +static void > +conn_update_counters(struct conn *conn, > + const struct dp_packet *pkt, bool reply) > +{ > +if (conn) { > +struct conn_counter *counter = (reply > + ? 
&conn->counters_reply > + : &conn->counters_orig); > +uint64_t old; > + > +atomic_count_inc64(&counter->packets); > +atomic_add(&counter->bytes, dp_packet_size(pkt), &old); > +} > +} > + > static void > set_cached_conn(const struct nat_action_info_t *nat_action_info, > const struct conn_lookup_ctx *ctx, struct conn *conn, > @@ -1283,6 +1298,8 @@ process_one_fast(uint16_t zone, const uint32_t *setmark, > if (setlabel) { > set_label(pkt, conn, &setlabel[0], &setlabel[1]); > } > + > +conn_update_counters(conn, pkt, pkt->md.reply); > } > > static void > @@ -1420,6 +1437,8 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, > set_label(pkt, conn, &setlabel[0], &setlabel[1]); > } > > +conn_update_counters(conn, pkt, ctx->reply); > + > handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info); > > set_cached_conn(nat_action_info, ctx, conn, pkt); > @@ -2641,6 +2660,15 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct > ct_dpif_entry *entry, > } > ovs_mutex_unlock(&conn->lock); > > +entry->counters_orig.packets = atomic_count_get64( > +(atomic_uint64_t *)&conn->counters_orig.packets); > +entry->counters_orig.bytes = atomic_count_get64( > +(atomic_uint64_t *)&conn->counters_orig.bytes); > +entry->counters_reply.packets = atomic_count_get64( > +(atomic_uint64_t *)&conn->counters_reply.packets); > +entry->counters_reply.bytes = atomic_count_get64( > +(atomic_uint64_t *)&conn->counters_reply.bytes); > + > entry->timeout = (expiration > 0) ? expiration / 1000 : 0; > > if (conn->alg) { > diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at > index 19a0b125b..89cd7b83c 100644 > --- a/tests/system-common-macros.at > +++ b/tests/system-common-macros.at > @@ -240,7 +240,7 @@ m4_define([STRIP_MONITOR_CSUM], [grep "csum:" | sed > 's/csum:.*/csum: /']) > # and limit the output to the rows containing 'ip-addr'. 
> # > m4_define([FORMAT_CT], > -[[grep "dst=$1" | sed -e 's/port=[0-9]*/port=/g' -e > 's/id=[0-9]*/id=/g' -e 's/state=[0-9_A-Z]*/state=/g' | sort > | uniq]]) > +[[grep "dst=$1" | sed -e 's/port=[0-9]*/port=/g' -e > 's/id=[0-9]*/id=/g' -e 's/state=[0-9_A-Z]*/state=/g' -e > 's/timeout=[0-9]*/timeout=/g' | sort | uniq]]) > > # NETNS_DAEMONIZE([namespace], [command], [pidfile]) > # > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > index f22d86e46..15b2c288c 100644 > --- a/tests/system-traffic.at > +++ b/tests/system-traffic.at > @@ -6743,6 +6743,36 @@ AT_CHECK([ovs-ofctl dump-flows br0 | grep table=2, | > OFPROTO_CLEAR_DURATION_IDLE > OVS_TRAFFIC_VSWITCHD_STOP > AT_CLEANUP > > +AT_SETUP([conntrack - stats]) nit: maybe we can use a more descriptive message in terms of what the test aims to check, like: "Stats with packets and bytes counters" anything else you think it would best describe the test will work. > +CHECK_CONNTRACK() > +OVS_TRAFFIC_VSWITCHD_START() > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") > + > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority
Re: [ovs-dev] IPsec/test: skip the test if tcpdump is not installed
Mohammad Heib writes: > IPsec unit tests uses tcpdump to capture and validate the ESP > traffic so the test must be skipped in environment that don't > have the tcpdump tool installed. > > Signed-off-by: Mohammad Heib > --- > tests/system-ipsec.at | 7 --- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at > index f45a153ed..888f79120 100644 > --- a/tests/system-ipsec.at > +++ b/tests/system-ipsec.at > @@ -99,9 +99,10 @@ dnl Check if necessary Libreswan dependencies are > available on the test machine > m4_define([CHECK_LIBRESWAN], >[dnl Skip tests if system has not been set up for Libreswan >AT_SKIP_IF([!(ipsec --version | grep Libreswan)]) > - AT_SKIP_IF([test ! -x $(which certutil)]) > - AT_SKIP_IF([test ! -x $(which pk12util)]) > - AT_SKIP_IF([test ! -x $(which openssl)]) > + AT_SKIP_IF([test ! $(which certutil)]) > + AT_SKIP_IF([test ! $(which pk12util)]) > + AT_SKIP_IF([test ! $(which openssl)]) > + AT_SKIP_IF([test ! $(which tcpdump)]) Since you're changing this block anyway, prefer: AT_SKIP_IF([test $HAVE_CERTUTIL = no]) AT_SKIP_IF([test $HAVE_PK12UTIL = no]) AT_SKIP_IF([test $HAVE_OPENSSL = no]) AT_SKIP_IF([test $HAVE_TCPDUMP = no]) and updating atlocal.in to run: find_command certutil find_command pk12util find_command openssl >dnl If '$ovs_base' is too long, the following Libreswan issue will > trigger >dnl so we check that it is not too long and skip test if it is. >dnl https://github.com/libreswan/libreswan/issues/428 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev