This commit refactors the generic implementation. The goal of this refactor is to simplify the code to enable "specialization" of the functions at compile time.
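As an illustration of what such compile-time specialization looks like, here is a minimal standalone sketch (the helpers flatten_blocks() and flatten_blocks_4() are invented for this example and are not code from this patch): a wrapper that passes a constant trip count to an inline function lets the compiler fully unroll the loop inside it.

    #include <stdint.h>

    /* Generic implementation: when 'count' is only known at run time, the
     * compiler has to keep this as a counted loop. */
    static inline void
    flatten_blocks(const uint64_t *src, const uint64_t *mask,
                   uint64_t *dst, uint32_t count)
    {
        for (uint32_t i = 0; i < count; i++) {
            dst[i] = src[i] & mask[i];
        }
    }

    /* Specialized wrapper: the constant 4 propagates into the inlined loop
     * above, allowing the compiler to unroll it completely. */
    static void
    flatten_blocks_4(const uint64_t *src, const uint64_t *mask, uint64_t *dst)
    {
        flatten_blocks(src, mask, dst, 4);
    }

The same idea is applied in later patches of this series to the per-unit miniflow bit counts.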
Given compile-time knowledge of loop-trip counts, the compiler is able to
unroll loops and create optimized code sequences. To enable these compiler
optimizations, we must refactor the code to pass the loop-trip counts to
functions as compile-time constants. This patch allows the number of miniflow
bits set per "unit" in the miniflow to be passed around as a function
argument. Note that this patch does NOT yet take advantage of doing so; it is
only a refactor to enable it in the next patches.

Signed-off-by: Harry van Haaren <harry.van.haa...@intel.com>
Tested-by: Malvika Gupta <malvika.gu...@arm.com>
---
v14:
- Rework blocks array to use DEFINE_PER_THREAD_MALLOCED_DATA() which is capable of correct cleanup when a thread exits. (Ilya)
- Pull out block allocation and resizing to separate function (Ilya)
- Various smaller fixups (PRIu32, xrealloc() instead of manual, etc.) (Ilya)

v13:
- Moved blocks scratch array to thread local storage. This encapsulates the blocks array better inside the implementation where it is required, and avoids bleeding the details to the dpcls or PMD level. Thanks Ilya for suggesting the DEFINE_STATIC_PER_THREAD_DATA method.
- Removed blocks scratch array from struct dpcls
- Removed blocks_scratch parameter from lookup_func prototype

v12:
- Fix capitalization and full stops (Ilya)
- Fixed typos (Ilya)
- Added mf_bits and mf_masks in this patch (Ilya)
- Fixed rebase conflicts

v11:
- Rebased to previous changes
- Fix typo in commit message (Ian)
- Fix variable declaration spacing (Ian)
- Remove function names from comments (Ian)
- Replace magic 8 with sizeof(uint64_t) (Ian)
- Capitalize and end comments with a full stop. (Ian/Ilya)
- Add build time assert to validate FLOWMAP_UNITS (Ilya)
- Add note on ALWAYS_INLINE operation
- Add space after ULLONG_FOR_EACH_1 (Ilya)
- Use hash_add_words64() instead of rolling own loop (Ilya). Note that hash_words64_inline() calls hash_finish() with a fixed value, so it was not the right hash function for this usage. Used hash_add_words64() and manual hash_finish() to re-use as much of the hashing code as we can.

v10:
- Rebase updates from previous patches
- Fix whitespace indentation of func params
- Removed restrict keyword, Windows CI failing when it is used (Ian)
- Fix integer 0 used to set NULL pointer (Ilya)
- Postpone free() call on cls->blocks_scratch (Ilya)
- Fix indentation of a function

v9:
- Use count_1bits in favour of __builtin_popcount (Ilya)
- Use ALWAYS_INLINE instead of __attribute__ syntax (Ilya)

v8:
- Rework block_cache and mf_masks to avoid variable-length array due to compiler issues. Provisioning for worst case is not a good solution due to magnitude of over-provisioning required.
- Rework netdev_flatten function removing unused parameter
---
 lib/dpif-netdev-lookup-generic.c | 232 ++++++++++++++++++++++++++-----
 lib/dpif-netdev-private.h        |  12 +-
 lib/dpif-netdev.c                |  51 ++++++-
 3 files changed, 260 insertions(+), 35 deletions(-)

diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index 8064911b3..4dcf4640e 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -27,61 +27,217 @@
 #include "dpif-netdev-perf.h"
 #include "dpif-provider.h"
 #include "flow.h"
+#include "ovs-thread.h"
 #include "packets.h"
 #include "pvector.h"

-/* Returns a hash value for the bits of 'key' where there are 1-bits in
- * 'mask'.
*/ -static inline uint32_t -netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key, - const struct netdev_flow_key *mask) +VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic); + +/* Lookup functions below depends on the internal structure of flowmap. */ +BUILD_ASSERT_DECL(FLOWMAP_UNITS == 2); + +struct block_array { + uint32_t count; /* Number of items allocated in 'blocks' */ + uint64_t blocks[]; +}; + +DEFINE_PER_THREAD_MALLOCED_DATA(struct block_array *, block_array); + +static inline uint64_t * +get_blocks_scratch(uint32_t required_count) { - const uint64_t *p = miniflow_get_values(&mask->mf); - uint32_t hash = 0; - uint64_t value; + struct block_array *array = block_array_get(); - NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) { - hash = hash_add64(hash, value & *p); - p++; + /* Check if this thread already has a large enough array allocated. + * This is a predictable and unlikely branch, as it occurs only once at + * startup, or if a subtable with higher block count is added. + */ + if (OVS_UNLIKELY(!array || array->count < required_count)) { + array = xrealloc(array, sizeof *array + + (required_count * sizeof array->blocks[0])); + array->count = required_count; + block_array_set_unsafe(array); + VLOG_DBG("Block array resized to %"PRIu32, required_count); } - return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8); + return &array->blocks[0]; } -uint32_t -dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, - uint32_t keys_map, - const struct netdev_flow_key *keys[], - struct dpcls_rule **rules) +static inline void +netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks, + const uint64_t *tbl_blocks, + const uint64_t *mf_masks, + uint64_t *blocks_scratch, + const uint64_t pkt_mf_bits, + const uint32_t count) { - int i; - uint32_t found_map; + uint32_t i; + + for (i = 0; i < count; i++) { + uint64_t mf_mask = mf_masks[i]; + /* Calculate the block index for the packet metadata. */ + uint64_t idx_bits = mf_mask & pkt_mf_bits; + const uint32_t pkt_idx = count_1bits(idx_bits); + + /* Check if the packet has the subtable miniflow bit set. If yes, the + * block at the above pkt_idx will be stored, otherwise it is masked + * out to be zero. + */ + uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits; + uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1; + + /* Mask packet block by table block, and mask to zero if packet + * doesn't actually contain this block of metadata. + */ + blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit; + } +} + +/* This function takes a packet, and subtable and writes an array of uint64_t + * blocks. The blocks contain the metadata that the subtable matches on, in + * the same order as the subtable, allowing linear iteration over the blocks. + * + * To calculate the blocks contents, the netdev_flow_key_flatten_unit function + * is called twice, once for each "unit" of the miniflow. This call can be + * inlined by the compiler for performance. + * + * Note that the u0_count and u1_count variables can be compile-time constants, + * allowing the loop in the inlined flatten_unit() function to be compile-time + * unrolled, or possibly removed totally by unrolling by the loop iterations. + * The compile time optimizations enabled by this design improves performance. 
+ */ +static inline void +netdev_flow_key_flatten(const struct netdev_flow_key *key, + const struct netdev_flow_key *mask, + const uint64_t *mf_masks, + uint64_t *blocks_scratch, + const uint32_t u0_count, + const uint32_t u1_count) +{ + /* Load mask from subtable, mask with packet mf, popcount to get idx. */ + const uint64_t *pkt_blocks = miniflow_get_values(&key->mf); + const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf); + + /* Packet miniflow bits to be masked by pre-calculated mf_masks. */ + const uint64_t pkt_bits_u0 = key->mf.map.bits[0]; + const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0); + const uint64_t pkt_bits_u1 = key->mf.map.bits[1]; + + /* Unit 0 flattening */ + netdev_flow_key_flatten_unit(&pkt_blocks[0], + &tbl_blocks[0], + &mf_masks[0], + &blocks_scratch[0], + pkt_bits_u0, + u0_count); + + /* Unit 1 flattening: + * Move the pointers forward in the arrays based on u0 offsets, NOTE: + * 1) pkt blocks indexed by actual popcount of u0, which is NOT always + * the same as the amount of bits set in the subtable. + * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so + * the index is always u0_count. + */ + netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop], + &tbl_blocks[u0_count], + &mf_masks[u0_count], + &blocks_scratch[u0_count], + pkt_bits_u1, + u1_count); +} + +/* Compares a rule and the blocks representing a key, returns 1 on a match. */ +static inline uint64_t +netdev_rule_matches_key(const struct dpcls_rule *rule, + const uint32_t mf_bits_total, + const uint64_t *blocks_scratch) +{ + const uint64_t *keyp = miniflow_get_values(&rule->flow.mf); + const uint64_t *maskp = miniflow_get_values(&rule->mask->mf); + uint64_t not_match = 0; + + for (int i = 0; i < mf_bits_total; i++) { + not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i]; + } - /* Compute hashes for the remaining keys. Each search-key is - * masked with the subtable's mask to avoid hashing the wildcarded - * bits. */ + /* Invert result to show match as 1. */ + return !not_match; +} + +/* Const prop version of the function: note that mf bits total and u0 are + * explicitly passed in here, while they're also available at runtime from the + * subtable pointer. By making them compile time, we enable the compiler to + * unroll loops and flatten out code-sequences based on the knowledge of the + * mf_bits_* compile time values. This results in improved performance. + * + * Note: this function is marked with ALWAYS_INLINE to ensure the compiler + * inlines the below code, and then uses the compile time constants to make + * specialized versions of the runtime code. Without ALWAYS_INLINE, the + * compiler might decide to not inline, and performance will suffer. + */ +static inline uint32_t ALWAYS_INLINE +lookup_generic_impl(struct dpcls_subtable *subtable, + uint32_t keys_map, + const struct netdev_flow_key *keys[], + struct dpcls_rule **rules, + const uint32_t bit_count_u0, + const uint32_t bit_count_u1) +{ + const uint32_t n_pkts = count_1bits(keys_map); + ovs_assert(NETDEV_MAX_BURST >= n_pkts); uint32_t hashes[NETDEV_MAX_BURST]; + + const uint32_t bit_count_total = bit_count_u0 + bit_count_u1; + const uint32_t block_count_required = bit_count_total * NETDEV_MAX_BURST; + uint64_t *mf_masks = subtable->mf_masks; + int i; + + /* Blocks scratch is an optimization to re-use the same packet miniflow + * block data when doing rule-verify. This reduces work done during lookup + * and hence improves performance. 
The blocks_scratch array is stored as a + * thread local variable, as each thread requires its own blocks memory. + */ + uint64_t *blocks_scratch = get_blocks_scratch(block_count_required); + + /* Flatten the packet metadata into the blocks_scratch[] using subtable. */ + ULLONG_FOR_EACH_1 (i, keys_map) { + netdev_flow_key_flatten(keys[i], + &subtable->mask, + mf_masks, + &blocks_scratch[i * bit_count_total], + bit_count_u0, + bit_count_u1); + } + + /* Hash the now linearized blocks of packet metadata. */ ULLONG_FOR_EACH_1 (i, keys_map) { - hashes[i] = netdev_flow_key_hash_in_mask(keys[i], &subtable->mask); + uint64_t *block_ptr = &blocks_scratch[i * bit_count_total]; + uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total); + hashes[i] = hash_finish(hash, bit_count_total * 8); } - /* Lookup. */ + /* Lookup: this returns a bitmask of packets where the hash table had + * an entry for the given hash key. Presence of a hash key does not + * guarantee matching the key, as there can be hash collisions. + */ + uint32_t found_map; const struct cmap_node *nodes[NETDEV_MAX_BURST]; + found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes); - /* Check results. When the i-th bit of found_map is set, it means - * that a set of nodes with a matching hash value was found for the - * i-th search-key. Due to possible hash collisions we need to check - * which of the found rules, if any, really matches our masked - * search-key. */ + /* Verify that packet actually matched rule. If not found, a hash + * collision has taken place, so continue searching with the next node. + */ ULLONG_FOR_EACH_1 (i, found_map) { struct dpcls_rule *rule; CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) { - if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) { + const uint32_t cidx = i * bit_count_total; + uint32_t match = netdev_rule_matches_key(rule, bit_count_total, + &blocks_scratch[cidx]); + + if (OVS_LIKELY(match)) { rules[i] = rule; - /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap - * within one second optimization interval. */ subtable->hit_cnt++; goto next; } @@ -96,3 +252,15 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, return found_map; } + +/* Generic lookup function that uses runtime provided mf bits for iterating. */ +uint32_t +dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, + uint32_t keys_map, + const struct netdev_flow_key *keys[], + struct dpcls_rule **rules) +{ + return lookup_generic_impl(subtable, keys_map, keys, rules, + subtable->mf_bits_set_unit0, + subtable->mf_bits_set_unit1); +} diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h index 555856482..610851a10 100644 --- a/lib/dpif-netdev-private.h +++ b/lib/dpif-netdev-private.h @@ -60,7 +60,7 @@ uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable, const struct netdev_flow_key *keys[], struct dpcls_rule **rules); -/* Prototype for generic lookup func, using same code path as before. */ +/* Prototype for generic lookup func, using generic scalar code path. */ uint32_t dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, uint32_t keys_map, @@ -77,12 +77,22 @@ struct dpcls_subtable { uint32_t hit_cnt; /* Number of match hits in subtable in current optimization interval. */ + /* Miniflow fingerprint that the subtable matches on. The miniflow "bits" + * are used to select the actual dpcls lookup implementation at subtable + * creation time. + */ + uint8_t mf_bits_set_unit0; + uint8_t mf_bits_set_unit1; + /* The lookup function to use for this subtable. 
If there is a known * property of the subtable (eg: only 3 bits of miniflow metadata is * used for the lookup) then this can point at an optimized version of * the lookup function for this particular subtable. */ dpcls_subtable_lookup_func lookup_func; + /* Caches the masks to match a packet to, reducing runtime calculations. */ + uint64_t *mf_masks; + struct netdev_flow_key mask; /* Wildcards for fields (const). */ /* 'mask' must be the last field, additional space is allocated here. */ }; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b42ca35e3..702170698 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -7649,6 +7649,7 @@ static void dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable) { cmap_destroy(&subtable->rules); + ovsrcu_postpone(free, subtable->mf_masks); ovsrcu_postpone(free, subtable); } @@ -7701,7 +7702,17 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) subtable->hit_cnt = 0; netdev_flow_key_clone(&subtable->mask, mask); - /* Decide which hash/lookup/verify function to use. */ + /* The count of bits in the mask defines the space required for masks. + * Then call gen_masks() to create the appropriate masks, avoiding the cost + * of doing runtime calculations. */ + uint32_t unit0 = count_1bits(mask->mf.map.bits[0]); + uint32_t unit1 = count_1bits(mask->mf.map.bits[1]); + subtable->mf_bits_set_unit0 = unit0; + subtable->mf_bits_set_unit1 = unit1; + subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1)); + netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1); + + /* Assign the generic lookup - this works with any miniflow fingerprint. */ subtable->lookup_func = dpcls_subtable_lookup_generic; cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash); @@ -7846,6 +7857,43 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule) } } +/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */ +static inline void +netdev_flow_key_gen_mask_unit(uint64_t iter, + const uint64_t count, + uint64_t *mf_masks) +{ + int i; + for (i = 0; i < count; i++) { + uint64_t lowest_bit = (iter & -iter); + iter &= ~lowest_bit; + mf_masks[i] = (lowest_bit - 1); + } + /* Checks that count has covered all bits in the iter bitmap. */ + ovs_assert(iter == 0); +} + +/* Generate a mask for each block in the miniflow, based on the bits set. This + * allows easily masking packets with the generated array here, without + * calculations. This replaces runtime-calculating the masks. + * @param key The table to generate the mf_masks for + * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size + * @param mf_bits_total Number of bits set in the whole miniflow (both units) + * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow + */ +void +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl, + uint64_t *mf_masks, + const uint32_t mf_bits_u0, + const uint32_t mf_bits_u1) +{ + uint64_t iter_u0 = tbl->mf.map.bits[0]; + uint64_t iter_u1 = tbl->mf.map.bits[1]; + + netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]); + netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]); +} + /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit * in 'mask' the values in 'key' and 'target' are the same. 
 */
 bool
@@ -7886,7 +7934,6 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
     BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

     struct dpcls_subtable *subtable;
-    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */

     if (cnt != MAP_BITS) {
-- 
2.17.1