On 08.05.2019 13:22, Harry van Haaren wrote:
> This commit refactors the generic implementation. The
> goal of this refactor is to simplify the code to enable
> "specialization" of the functions at compile time.
> 
> Given compile-time optimizations, the compiler is able
> to unroll loops, and create optimized code sequences due
> to compile time knowledge of loop-trip counts.
> 
> In order to enable these compiler optimizations, we must
> refactor the code to pass the loop-trip counts to functions
> as compile time constants.
> 
> This patch allows the number of miniflow-bits set per "unit"
> in the miniflow to be passed around as a function argument.
> 
> Note that this patch does NOT yet take advantage of doing so,
> this is only a refactor to enable it in the next patches.
> 
> Signed-off-by: Harry van Haaren <harry.van.haa...@intel.com>
> 
> ---
> 
> v8:
> - Rework block_cache and mf_masks to avoid variable-length array
>   due to compiler issues. Provisioning for worst case is not a
>   good solution due to magnitude of over-provisioning required.
> - Rework netdev_flatten function removing unused parameter
> ---
>  lib/dpif-netdev-lookup-generic.c | 239 ++++++++++++++++++++++++-------
>  lib/dpif-netdev.c                |  79 +++++++++-
>  lib/dpif-netdev.h                |  20 ++-
>  3 files changed, 283 insertions(+), 55 deletions(-)
> 
> diff --git a/lib/dpif-netdev-lookup-generic.c 
> b/lib/dpif-netdev-lookup-generic.c
> index 2e4003408..3e1b704fe 100644
> --- a/lib/dpif-netdev-lookup-generic.c
> +++ b/lib/dpif-netdev-lookup-generic.c
> @@ -28,67 +28,204 @@
>  #include "packets.h"
>  #include "pvector.h"
>  
> -/* Returns a hash value for the bits of 'key' where there are 1-bits in
> - * 'mask'. */
> -static inline uint32_t
> -netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
> -                             const struct netdev_flow_key *mask)
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
> +
> +/* netdev_flow_key_flatten_unit:
> + * Given a packet, table and mf_masks, this function iterates over each bit
> + * set in the subtable, and calculates the appropriate metadata to store in 
> the
> + * blocks_scratch[].
> + *
> + * The results of the blocks_scratch[] can be used for hashing, and later for
> + * verification of if a rule matches the given packet.
> + */
> +static inline void
> +netdev_flow_key_flatten_unit(const uint64_t * restrict pkt_blocks,
> +                             const uint64_t * restrict tbl_blocks,
> +                             const uint64_t * restrict mf_masks,
> +                             uint64_t * restrict blocks_scratch,
> +                             const uint64_t pkt_mf_bits,
> +                             const uint32_t count)
>  {
> -    const uint64_t *p = miniflow_get_values(&mask->mf);
> -    uint32_t hash = 0;
> -    uint64_t value;
> +    uint32_t i;
> +    for (i = 0; i < count; i++) {
> +        uint64_t mf_mask = mf_masks[i];
> +        /* Calculate the block index for the packet metadata */
> +        uint64_t idx_bits = mf_mask & pkt_mf_bits;
> +        const uint32_t pkt_idx = __builtin_popcountll(idx_bits);

I guess we need to use 'count_1bits' from lib/util.h instead to avoid
issues with different compilers or systems without builtin support.
Same for all other places.

>  
> -    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) {
> -        hash = hash_add64(hash, value & *p);
> -        p++;
> +        /* check if the packet has the subtable miniflow bit set. If yes, the
> +         * block at the above pkt_idx will be stored, otherwise it is masked
> +         * out to be zero.
> +         */
> +        uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
> +        uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
> +
> +        /* mask packet block by table block, and mask to zero if packet
> +         * doesn't actually contain this block of metadata
> +         */
> +        blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
>      }
> +}
> +
> +/* netdev_flow_key_flatten:
> + * This function takes a packet, and subtable and writes an array of uint64_t
> + * blocks. The blocks contain the metadata that the subtable matches on, in
> + * the same order as the subtable, allowing linear iteration over the blocks.
> + *
> + * To calculate the blocks contents, the netdev_flow_key_flatten_unit 
> function
> + * is called twice, once for each "unit" of the miniflow. This call can be
> + * inlined by the compiler for performance.
> + *
> + * Note that the u0_count and u1_count variables can be compile-time 
> constants,
> + * allowing the loop in the inlined flatten_unit() function to be 
> compile-time
> + * unrolled, or possibly removed totally by unrolling by the loop iterations.
> + * The compile time optimizations enabled by this design improves 
> performance.
> + */
> +static inline void
> +netdev_flow_key_flatten(const struct netdev_flow_key * restrict key,
> +                        const struct netdev_flow_key * restrict mask,
> +                        const uint64_t * restrict mf_masks,
> +                        uint64_t * restrict blocks_scratch,
> +                        const uint32_t u0_count,
> +                        const uint32_t u1_count)
> +{
> +    /* load mask from subtable, mask with packet mf, popcount to get idx */
> +    const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
> +    const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
> +
> +    /* packet miniflow bits to be masked by pre-calculated mf_masks */
> +    const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
> +    const uint32_t pkt_bits_u0_pop = __builtin_popcountll(pkt_bits_u0);
> +    const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
>  
> -    return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
> +    /* Unit 0 flattening */
> +    netdev_flow_key_flatten_unit(&pkt_blocks[0],
> +                            &tbl_blocks[0],
> +                            &mf_masks[0],
> +                            &blocks_scratch[0],
> +                            pkt_bits_u0,
> +                            u0_count);
> +
> +    /* Unit 1 flattening:
> +     * Move the pointers forward in the arrays based on u0 offsets, NOTE:
> +     * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
> +     *    the same as the amount of bits set in the subtable.
> +     * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
> +     *    the index is always u0_count.
> +     */
> +    netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
> +                                 &tbl_blocks[u0_count],
> +                                 &mf_masks[u0_count],
> +                                 &blocks_scratch[u0_count],
> +                                 pkt_bits_u1,
> +                                 u1_count);
> +}
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule * restrict rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t * restrict blocks_scratch)
> +{
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +
> +    uint64_t not_match = 0;
> +    for (int i = 0; i < mf_bits_total; i++) {
> +        not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
> +    }
> +
> +    /* invert result to show match as 1 */
> +    return !not_match;
>  }
>  
> +/* const prop version of the function: note that mf bits total and u0 are
> + * explicitly passed in here, while they're also available at runtime from 
> the
> + * subtable pointer. By making them compile time, we enable the compiler to
> + * unroll loops and flatten out code-sequences based on the knowledge of the
> + * mf_bits_* compile time values. This results in improved performance.
> + */
> +static inline uint32_t __attribute__((always_inline))
> +lookup_generic_impl(struct dpcls_subtable *subtable,
> +                    uint64_t *blocks_scratch,
> +                    uint32_t keys_map,
> +                    const struct netdev_flow_key *keys[],
> +                    struct dpcls_rule **rules,
> +                    const uint32_t bit_count_u0,
> +                    const uint32_t bit_count_u1)
> +{
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    uint64_t *mf_masks = subtable->mf_masks;
> +    int i;
> +
> +    /* Flatten the packet metadata into the blocks_scratch[] using subtable 
> */
> +    ULLONG_FOR_EACH_1(i, keys_map) {
> +            netdev_flow_key_flatten(keys[i],
> +                                    &subtable->mask,
> +                                    mf_masks,
> +                                    &blocks_scratch[i * bit_count_total],
> +                                    bit_count_u0,
> +                                    bit_count_u1);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata */
> +    ULLONG_FOR_EACH_1(i, keys_map) {
> +         uint32_t hash = 0;
> +         uint32_t i_off = i * bit_count_total;
> +         for (int h = 0; h < bit_count_total; h++) {
> +             hash = hash_add64(hash, blocks_scratch[i_off + h]);
> +         }
> +         hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that packet actually matched rule. If not found, a hash
> +     * collision has taken place, so continue searching with the next node.
> +     */
> +    ULLONG_FOR_EACH_1(i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * bit_count_total;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &blocks_scratch[cidx]);
> +
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match.  Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ;                     /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}
> +
> +/* Generic - use runtime provided mf bits */
>  uint32_t
>  dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> +                              uint64_t *blocks_scratch,
>                                uint32_t keys_map,
>                                const struct netdev_flow_key *keys[],
>                                struct dpcls_rule **rules)
>  {
> -        int i;
> -        /* Compute hashes for the remaining keys.  Each search-key is
> -         * masked with the subtable's mask to avoid hashing the wildcarded
> -         * bits. */
> -        uint32_t hashes[NETDEV_MAX_BURST];
> -        ULLONG_FOR_EACH_1(i, keys_map) {
> -            hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
> -                                                     &subtable->mask);
> -        }
> -
> -        /* Lookup. */
> -        const struct cmap_node *nodes[NETDEV_MAX_BURST];
> -        uint32_t found_map =
> -                cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> -        /* Check results.  When the i-th bit of found_map is set, it means
> -         * that a set of nodes with a matching hash value was found for the
> -         * i-th search-key.  Due to possible hash collisions we need to check
> -         * which of the found rules, if any, really matches our masked
> -         * search-key. */
> -        ULLONG_FOR_EACH_1(i, found_map) {
> -            struct dpcls_rule *rule;
> -
> -            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -                if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
> -                    rules[i] = rule;
> -                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> -                     * within one second optimization interval. */
> -                    subtable->hit_cnt++;
> -                    goto next;
> -                }
> -            }
> -            /* None of the found rules was a match.  Reset the i-th bit to
> -             * keep searching this key in the next subtable. */
> -            ULLONG_SET0(found_map, i);  /* Did not match. */
> -        next:
> -            ;                     /* Keep Sparse happy. */
> -        }
> -
> -        return found_map;
> +        return lookup_generic_impl(subtable, blocks_scratch, keys_map, keys,
> +                                   rules, subtable->mf_bits_set_unit0,
> +                                   subtable->mf_bits_set_unit1);
>  }
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index fe7171baa..267efde6d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -232,6 +232,15 @@ struct dpcls {
>      odp_port_t in_port;
>      struct cmap subtables_map;
>      struct pvector subtables;
> +
> +    /* Region of memory for this DPCLS instance to use as scratch.
> +     * Size is garaunteed to be large enough to hold all blocks required for
> +     * the subtable's to match on. This allows each dpcls lookup to flatten
> +     * the packet miniflows into this blocks_scratch area, without using
> +     * variable lenght arrays. This region is allocated on subtable create, 
> and
> +     * will be resized as required if a larger subtable is added. */
> +    uint64_t *blocks_scratch;
> +    uint32_t blocks_scratch_size;
>  };
>  
>  /* Data structure to keep packet order till fastpath processing. */
> @@ -7546,6 +7555,8 @@ dpcls_init(struct dpcls *cls)
>  {
>      cmap_init(&cls->subtables_map);
>      pvector_init(&cls->subtables);
> +    cls->blocks_scratch = 0;
> +    cls->blocks_scratch_size = 0;
>  }
>  
>  static void
> @@ -7556,6 +7567,7 @@ dpcls_destroy_subtable(struct dpcls *cls, struct 
> dpcls_subtable *subtable)
>      cmap_remove(&cls->subtables_map, &subtable->cmap_node,
>                  subtable->mask.hash);
>      cmap_destroy(&subtable->rules);
> +    ovsrcu_postpone(free, subtable->mf_masks);
>      ovsrcu_postpone(free, subtable);
>  }
>  
> @@ -7574,6 +7586,7 @@ dpcls_destroy(struct dpcls *cls)
>          }
>          cmap_destroy(&cls->subtables_map);
>          pvector_destroy(&cls->subtables);
> +        free(cls->blocks_scratch);
>      }
>  }
>  
> @@ -7589,7 +7602,29 @@ dpcls_create_subtable(struct dpcls *cls, const struct 
> netdev_flow_key *mask)
>      subtable->hit_cnt = 0;
>      netdev_flow_key_clone(&subtable->mask, mask);
>  
> -    /* decide which hash/lookup/verify function to use */
> +    /* The count of bits in the mask defines the space required for masks.
> +     * Then call gen_masks() to create the appropriate masks, avoiding the 
> cost
> +     * of doing runtime calculations */
> +    uint32_t unit0 = __builtin_popcountll(mask->mf.map.bits[0]);
> +    uint32_t unit1 = __builtin_popcountll(mask->mf.map.bits[1]);
> +    subtable->mf_bits_set_unit0 = unit0;
> +    subtable->mf_bits_set_unit1 = unit1;
> +
> +    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
> +    netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
> +
> +
> +    /* allocate blocks scratch space only if subtable requires more size than
> +     * is currently allocated */
> +    const uint32_t blocks_required_per_pkt = unit0 + unit1;
> +    if (cls->blocks_scratch_size < blocks_required_per_pkt) {
> +        free(cls->blocks_scratch);
> +        cls->blocks_scratch = xmalloc(sizeof(uint64_t) * NETDEV_MAX_BURST *
> +                                      blocks_required_per_pkt);
> +        cls->blocks_scratch_size = blocks_required_per_pkt;
> +    }
> +
> +    /* Assign the generic lookup - this works with any miniflow fingerprint 
> */
>      subtable->lookup_func = dpcls_subtable_lookup_generic;
>  
>      cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
> @@ -7734,6 +7769,43 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule 
> *rule)
>      }
>  }
>  
> +/* inner loop for mask generation of a unit, see netdev_flow_key_gen_masks */
> +static inline void
> +netdev_flow_key_gen_mask_unit(uint64_t iter,
> +                              const uint64_t count,
> +                              uint64_t *mf_masks)
> +{
> +    int i;
> +    for (i = 0; i < count; i++) {
> +        uint64_t lowest_bit = (iter & -iter);
> +        iter &= ~lowest_bit;
> +        mf_masks[i] = (lowest_bit - 1);
> +    }
> +    /* checks that count has covered all bits in the iter bitmap */
> +    ovs_assert(iter == 0);
> +}
> +
> +/* generate a mask for each block in the miniflow, based on the bits set. 
> This
> + * allows easily masking packets with the generated array here, without
> + * calculations. This replaces runtime-calculating the masks.
> + * @param key The table to generate the mf_masks for
> + * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
> + * @param mf_bits_total Number of bits set in the whole miniflow (both units)
> + * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
> + */
> +void
> +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> +                          uint64_t *mf_masks,
> +                          const uint32_t mf_bits_u0,
> +                          const uint32_t mf_bits_u1)
> +{
> +    uint64_t iter_u0 = tbl->mf.map.bits[0];
> +    uint64_t iter_u1 = tbl->mf.map.bits[1];
> +
> +    netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
> +    netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, 
> &mf_masks[mf_bits_u0]);
> +}
> +
>  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
>   * in 'mask' the values in 'key' and 'target' are the same. */
>  bool
> @@ -7774,6 +7846,7 @@ dpcls_lookup(struct dpcls *cls, const struct 
> netdev_flow_key *keys[],
>      BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>  
>      struct dpcls_subtable *subtable;
> +    uint64_t *blocks_scratch = cls->blocks_scratch;
>  
>      uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
>  
> @@ -7793,8 +7866,8 @@ dpcls_lookup(struct dpcls *cls, const struct 
> netdev_flow_key *keys[],
>       * non-overlapping. */
>      PVECTOR_FOR_EACH (subtable, &cls->subtables) {
>          /* call the subtable specific lookup function */
> -        uint32_t found_map = subtable->lookup_func(subtable, keys_map,
> -                                                   keys, rules);
> +        uint32_t found_map = subtable->lookup_func(subtable, blocks_scratch,
> +                                                   keys_map, keys, rules);
>  
>          /* Count the number of subtables searched for this packet match. This
>           * estimates the "spread" of subtables looked at per matched packet 
> */
> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
> index 27145d721..9263256a9 100644
> --- a/lib/dpif-netdev.h
> +++ b/lib/dpif-netdev.h
> @@ -66,12 +66,14 @@ struct dpcls_rule {
>   * CPU instruction set available at runtime.
>   */
>  typedef uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable 
> *subtable,
> -                uint32_t keys_map, const struct netdev_flow_key *keys[],
> +                uint64_t *blocks_scratch, uint32_t keys_map,
> +                const struct netdev_flow_key *keys[],
>                  struct dpcls_rule **rules);
>  
>  /* Prototype for generic lookup func, using same code path as before */
>  uint32_t
>  dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> +                              uint64_t *blocks_scratch,
>                                uint32_t keys_map,
>                                const struct netdev_flow_key *keys[],
>                                struct dpcls_rule **rules);
> @@ -92,8 +94,18 @@ struct dpcls_subtable {
>       * subtable matches on. The miniflow "bits" are used to select the actual
>       * dpcls lookup implementation at subtable creation time.
>       */
> +    uint8_t mf_bits_set_unit0;
> +    uint8_t mf_bits_set_unit1;
> +
> +    /* the lookup function to use for this subtable. If there is a known
> +     * property of the subtable (eg: only 3 bits of miniflow metadata is
> +     * used for the lookup) then this can point at an optimized version of
> +     * the lookup function for this particular subtable. */
>      dpcls_subtable_lookup_func lookup_func;
>  
> +    /* caches the masks to match a packet to, reducing runtime calculations 
> */
> +    uint64_t *mf_masks;
> +
>      struct netdev_flow_key mask; /* Wildcards for fields (const). */
>      /* 'mask' must be the last field, additional space is allocated here. */
>  };
> @@ -102,6 +114,12 @@ struct dpcls_subtable {
>  #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
>      MINIFLOW_FOR_EACH_IN_FLOWMAP (VALUE, &(KEY)->mf, FLOWMAP)
>  
> +void
> +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> +                          uint64_t *mf_masks,
> +                          const uint32_t mf_bits_u0,
> +                          const uint32_t mf_bits_u1);
> +
>  bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
>                              const struct netdev_flow_key *target);
>  
> 
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to