On 08.05.2019 13:22, Harry van Haaren wrote:
> This commit refactors the generic implementation. The
> goal of this refactor is to simplify the code to enable
> "specialization" of the functions at compile time.
>
> Given compile-time optimizations, the compiler is able
> to unroll loops, and create optimized code sequences due
> to compile time knowledge of loop-trip counts.
>
> In order to enable these compiler optimizations, we must
> refactor the code to pass the loop-trip counts to functions
> as compile time constants.
>
> This patch allows the number of miniflow-bits set per "unit"
> in the miniflow to be passed around as a function argument.
>
> Note that this patch does NOT yet take advantage of doing so,
> this is only a refactor to enable it in the next patches.
>
> Signed-off-by: Harry van Haaren <harry.van.haa...@intel.com>
>
> ---
>
> v8:
>   - Rework block_cache and mf_masks to avoid variable-length array
>     due to compiler issues. Provisioning for worst case is not a
>     good solution due to magnitude of over-provisioning required.
>   - Rework netdev_flatten function removing unused parameter
> ---
>  lib/dpif-netdev-lookup-generic.c | 239 ++++++++++++++++++++++++-------
>  lib/dpif-netdev.c                |  79 +++++++++-
>  lib/dpif-netdev.h                |  20 ++-
>  3 files changed, 283 insertions(+), 55 deletions(-)
>
> diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
> index 2e4003408..3e1b704fe 100644
> --- a/lib/dpif-netdev-lookup-generic.c
> +++ b/lib/dpif-netdev-lookup-generic.c
> @@ -28,67 +28,204 @@
>  #include "packets.h"
>  #include "pvector.h"
>
> -/* Returns a hash value for the bits of 'key' where there are 1-bits in
> - * 'mask'. */
> -static inline uint32_t
> -netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
> -                             const struct netdev_flow_key *mask)
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
> +
> +/* netdev_flow_key_flatten_unit:
> + * Given a packet, table and mf_masks, this function iterates over each bit
> + * set in the subtable, and calculates the appropriate metadata to store in
> + * the blocks_scratch[].
> + *
> + * The results in blocks_scratch[] can be used for hashing, and later for
> + * verification of whether a rule matches the given packet.
> + */
> +static inline void
> +netdev_flow_key_flatten_unit(const uint64_t * restrict pkt_blocks,
> +                             const uint64_t * restrict tbl_blocks,
> +                             const uint64_t * restrict mf_masks,
> +                             uint64_t * restrict blocks_scratch,
> +                             const uint64_t pkt_mf_bits,
> +                             const uint32_t count)
>  {
> -    const uint64_t *p = miniflow_get_values(&mask->mf);
> -    uint32_t hash = 0;
> -    uint64_t value;
> +    uint32_t i;
> +    for (i = 0; i < count; i++) {
> +        uint64_t mf_mask = mf_masks[i];
> +        /* Calculate the block index for the packet metadata. */
> +        uint64_t idx_bits = mf_mask & pkt_mf_bits;
> +        const uint32_t pkt_idx = __builtin_popcountll(idx_bits);
I guess we need to use 'count_1bits' from lib/util.h here instead, to
avoid issues with different compilers or systems without builtin
support. Same for all other places. (A small sketch of what I mean is
at the end of this mail.)

>
> -    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) {
> -        hash = hash_add64(hash, value & *p);
> -        p++;
> +        /* Check if the packet has the subtable miniflow bit set. If yes,
> +         * the block at the above pkt_idx will be stored, otherwise it is
> +         * masked out to be zero.
> +         */
> +        uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
> +        uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
> +
> +        /* Mask packet block by table block, and mask to zero if the packet
> +         * doesn't actually contain this block of metadata.
> +         */
> +        blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
>      }
> +}
> +
> +/* netdev_flow_key_flatten:
> + * This function takes a packet and subtable, and writes an array of
> + * uint64_t blocks. The blocks contain the metadata that the subtable
> + * matches on, in the same order as the subtable, allowing linear iteration
> + * over the blocks.
> + *
> + * To calculate the blocks' contents, the netdev_flow_key_flatten_unit
> + * function is called twice, once for each "unit" of the miniflow. This call
> + * can be inlined by the compiler for performance.
> + *
> + * Note that the u0_count and u1_count variables can be compile-time
> + * constants, allowing the loop in the inlined flatten_unit() function to
> + * be unrolled at compile time, or possibly removed entirely, given the
> + * known loop iteration counts. The compile-time optimizations enabled by
> + * this design improve performance.
> + */
> +static inline void
> +netdev_flow_key_flatten(const struct netdev_flow_key * restrict key,
> +                        const struct netdev_flow_key * restrict mask,
> +                        const uint64_t * restrict mf_masks,
> +                        uint64_t * restrict blocks_scratch,
> +                        const uint32_t u0_count,
> +                        const uint32_t u1_count)
> +{
> +    /* Load mask from subtable, mask with packet mf, popcount to get idx. */
> +    const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
> +    const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
> +
> +    /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
> +    const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
> +    const uint32_t pkt_bits_u0_pop = __builtin_popcountll(pkt_bits_u0);
> +    const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
>
> -    return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
> +    /* Unit 0 flattening. */
> +    netdev_flow_key_flatten_unit(&pkt_blocks[0],
> +                                 &tbl_blocks[0],
> +                                 &mf_masks[0],
> +                                 &blocks_scratch[0],
> +                                 pkt_bits_u0,
> +                                 u0_count);
> +
> +    /* Unit 1 flattening:
> +     * Move the pointers forward in the arrays based on u0 offsets. NOTE:
> +     * 1) pkt blocks are indexed by the actual popcount of u0, which is NOT
> +     *    always the same as the number of bits set in the subtable.
> +     * 2) mf_masks, tbl_blocks and blocks_scratch are all "flat" arrays, so
> +     *    the index is always u0_count.
> +     */
> +    netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
> +                                 &tbl_blocks[u0_count],
> +                                 &mf_masks[u0_count],
> +                                 &blocks_scratch[u0_count],
> +                                 pkt_bits_u1,
> +                                 u1_count);
> +}
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule * restrict rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t * restrict blocks_scratch)
> +{
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +
> +    uint64_t not_match = 0;
> +    for (int i = 0; i < mf_bits_total; i++) {
> +        not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
> +    }
> +
> +    /* Invert result to show match as 1. */
> +    return !not_match;
>  }
>
> +/* Const-prop version of the function: note that mf_bits_total and u0 are
> + * explicitly passed in here, while they're also available at runtime from
> + * the subtable pointer. By making them compile time, we enable the compiler
> + * to unroll loops and flatten out code-sequences based on the knowledge of
> + * the mf_bits_* compile time values. This results in improved performance.
> + */
> +static inline uint32_t __attribute__((always_inline))
> +lookup_generic_impl(struct dpcls_subtable *subtable,
> +                    uint64_t *blocks_scratch,
> +                    uint32_t keys_map,
> +                    const struct netdev_flow_key *keys[],
> +                    struct dpcls_rule **rules,
> +                    const uint32_t bit_count_u0,
> +                    const uint32_t bit_count_u1)
> +{
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    uint64_t *mf_masks = subtable->mf_masks;
> +    int i;
> +
> +    /* Flatten the packet metadata into blocks_scratch[] using subtable. */
> +    ULLONG_FOR_EACH_1(i, keys_map) {
> +        netdev_flow_key_flatten(keys[i],
> +                                &subtable->mask,
> +                                mf_masks,
> +                                &blocks_scratch[i * bit_count_total],
> +                                bit_count_u0,
> +                                bit_count_u1);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata. */
> +    ULLONG_FOR_EACH_1(i, keys_map) {
> +        uint32_t hash = 0;
> +        uint32_t i_off = i * bit_count_total;
> +        for (int h = 0; h < bit_count_total; h++) {
> +            hash = hash_add64(hash, blocks_scratch[i_off + h]);
> +        }
> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that the packet actually matched the rule. If not, a hash
> +     * collision has taken place, so continue searching with the next node.
> +     */
> +    ULLONG_FOR_EACH_1(i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * bit_count_total;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &blocks_scratch[cidx]);
> +
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match. Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ;                     /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}
> +
> +/* Generic - use runtime provided mf bits. */
>  uint32_t
>  dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> +                              uint64_t *blocks_scratch,
>                                uint32_t keys_map,
>                                const struct netdev_flow_key *keys[],
>                                struct dpcls_rule **rules)
>  {
> -    int i;
> -    /* Compute hashes for the remaining keys. Each search-key is
> -     * masked with the subtable's mask to avoid hashing the wildcarded
> -     * bits. */
> -    uint32_t hashes[NETDEV_MAX_BURST];
> -    ULLONG_FOR_EACH_1(i, keys_map) {
> -        hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
> -                                                 &subtable->mask);
> -    }
> -
> -    /* Lookup. */
> -    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> -    uint32_t found_map =
> -        cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> -    /* Check results. When the i-th bit of found_map is set, it means
> -     * that a set of nodes with a matching hash value was found for the
> -     * i-th search-key. Due to possible hash collisions we need to check
> -     * which of the found rules, if any, really matches our masked
> -     * search-key. */
> -    ULLONG_FOR_EACH_1(i, found_map) {
> -        struct dpcls_rule *rule;
> -
> -        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -            if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
> -                rules[i] = rule;
> -                /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> -                 * within one second optimization interval. */
> -                subtable->hit_cnt++;
> -                goto next;
> -            }
> -        }
> -        /* None of the found rules was a match. Reset the i-th bit to
> -         * keep searching this key in the next subtable. */
> -        ULLONG_SET0(found_map, i); /* Did not match. */
> -    next:
> -        ; /* Keep Sparse happy. */
> -    }
> -
> -    return found_map;
> +    return lookup_generic_impl(subtable, blocks_scratch, keys_map, keys,
> +                               rules, subtable->mf_bits_set_unit0,
> +                               subtable->mf_bits_set_unit1);
>  }
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index fe7171baa..267efde6d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -232,6 +232,15 @@ struct dpcls {
>      odp_port_t in_port;
>      struct cmap subtables_map;
>      struct pvector subtables;
> +
> +    /* Region of memory for this DPCLS instance to use as scratch.
> +     * Size is guaranteed to be large enough to hold all blocks required
> +     * for the subtables to match on. This allows each dpcls lookup to
> +     * flatten the packet miniflows into this blocks_scratch area, without
> +     * using variable length arrays. This region is allocated on subtable
> +     * create, and will be resized as required if a larger subtable is
> +     * added. */
> +    uint64_t *blocks_scratch;
> +    uint32_t blocks_scratch_size;
>  };
>
>  /* Data structure to keep packet order till fastpath processing. */
> @@ -7546,6 +7555,8 @@ dpcls_init(struct dpcls *cls)
>  {
>      cmap_init(&cls->subtables_map);
>      pvector_init(&cls->subtables);
> +    cls->blocks_scratch = 0;
> +    cls->blocks_scratch_size = 0;
>  }
>
>  static void
> @@ -7556,6 +7567,7 @@ dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
>      cmap_remove(&cls->subtables_map, &subtable->cmap_node,
>                  subtable->mask.hash);
>      cmap_destroy(&subtable->rules);
> +    ovsrcu_postpone(free, subtable->mf_masks);
>      ovsrcu_postpone(free, subtable);
>  }
>
> @@ -7574,6 +7586,7 @@ dpcls_destroy(struct dpcls *cls)
>          }
>          cmap_destroy(&cls->subtables_map);
>          pvector_destroy(&cls->subtables);
> +        free(cls->blocks_scratch);
>      }
>  }
>
> @@ -7589,7 +7602,29 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
>      subtable->hit_cnt = 0;
>      netdev_flow_key_clone(&subtable->mask, mask);
>
> -    /* decide which hash/lookup/verify function to use */
> +    /* The count of bits in the mask defines the space required for masks.
> +     * Then call gen_masks() to create the appropriate masks, avoiding the
> +     * cost of doing runtime calculations. */
> +    uint32_t unit0 = __builtin_popcountll(mask->mf.map.bits[0]);
> +    uint32_t unit1 = __builtin_popcountll(mask->mf.map.bits[1]);
> +    subtable->mf_bits_set_unit0 = unit0;
> +    subtable->mf_bits_set_unit1 = unit1;
> +
> +    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
> +    netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
> +
> +    /* Allocate blocks scratch space only if the subtable requires more
> +     * space than is currently allocated. */
> +    const uint32_t blocks_required_per_pkt = unit0 + unit1;
> +    if (cls->blocks_scratch_size < blocks_required_per_pkt) {
> +        free(cls->blocks_scratch);
> +        cls->blocks_scratch = xmalloc(sizeof(uint64_t) * NETDEV_MAX_BURST *
> +                                      blocks_required_per_pkt);
> +        cls->blocks_scratch_size = blocks_required_per_pkt;
> +    }
> +
> +    /* Assign the generic lookup - this works with any miniflow fingerprint. */
>      subtable->lookup_func = dpcls_subtable_lookup_generic;
>
>      cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
> @@ -7734,6 +7769,43 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
>      }
>  }
>
> +/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
> +static inline void
> +netdev_flow_key_gen_mask_unit(uint64_t iter,
> +                              const uint64_t count,
> +                              uint64_t *mf_masks)
> +{
> +    int i;
> +    for (i = 0; i < count; i++) {
> +        uint64_t lowest_bit = (iter & -iter);
> +        iter &= ~lowest_bit;
> +        mf_masks[i] = (lowest_bit - 1);
> +    }
> +    /* Checks that count has covered all bits in the iter bitmap. */
> +    ovs_assert(iter == 0);
> +}
> +
> +/* Generate a mask for each block in the miniflow, based on the bits set.
> + * This allows easily masking packets with the generated array here, without
> + * calculations. This replaces runtime-calculating the masks.
> + *
> + * @param tbl        The table to generate the mf_masks for
> + * @param mf_masks   Pointer to a u64 array of at least *mf_bits* in size
> + * @param mf_bits_u0 Number of bits set in unit 0 of the miniflow
> + * @param mf_bits_u1 Number of bits set in unit 1 of the miniflow
> + */
> +void
> +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> +                          uint64_t *mf_masks,
> +                          const uint32_t mf_bits_u0,
> +                          const uint32_t mf_bits_u1)
> +{
> +    uint64_t iter_u0 = tbl->mf.map.bits[0];
> +    uint64_t iter_u1 = tbl->mf.map.bits[1];
> +
> +    netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
> +    netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
> +}
> +
>  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
>   * in 'mask' the values in 'key' and 'target' are the same. */
>  bool
> @@ -7774,6 +7846,7 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>      BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>
>      struct dpcls_subtable *subtable;
> +    uint64_t *blocks_scratch = cls->blocks_scratch;
>
>      uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
>
> @@ -7793,8 +7866,8 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>       * non-overlapping. */
>      PVECTOR_FOR_EACH (subtable, &cls->subtables) {
>          /* Call the subtable-specific lookup function. */
> -        uint32_t found_map = subtable->lookup_func(subtable, keys_map,
> -                                                   keys, rules);
> +        uint32_t found_map = subtable->lookup_func(subtable, blocks_scratch,
> +                                                   keys_map, keys, rules);
>
>          /* Count the number of subtables searched for this packet match. This
>           * estimates the "spread" of subtables looked at per matched packet. */
> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
> index 27145d721..9263256a9 100644
> --- a/lib/dpif-netdev.h
> +++ b/lib/dpif-netdev.h
> @@ -66,12 +66,14 @@ struct dpcls_rule {
>   * CPU instruction set available at runtime.
>   */
>  typedef uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable,
> -        uint32_t keys_map, const struct netdev_flow_key *keys[],
> +        uint64_t *blocks_scratch, uint32_t keys_map,
> +        const struct netdev_flow_key *keys[],
>          struct dpcls_rule **rules);
>
>  /* Prototype for generic lookup func, using same code path as before. */
>  uint32_t
>  dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> +                              uint64_t *blocks_scratch,
>                                uint32_t keys_map,
>                                const struct netdev_flow_key *keys[],
>                                struct dpcls_rule **rules);
> @@ -92,8 +94,18 @@ struct dpcls_subtable {
>   * subtable matches on. The miniflow "bits" are used to select the actual
>   * dpcls lookup implementation at subtable creation time.
>   */
> +    uint8_t mf_bits_set_unit0;
> +    uint8_t mf_bits_set_unit1;
> +
> +    /* The lookup function to use for this subtable. If there is a known
> +     * property of the subtable (e.g. only 3 bits of miniflow metadata are
> +     * used for the lookup) then this can point at an optimized version of
> +     * the lookup function for this particular subtable. */
>      dpcls_subtable_lookup_func lookup_func;
>
> +    /* Caches the masks to match a packet to, reducing runtime calculations. */
> +    uint64_t *mf_masks;
> +
>      struct netdev_flow_key mask; /* Wildcards for fields (const). */
>      /* 'mask' must be the last field, additional space is allocated here. */
>  };
>
> @@ -102,6 +114,12 @@ struct dpcls_subtable {
>  #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
>      MINIFLOW_FOR_EACH_IN_FLOWMAP (VALUE, &(KEY)->mf, FLOWMAP)
>
> +void
> +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> +                          uint64_t *mf_masks,
> +                          const uint32_t mf_bits_u0,
> +                          const uint32_t mf_bits_u1);
> +
>  bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
>                              const struct netdev_flow_key *target);
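
To make the count_1bits() comment above concrete, here is a minimal
sketch of the substitution I have in mind (the helper name
pkt_block_index() is mine, only for illustration):

    #include <stdint.h>

    #include "util.h"   /* count_1bits() */

    /* Same index calculation as in netdev_flow_key_flatten_unit(), but
     * using the portable count_1bits() wrapper from lib/util.h, which
     * falls back to a non-builtin implementation on compilers and
     * systems without __builtin_popcountll() support. */
    static inline uint32_t
    pkt_block_index(uint64_t mf_mask, uint64_t pkt_mf_bits)
    {
        uint64_t idx_bits = mf_mask & pkt_mf_bits;
        return count_1bits(idx_bits);
    }

The same one-line change applies to every __builtin_popcountll() call
in the patch.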
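
Mostly as a note for other reviewers: the mf_masks[] indexing took me a
moment, so below is a small standalone program (all names and values are
made up, not from the patch) that mirrors what
netdev_flow_key_gen_mask_unit() and netdev_flow_key_flatten_unit() do for
one unit. Each mask holds all bits below one subtable bit, so
popcount(mask & pkt_map) is the index of the matching packet block, and
(mask + 1) recovers the subtable bit itself to test whether the packet
has that block at all:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Portable popcount, what count_1bits() in lib/util.h provides. */
    static unsigned int
    popcount64(uint64_t x)
    {
        unsigned int n;

        for (n = 0; x; n++) {
            x &= x - 1;
        }
        return n;
    }

    int
    main(void)
    {
        /* Made-up subtable map: matches on miniflow bits 1, 4 and 9. */
        const uint64_t tbl_map = (UINT64_C(1) << 1) | (UINT64_C(1) << 4)
                                 | (UINT64_C(1) << 9);

        /* gen_mask_unit(): for each set bit, a mask of all lower bits. */
        uint64_t mf_masks[3];
        uint64_t iter = tbl_map;
        for (int i = 0; i < 3; i++) {
            uint64_t lowest_bit = iter & -iter;

            iter &= ~lowest_bit;
            mf_masks[i] = lowest_bit - 1;      /* 0x1, 0xf, 0x1ff. */
        }

        /* Made-up packet: has blocks for bits 1, 4 and 6, not bit 9. */
        const uint64_t pkt_map = (UINT64_C(1) << 1) | (UINT64_C(1) << 4)
                                 | (UINT64_C(1) << 6);
        const uint64_t pkt_blocks[3] = { 0xaa, 0xbb, 0xcc };

        /* flatten_unit(): count the packet bits below each subtable bit
         * to find the packet block, zeroing the result if the packet
         * lacks the bit. (The patch does the zeroing branch-free with
         * the no_bit mask and also ANDs with the subtable mask block; a
         * ternary is used here so the sketch never reads out of
         * bounds.) */
        for (int i = 0; i < 3; i++) {
            unsigned int pkt_idx = popcount64(mf_masks[i] & pkt_map);
            int pkt_has_bit = ((mf_masks[i] + 1) & pkt_map) != 0;
            uint64_t block = pkt_has_bit ? pkt_blocks[pkt_idx] : 0;

            printf("tbl block %d <- pkt idx %u, value %#" PRIx64 "\n",
                   i, pkt_idx, block);
        }
        return 0;
    }

For the third block this prints a zero value, because the packet has no
block for miniflow bit 9.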
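
And one comment on the specialization direction itself, since the commit
message mentions it: the pattern, as I understand it, is an always_inline
implementation that takes the trip counts as parameters, plus thin
wrappers that pass compile-time constants. A toy illustration of that
pattern (not OVS code, names are made up):

    #include <stdint.h>

    /* Implementation with the trip count as a parameter. When inlined
     * into a caller that passes a constant, the compiler knows the trip
     * count and can fully unroll the loop. */
    static inline uint64_t __attribute__((always_inline))
    sum_blocks_impl(const uint64_t *blocks, uint32_t count)
    {
        uint64_t sum = 0;

        for (uint32_t i = 0; i < count; i++) {
            sum += blocks[i];
        }
        return sum;
    }

    /* Specialized entry point: 'count' is a compile-time constant. */
    uint64_t
    sum_blocks_4(const uint64_t *blocks)
    {
        return sum_blocks_impl(blocks, 4);
    }

    /* Generic entry point: the trip count is only known at runtime. */
    uint64_t
    sum_blocks_any(const uint64_t *blocks, uint32_t count)
    {
        return sum_blocks_impl(blocks, count);
    }

In the dpcls case, subtable->lookup_func would then point at a
specialized wrapper when the subtable's mf_bits_set_unit0/1 match one of
the pre-built constants, and at dpcls_subtable_lookup_generic otherwise.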