http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/HashTable.hpp ---------------------------------------------------------------------- diff --git a/storage/HashTable.hpp b/storage/HashTable.hpp index be31fd9..6e3dc96 100644 --- a/storage/HashTable.hpp +++ b/storage/HashTable.hpp @@ -23,6 +23,7 @@ #include <atomic> #include <cstddef> #include <cstdlib> +#include <memory> #include <type_traits> #include <vector> @@ -39,11 +40,14 @@ #include "types/Type.hpp" #include "types/TypedValue.hpp" #include "utility/BloomFilter.hpp" +#include "utility/BloomFilterAdapter.hpp" #include "utility/HashPair.hpp" #include "utility/Macros.hpp" namespace quickstep { +DECLARE_int64(bloom_adapter_batch_size); + /** \addtogroup Storage * @{ */ @@ -1016,8 +1020,12 @@ class HashTable : public HashTableBase<resizable, * * @param bloom_filter The pointer to the bloom filter. **/ - inline void setBuildSideBloomFilter(BloomFilter *bloom_filter) { - build_bloom_filter_ = bloom_filter; + inline void addBuildSideBloomFilter(BloomFilter *bloom_filter) { + build_bloom_filters_.emplace_back(bloom_filter); + } + + inline void addBuildSideAttributeId(const attribute_id build_attribute_id) { + build_attribute_ids_.push_back(build_attribute_id); } /** @@ -1042,8 +1050,8 @@ class HashTable : public HashTableBase<resizable, * @param probe_attribute_ids The vector of attribute ids to use for probing * the bloom filter. **/ - inline void addProbeSideAttributeIds(std::vector<attribute_id> &&probe_attribute_ids) { - probe_attribute_ids_.push_back(probe_attribute_ids); + inline void addProbeSideAttributeId(const attribute_id probe_attribute_id) { + probe_attribute_ids_.push_back(probe_attribute_id); } protected: @@ -1329,9 +1337,10 @@ class HashTable : public HashTableBase<resizable, // Data structures used for bloom filter optimized semi-joins. bool has_build_side_bloom_filter_ = false; bool has_probe_side_bloom_filter_ = false; - BloomFilter *build_bloom_filter_; + std::vector<BloomFilter *> build_bloom_filters_; + std::vector<attribute_id> build_attribute_ids_; std::vector<const BloomFilter*> probe_bloom_filters_; - std::vector<std::vector<attribute_id>> probe_attribute_ids_; + std::vector<attribute_id> probe_attribute_ids_; DISALLOW_COPY_AND_ASSIGN(HashTable); }; @@ -1477,12 +1486,26 @@ HashTablePutResult HashTable<ValueT, resizable, serializable, force_key_copy, al &prealloc_state); } } - std::unique_ptr<BloomFilter> thread_local_bloom_filter; + if (has_build_side_bloom_filter_) { - thread_local_bloom_filter.reset(new BloomFilter(build_bloom_filter_->getRandomSeed(), - build_bloom_filter_->getNumberOfHashes(), - build_bloom_filter_->getBitArraySize())); + for (std::size_t i = 0; i < build_bloom_filters_.size(); ++i) { + auto *build_bloom_filter = build_bloom_filters_[i]; + std::unique_ptr<BloomFilter> thread_local_bloom_filter( + new BloomFilter(build_bloom_filter->getNumberOfHashes(), + build_bloom_filter->getBitArraySize())); + const auto &build_attr = build_attribute_ids_[i]; + const std::size_t attr_size = + accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, build_attr).second; + while (accessor->next()) { + thread_local_bloom_filter->insertUnSafe( + static_cast<const std::uint8_t *>(accessor->getUntypedValue(build_attr)), + attr_size); + } + build_bloom_filter->bitwiseOr(thread_local_bloom_filter.get()); + accessor->beginIteration(); + } } + if (resizable) { while (result == HashTablePutResult::kOutOfSpace) { { @@ -1498,11 +1521,6 @@ HashTablePutResult HashTable<ValueT, resizable, serializable, force_key_copy, al variable_size, (*functor)(*accessor), using_prealloc ? &prealloc_state : nullptr); - // Insert into bloom filter, if enabled. - if (has_build_side_bloom_filter_) { - thread_local_bloom_filter->insertUnSafe(static_cast<const std::uint8_t *>(key.getDataPtr()), - key.getDataSize()); - } if (result == HashTablePutResult::kDuplicateKey) { DEBUG_ASSERT(!using_prealloc); return result; @@ -1528,20 +1546,11 @@ HashTablePutResult HashTable<ValueT, resizable, serializable, force_key_copy, al variable_size, (*functor)(*accessor), using_prealloc ? &prealloc_state : nullptr); - // Insert into bloom filter, if enabled. - if (has_build_side_bloom_filter_) { - thread_local_bloom_filter->insertUnSafe(static_cast<const std::uint8_t *>(key.getDataPtr()), - key.getDataSize()); - } if (result != HashTablePutResult::kOK) { return result; } } } - // Update the build side bloom filter with thread local copy, if available. - if (has_build_side_bloom_filter_) { - build_bloom_filter_->bitwiseOr(thread_local_bloom_filter.get()); - } return HashTablePutResult::kOK; }); @@ -1607,6 +1616,26 @@ HashTablePutResult HashTable<ValueT, resizable, serializable, force_key_copy, al &prealloc_state); } } + + if (has_build_side_bloom_filter_) { + for (std::size_t i = 0; i < build_bloom_filters_.size(); ++i) { + auto *build_bloom_filter = build_bloom_filters_[i]; + std::unique_ptr<BloomFilter> thread_local_bloom_filter( + new BloomFilter(build_bloom_filter->getNumberOfHashes(), + build_bloom_filter->getBitArraySize())); + const auto &build_attr = build_attribute_ids_[i]; + const std::size_t attr_size = + accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, build_attr).second; + while (accessor->next()) { + thread_local_bloom_filter->insertUnSafe( + static_cast<const std::uint8_t *>(accessor->getUntypedValue(build_attr)), + attr_size); + } + build_bloom_filter->bitwiseOr(thread_local_bloom_filter.get()); + accessor->beginIteration(); + } + } + if (resizable) { while (result == HashTablePutResult::kOutOfSpace) { { @@ -2229,6 +2258,7 @@ inline std::size_t HashTable<ValueT, resizable, serializable, force_key_copy, al } } + template <typename ValueT, bool resizable, bool serializable, @@ -2246,42 +2276,85 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_ InvokeOnAnyValueAccessor( accessor, [&](auto *accessor) -> void { // NOLINT(build/c++11) - while (accessor->next()) { - // Probe any bloom filters, if enabled. - if (has_probe_side_bloom_filter_) { - DCHECK_EQ(probe_bloom_filters_.size(), probe_attribute_ids_.size()); - // Check if the key is contained in the BloomFilters or not. - bool bloom_miss = false; - for (std::size_t i = 0; i < probe_bloom_filters_.size() && !bloom_miss; ++i) { - const BloomFilter *bloom_filter = probe_bloom_filters_[i]; - for (const attribute_id &attr_id : probe_attribute_ids_[i]) { - TypedValue bloom_key = accessor->getTypedValue(attr_id); - if (!bloom_filter->contains(static_cast<const std::uint8_t*>(bloom_key.getDataPtr()), - bloom_key.getDataSize())) { - bloom_miss = true; + std::unique_ptr<BloomFilterAdapter> bloom_filter_adapter; + if (has_probe_side_bloom_filter_) { + // Find (and cache) the size of each attribute in the probe lists. + // NOTE(nav): This code uses the accessor to get the size, + // and hence only works if there's at least one tuple. + std::vector<std::size_t> attr_size_vector; + attr_size_vector.reserve(probe_attribute_ids_.size()); + for (const auto &probe_attr : probe_attribute_ids_) { + auto val_and_size = + accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, probe_attr); +// std::cerr << "BF attr size = " << val_and_size.second << "\n"; + attr_size_vector.emplace_back(val_and_size.second); + } + + bloom_filter_adapter.reset(new BloomFilterAdapter( + probe_bloom_filters_, probe_attribute_ids_, attr_size_vector)); + + // We want to have large batch sizes for cache efficiency while probeing, + // but small batch sizes to ensure that the adaptation logic kicks in + // (and does early). We use exponentially increasing batch sizes to + // achieve a balance between the two. + // + // We also keep track of num_tuples_left in the block, to ensure that + // we don't reserve an unnecessarily large vector. + std::uint32_t batch_size_try = FLAGS_bloom_adapter_batch_size; + std::uint32_t num_tuples_left = accessor->getNumTuples(); + std::vector<tuple_id> batch(num_tuples_left); + + do { + std::uint32_t batch_size = + batch_size_try < num_tuples_left ? batch_size_try : num_tuples_left; + for (std::size_t i = 0; i < batch_size; ++i) { + accessor->next(); + batch.push_back(accessor->getCurrentPosition()); + } + + std::size_t num_hits = + bloom_filter_adapter->bulkProbe<true>(accessor, batch, batch_size); + + for (std::size_t t = 0; t < num_hits; ++t){ + tuple_id probe_tid = batch[t]; + TypedValue key = accessor->getTypedValueAtAbsolutePosition(key_attr_id, probe_tid); + if (check_for_null_keys && key.isNull()) { + continue; + } + const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral() + : key.getHash(); + const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash) + : true_hash; + std::size_t entry_num = 0; + const ValueT *value; + while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) { + (*functor)(probe_tid, *value); + if (!allow_duplicate_keys) break; - } } } - if (bloom_miss) { - continue; // On a bloom filter miss, probing the hash table can be skipped. - } - } + batch.clear(); + num_tuples_left -= batch_size; + batch_size_try = batch_size * 2; + } while (!accessor->iterationFinished()); + } - TypedValue key = accessor->getTypedValue(key_attr_id); - if (check_for_null_keys && key.isNull()) { - continue; - } - const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral() - : key.getHash(); - const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash) - : true_hash; - std::size_t entry_num = 0; - const ValueT *value; - while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) { - (*functor)(*accessor, *value); - if (!allow_duplicate_keys) { - break; + else { // no Bloom filters to probe + while(accessor->next()) { + TypedValue key = accessor->getTypedValue(key_attr_id); + if (check_for_null_keys && key.isNull()) { + continue; + } + const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral() + : key.getHash(); + const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash) + : true_hash; + std::size_t entry_num = 0; + const ValueT *value; + while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) { + (*functor)(*accessor, *value); + if (!allow_duplicate_keys) + break; } } }
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/HashTable.proto ---------------------------------------------------------------------- diff --git a/storage/HashTable.proto b/storage/HashTable.proto index 7f00f29..6eabf60 100644 --- a/storage/HashTable.proto +++ b/storage/HashTable.proto @@ -34,10 +34,10 @@ message HashTable { required HashTableImplType hash_table_impl_type = 1; repeated Type key_types = 2; required uint64 estimated_num_entries = 3; - repeated uint32 build_side_bloom_filter_id = 4; - message ProbeSideBloomFilter { - required uint32 probe_side_bloom_filter_id = 1; - repeated uint32 probe_side_attr_ids = 2; + message BloomFilter { + required uint32 bloom_filter_id = 1; + required uint32 attr_id = 2; } - repeated ProbeSideBloomFilter probe_side_bloom_filters = 6; + repeated BloomFilter probe_side_bloom_filters = 4; + repeated BloomFilter build_side_bloom_filters = 5; } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/HashTableFactory.hpp ---------------------------------------------------------------------- diff --git a/storage/HashTableFactory.hpp b/storage/HashTableFactory.hpp index 34baaeb..fbb3d41 100644 --- a/storage/HashTableFactory.hpp +++ b/storage/HashTableFactory.hpp @@ -318,9 +318,15 @@ class HashTableFactory { // individual implementations of the hash table constructors. // Check if there are any build side bloom filter defined on the hash table. - if (proto.build_side_bloom_filter_id_size() > 0) { + if (proto.build_side_bloom_filters_size() > 0) { hash_table->enableBuildSideBloomFilter(); - hash_table->setBuildSideBloomFilter(bloom_filters[proto.build_side_bloom_filter_id(0)].get()); + for (int j = 0; j < proto.build_side_bloom_filters_size(); ++j) { + const auto build_side_bloom_filter = proto.build_side_bloom_filters(j); + hash_table->addBuildSideBloomFilter( + bloom_filters[build_side_bloom_filter.bloom_filter_id()].get()); + + hash_table->addBuildSideAttributeId(build_side_bloom_filter.attr_id()); + } } // Check if there are any probe side bloom filters defined on the hash table. @@ -330,15 +336,10 @@ class HashTableFactory { for (int j = 0; j < proto.probe_side_bloom_filters_size(); ++j) { // Add the pointer to the probe bloom filter within the list of probe bloom filters to use. const auto probe_side_bloom_filter = proto.probe_side_bloom_filters(j); - hash_table->addProbeSideBloomFilter(bloom_filters[probe_side_bloom_filter.probe_side_bloom_filter_id()].get()); - - // Add the attribute ids corresponding to this probe bloom filter. - std::vector<attribute_id> probe_attribute_ids; - for (int k = 0; k < probe_side_bloom_filter.probe_side_attr_ids_size(); ++k) { - const attribute_id probe_attribute_id = probe_side_bloom_filter.probe_side_attr_ids(k); - probe_attribute_ids.push_back(probe_attribute_id); - } - hash_table->addProbeSideAttributeIds(std::move(probe_attribute_ids)); + hash_table->addProbeSideBloomFilter( + bloom_filters[probe_side_bloom_filter.bloom_filter_id()].get()); + + hash_table->addProbeSideAttributeId(probe_side_bloom_filter.attr_id()); } } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/PackedRowStoreValueAccessor.hpp ---------------------------------------------------------------------- diff --git a/storage/PackedRowStoreValueAccessor.hpp b/storage/PackedRowStoreValueAccessor.hpp index 03a975e..cbd273e 100644 --- a/storage/PackedRowStoreValueAccessor.hpp +++ b/storage/PackedRowStoreValueAccessor.hpp @@ -18,6 +18,8 @@ #ifndef QUICKSTEP_STORAGE_PACKED_ROW_STORE_VALUE_ACCESSOR_HPP_ #define QUICKSTEP_STORAGE_PACKED_ROW_STORE_VALUE_ACCESSOR_HPP_ +#include <utility> + #include "catalog/CatalogRelationSchema.hpp" #include "catalog/CatalogTypedefs.hpp" #include "storage/StorageBlockInfo.hpp" @@ -40,7 +42,8 @@ class PackedRowStoreValueAccessorHelper { : relation_(relation), num_tuples_(num_tuples), tuple_storage_(tuple_storage), - null_bitmap_(null_bitmap) { + null_bitmap_(null_bitmap), + attr_max_lengths_(relation.getMaximumAttributeByteLengths()) { } inline tuple_id numPackedTuples() const { @@ -65,6 +68,25 @@ class PackedRowStoreValueAccessorHelper { + relation_.getFixedLengthAttributeOffset(attr); // Attribute offset within tuple. } + template <bool check_null> + inline std::pair<const void*, std::size_t> getAttributeValueAndByteLength(const tuple_id tuple, + const attribute_id attr) const { + DEBUG_ASSERT(tuple < num_tuples_); + DEBUG_ASSERT(relation_.hasAttributeWithId(attr)); + if (check_null) { + const int nullable_idx = relation_.getNullableAttributeIndex(attr); + if ((nullable_idx != -1) + && null_bitmap_->getBit(tuple * relation_.numNullableAttributes() + nullable_idx)) { + return std::make_pair(nullptr, 0); + } + } + + return std::make_pair(static_cast<const char*>(tuple_storage_) + + (tuple * relation_.getFixedByteLength()) + + relation_.getFixedLengthAttributeOffset(attr), + attr_max_lengths_[attr]); + } + inline TypedValue getAttributeValueTyped(const tuple_id tuple, const attribute_id attr) const { const Type &attr_type = relation_.getAttributeById(attr)->getType(); @@ -79,6 +101,7 @@ class PackedRowStoreValueAccessorHelper { const tuple_id num_tuples_; const void *tuple_storage_; const BitVector<false> *null_bitmap_; + const std::vector<std::size_t> &attr_max_lengths_; DISALLOW_COPY_AND_ASSIGN(PackedRowStoreValueAccessorHelper); }; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/SplitRowStoreValueAccessor.hpp ---------------------------------------------------------------------- diff --git a/storage/SplitRowStoreValueAccessor.hpp b/storage/SplitRowStoreValueAccessor.hpp index 9ea1a3a..19937f2 100644 --- a/storage/SplitRowStoreValueAccessor.hpp +++ b/storage/SplitRowStoreValueAccessor.hpp @@ -100,6 +100,11 @@ class SplitRowStoreValueAccessor : public ValueAccessor { return getUntypedValueAtAbsolutePosition<check_null>(attr_id, current_position_); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const attribute_id attr_id) const { + return getUntypedValueAndByteLengthAtAbsolutePosition<check_null>(attr_id, current_position_); + } + inline TypedValue getTypedValue(const attribute_id attr_id) const { return getTypedValueAtAbsolutePosition(attr_id, current_position_); } @@ -140,6 +145,44 @@ class SplitRowStoreValueAccessor : public ValueAccessor { } } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLengthAtAbsolutePosition(const attribute_id attr_id, + const tuple_id tid) const { + DEBUG_ASSERT(occupancy_bitmap_.getBit(tid)); + DEBUG_ASSERT(relation_.hasAttributeWithId(attr_id)); + const char *tuple_slot = static_cast<const char*>(tuple_storage_) + + tuple_slot_bytes_ * tid; + if (check_null) { + const int nullable_idx = relation_.getNullableAttributeIndex(attr_id); + if (nullable_idx != -1) { + // const_cast is safe here. We will only be using read-only methods of + // BitVector. + BitVector<true> tuple_null_bitmap(const_cast<void*>(static_cast<const void*>(tuple_slot)), + relation_.numNullableAttributes()); + if (tuple_null_bitmap.getBit(nullable_idx)) { + return std::make_pair(nullptr, 0); + } + } + } + + const int variable_length_idx = relation_.getVariableLengthAttributeIndex(attr_id); + if (variable_length_idx == -1) { + // Fixed-length, stored in-line in slot. + return std::make_pair(tuple_slot + per_tuple_null_bitmap_bytes_ + + relation_.getFixedLengthAttributeOffset(attr_id), + attr_max_lengths_[attr_id]); + + } else { + // Variable-length, stored at back of block. + const std::uint32_t *pos_ptr = reinterpret_cast<const std::uint32_t*>( + tuple_slot + per_tuple_null_bitmap_bytes_ + + relation_.getFixedByteLength() + + variable_length_idx * 2 * sizeof(std::uint32_t)); + return std::make_pair(static_cast<const char*>(tuple_storage_) + pos_ptr[0], + pos_ptr[1]); + } + } + inline TypedValue getTypedValueAtAbsolutePosition(const attribute_id attr_id, const tuple_id tid) const { DEBUG_ASSERT(occupancy_bitmap_.getBit(tid)); @@ -317,6 +360,7 @@ class SplitRowStoreValueAccessor : public ValueAccessor { tuple_storage_(tuple_storage), tuple_slot_bytes_(tuple_slot_bytes), per_tuple_null_bitmap_bytes_(per_tuple_null_bitmap_bytes), + attr_max_lengths_(relation.getMaximumAttributeByteLengths()), current_position_(std::numeric_limits<std::size_t>::max()) { } @@ -327,6 +371,7 @@ class SplitRowStoreValueAccessor : public ValueAccessor { const void *tuple_storage_; const std::size_t tuple_slot_bytes_; const std::size_t per_tuple_null_bitmap_bytes_; + const std::vector<std::size_t> &attr_max_lengths_; std::size_t current_position_; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/StorageBlock.cpp ---------------------------------------------------------------------- diff --git a/storage/StorageBlock.cpp b/storage/StorageBlock.cpp index fdd438d..78aba7c 100644 --- a/storage/StorageBlock.cpp +++ b/storage/StorageBlock.cpp @@ -389,15 +389,7 @@ AggregationState* StorageBlock::aggregate( const AggregationHandle &handle, const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<attribute_id> *arguments_as_attributes, - const Predicate *predicate, std::unique_ptr<TupleIdSequence> *reuse_matches) const { - // If there is a filter predicate that hasn't already been evaluated, - // evaluate it now and save the results for other aggregates on this same - // block. - if (predicate && !*reuse_matches) { - reuse_matches->reset(getMatchesForPredicate(predicate)); - } - #ifdef QUICKSTEP_ENABLE_VECTOR_COPY_ELISION_SELECTION // If all the arguments to this aggregate are plain relation attributes, // aggregate directly on a ValueAccessor from this block to avoid a copy. @@ -418,7 +410,6 @@ void StorageBlock::aggregateGroupBy( const AggregationHandle &handle, const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<std::unique_ptr<const Scalar>> &group_by, - const Predicate *predicate, AggregationStateHashTableBase *hash_table, std::unique_ptr<TupleIdSequence> *reuse_matches, std::vector<std::unique_ptr<ColumnVector>> *reuse_group_by_vectors) const { @@ -440,14 +431,7 @@ void StorageBlock::aggregateGroupBy( ColumnVectorsValueAccessor temp_result; { std::unique_ptr<ValueAccessor> accessor; - if (predicate) { - if (!*reuse_matches) { - // If there is a filter predicate that hasn't already been evaluated, - // evaluate it now and save the results for other aggregates on this - // same block. - reuse_matches->reset(getMatchesForPredicate(predicate)); - } - + if (reuse_matches) { // Create a filtered ValueAccessor that only iterates over predicate // matches. accessor.reset(tuple_store_->createValueAccessor(reuse_matches->get())); @@ -499,7 +483,6 @@ void StorageBlock::aggregateDistinct( const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<attribute_id> *arguments_as_attributes, const std::vector<std::unique_ptr<const Scalar>> &group_by, - const Predicate *predicate, AggregationStateHashTableBase *distinctify_hash_table, std::unique_ptr<TupleIdSequence> *reuse_matches, std::vector<std::unique_ptr<ColumnVector>> *reuse_group_by_vectors) const { @@ -514,14 +497,7 @@ void StorageBlock::aggregateDistinct( ColumnVectorsValueAccessor temp_result; { std::unique_ptr<ValueAccessor> accessor; - if (predicate) { - if (!*reuse_matches) { - // If there is a filter predicate that hasn't already been evaluated, - // evaluate it now and save the results for other aggregates on this - // same block. - reuse_matches->reset(getMatchesForPredicate(predicate)); - } - + if (reuse_matches) { // Create a filtered ValueAccessor that only iterates over predicate // matches. accessor.reset(tuple_store_->createValueAccessor(reuse_matches->get())); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/StorageBlock.hpp ---------------------------------------------------------------------- diff --git a/storage/StorageBlock.hpp b/storage/StorageBlock.hpp index 3ae3812..3217fa2 100644 --- a/storage/StorageBlock.hpp +++ b/storage/StorageBlock.hpp @@ -410,7 +410,6 @@ class StorageBlock : public StorageBlockBase { const AggregationHandle &handle, const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<attribute_id> *arguments_as_attributes, - const Predicate *predicate, std::unique_ptr<TupleIdSequence> *reuse_matches) const; /** @@ -460,7 +459,6 @@ class StorageBlock : public StorageBlockBase { void aggregateGroupBy(const AggregationHandle &handle, const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<std::unique_ptr<const Scalar>> &group_by, - const Predicate *predicate, AggregationStateHashTableBase *hash_table, std::unique_ptr<TupleIdSequence> *reuse_matches, std::vector<std::unique_ptr<ColumnVector>> @@ -505,7 +503,6 @@ class StorageBlock : public StorageBlockBase { const std::vector<std::unique_ptr<const Scalar>> &arguments, const std::vector<attribute_id> *arguments_as_attributes, const std::vector<std::unique_ptr<const Scalar>> &group_by, - const Predicate *predicate, AggregationStateHashTableBase *distinctify_hash_table, std::unique_ptr<TupleIdSequence> *reuse_matches, std::vector<std::unique_ptr<ColumnVector>> *reuse_group_by_vectors) const; @@ -588,6 +585,8 @@ class StorageBlock : public StorageBlockBase { **/ const std::size_t getNumTuples() const; + TupleIdSequence* getMatchesForPredicate(const Predicate *predicate) const; + private: static TupleStorageSubBlock* CreateTupleStorageSubBlock( const CatalogRelationSchema &relation, @@ -627,8 +626,6 @@ class StorageBlock : public StorageBlockBase { // StorageBlock's header. bool rebuildIndexes(bool short_circuit); - TupleIdSequence* getMatchesForPredicate(const Predicate *predicate) const; - std::unordered_map<attribute_id, TypedValue>* generateUpdatedValues( const ValueAccessor &accessor, const tuple_id tuple, http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/storage/ValueAccessor.hpp ---------------------------------------------------------------------- diff --git a/storage/ValueAccessor.hpp b/storage/ValueAccessor.hpp index e2a898e..e9370cc 100644 --- a/storage/ValueAccessor.hpp +++ b/storage/ValueAccessor.hpp @@ -375,6 +375,11 @@ class TupleIdSequenceAdapterValueAccessor : public ValueAccessor { return accessor_->template getUntypedValueAtAbsolutePosition<check_null>(attr_id, *current_position_); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const attribute_id attr_id) const { + return accessor_->template getUntypedValueAndByteLengthAtAbsolutePosition<check_null>(attr_id, *current_position_); + } + inline TypedValue getTypedValue(const attribute_id attr_id) const { return accessor_->getTypedValueAtAbsolutePosition(attr_id, *current_position_); } @@ -387,6 +392,13 @@ class TupleIdSequenceAdapterValueAccessor : public ValueAccessor { } // Pass-through. + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLengthAtAbsolutePosition(const attribute_id attr_id, + const tuple_id tid) const { + return accessor_->template getUntypedValueAndByteLengthAtAbsolutePosition<check_null>(attr_id, tid); + } + + // Pass-through. inline TypedValue getTypedValueAtAbsolutePosition(const attribute_id attr_id, const tuple_id tid) const { return accessor_->getTypedValueAtAbsolutePosition(attr_id, tid); @@ -560,6 +572,12 @@ class OrderedTupleIdSequenceAdapterValueAccessor : public ValueAccessor { id_sequence_[current_position_]); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const attribute_id attr_id) const { + return accessor_->template getUntypedValueAndByteLengthAtAbsolutePosition<check_null>( + attr_id, id_sequence_[current_position_]); + } + inline TypedValue getTypedValue(const attribute_id attr_id) const { return accessor_->getTypedValueAtAbsolutePosition(attr_id, id_sequence_[current_position_]); } @@ -571,6 +589,13 @@ class OrderedTupleIdSequenceAdapterValueAccessor : public ValueAccessor { "OrderedTupleIdSequenceAdapterValueAccessor"); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLengthAtAbsolutePosition(const attribute_id attr_id, + const tuple_id tid) const { + FATAL_ERROR("getUntypedValueAndByteLengthAtAbsolutePosition() not implemented in " + "OrderedTupleIdSequenceAdapterValueAccessor"); + } + inline TypedValue getTypedValueAtAbsolutePosition(const attribute_id attr_id, const tuple_id tid) const { FATAL_ERROR("getTypedValueAtAbsolutePosition() not implemented in " @@ -737,6 +762,11 @@ class PackedTupleStorageSubBlockValueAccessor : public ValueAccessor { return getUntypedValueAtAbsolutePosition<check_null>(attr_id, current_tuple_); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const attribute_id attr_id) const { + return getUntypedValueAndByteLengthAtAbsolutePosition<check_null>(attr_id, current_tuple_); + } + inline TypedValue getTypedValue(const attribute_id attr_id) const { return getTypedValueAtAbsolutePosition(attr_id, current_tuple_); } @@ -747,6 +777,12 @@ class PackedTupleStorageSubBlockValueAccessor : public ValueAccessor { return helper_.template getAttributeValue<check_null>(tid, attr_id); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLengthAtAbsolutePosition(const attribute_id attr_id, + const tuple_id tid) const { + return helper_.template getAttributeValueAndByteLength<check_null>(tid, attr_id); + } + inline TypedValue getTypedValueAtAbsolutePosition(const attribute_id attr_id, const tuple_id tid) const { return helper_.getAttributeValueTyped(tid, attr_id); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/types/containers/ColumnVector.hpp ---------------------------------------------------------------------- diff --git a/types/containers/ColumnVector.hpp b/types/containers/ColumnVector.hpp index 76968ba..a9349ee 100644 --- a/types/containers/ColumnVector.hpp +++ b/types/containers/ColumnVector.hpp @@ -193,6 +193,22 @@ class NativeColumnVector : public ColumnVector { } /** + * @brief Get the untyped pointer to a value as well as the value's byte length + * in this NativeColumnVector as a pair. + * + * @param position The position of the value to get. + * @return A pair containing the untyped pointer to the value at position and + * the value's byte length. + **/ + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const std::size_t position) const { + DCHECK_LT(position, actual_length_); + return (check_null && null_bitmap_ && null_bitmap_->getBit(position)) + ? std::make_pair(nullptr, 0) + : std::make_pair(static_cast<const char*>(values_) + (position * type_length_), type_length_); + } + + /** * @brief Get a value in this NativeColumnVector as a TypedValue. * * @param position The position of the value to get. @@ -453,6 +469,25 @@ class IndirectColumnVector : public ColumnVector { } /** + * @brief Get the untyped pointer to a value as well as the value's byte length + * in this IndirectColumnVector as a pair. + * + * @param position The position of the value to get. + * @return A pair containing the untyped pointer to the value at position and + * the value's byte length. + **/ + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const std::size_t position) const { + DCHECK_LT(position, values_.size()); + if (check_null && type_is_nullable_ && values_[position].isNull()) { + return std::make_pair(nullptr, 0); + } else { + const TypedValue &value = values_[position]; + return std::make_pair(value.getDataPtr(), value.getDataSize()); + } + } + + /** * @brief Get a value in this IndirectColumnVector as a TypedValue. * * @param position The position of the value to get. http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/types/containers/ColumnVectorsValueAccessor.hpp ---------------------------------------------------------------------- diff --git a/types/containers/ColumnVectorsValueAccessor.hpp b/types/containers/ColumnVectorsValueAccessor.hpp index f1d29a2..d69d1d8 100644 --- a/types/containers/ColumnVectorsValueAccessor.hpp +++ b/types/containers/ColumnVectorsValueAccessor.hpp @@ -124,6 +124,11 @@ class ColumnVectorsValueAccessor : public ValueAccessor { return getUntypedValueAtAbsolutePosition<check_null>(attr_id, current_position_); } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLength(const attribute_id attr_id) const { + return getUntypedValueAndByteLengthAtAbsolutePosition<check_null>(attr_id, current_position_); + } + inline TypedValue getTypedValue(const attribute_id attr_id) const { return getTypedValueAtAbsolutePosition(attr_id, current_position_); } @@ -140,6 +145,18 @@ class ColumnVectorsValueAccessor : public ValueAccessor { } } + template <bool check_null = true> + inline std::pair<const void*, std::size_t> getUntypedValueAndByteLengthAtAbsolutePosition(const attribute_id attr_id, + const tuple_id tid) const { + DCHECK(attributeIdInRange(attr_id)); + DCHECK(tupleIdInRange(tid)); + if (column_native_[attr_id]) { + return static_cast<const NativeColumnVector&>(*columns_[attr_id]).getUntypedValueAndByteLength<check_null>(tid); + } else { + return static_cast<const IndirectColumnVector&>(*columns_[attr_id]).getUntypedValueAndByteLength<check_null>(tid); + } + } + inline TypedValue getTypedValueAtAbsolutePosition(const attribute_id attr_id, const tuple_id tid) const { DCHECK(attributeIdInRange(attr_id)); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/BloomFilter.hpp ---------------------------------------------------------------------- diff --git a/utility/BloomFilter.hpp b/utility/BloomFilter.hpp index b93df84..973ca14 100644 --- a/utility/BloomFilter.hpp +++ b/utility/BloomFilter.hpp @@ -26,6 +26,7 @@ #include <algorithm> #include <cstddef> #include <cstdint> +#include <cstring> #include <memory> #include <utility> #include <vector> @@ -44,11 +45,358 @@ namespace quickstep { * @{ */ +class BloomFilterOriginal; +class BloomFilterBlocked; +typedef BloomFilterBlocked BloomFilter; + +/** + * @brief A "blocked" version of Bloom Filter based on this paper: + * Putze, Felix, Peter Sanders, and Johannes Singler. + * "Cache-, hash-and space-efficient bloom filters." + * International Workshop on Experimental and Efficient Algorithms. + * Springer Berlin Heidelberg, 2007. + **/ +class BloomFilterBlocked { + public: + static const std::uint8_t kNumBitsPerByte = 8; + static const std::uint8_t kMaxNumHashFns = 4; + + // This union allows us to read/write position in convenient fashion, + // through nested structs and their bitfield members + // + // A position can simply be a 32-bit hash + // Or it can be a cache line (block of 512 bits) and position within it + // Or it can be a byte (block of 8 bits) and position within it + union Position { + std::uint32_t hash; + struct CacheLinePosition { + unsigned index_in_line : 9; + unsigned line_num : 23; + } cache_line_pos; + struct BytePosition { + unsigned index_in_byte : 3; + unsigned byte_num : 29; + } byte_pos; + }; + + // This Bloom filter implementation requires the bit array to be a + // multiple of the cache-line size. So we either have to round up to a + // multiple (default behavior) or round down to a multiple. + // Rounding up is usually preferable but rounding down is necessary when + // we are given a bit array that we don't control the size of, in the + // constructor. + static std::uint64_t getNearestAllowedSize( + const std::uint64_t approx_size, + bool round_down = false) { + if (round_down) + return (approx_size / kCacheLineBytes) * kCacheLineBytes; + return ((approx_size + kCacheLineBytes - 1)/ kCacheLineBytes) * kCacheLineBytes; + } + + + /** + * @brief Constructor. + * @note When no bit_array is being passed to the constructor, + * then the bit_array is owned and managed by this class. + * + * @param hash_fn_count The number of hash functions used by this bloom filter. + * @param bit_array_size_in_bytes Size of the bit array. + **/ + BloomFilterBlocked(const std::uint8_t hash_fn_count, + const std::uint64_t bit_array_size_in_bytes) + : hash_fn_count_(hash_fn_count), + array_size_in_bytes_(getNearestAllowedSize(bit_array_size_in_bytes)), + is_bit_array_owner_(true), + bit_array_(new std::uint8_t[array_size_in_bytes_]) { + reset(); + } + + /** + * @brief Constructor. + * @note When a bit_array is passed as an argument to the constructor, + * then the ownership of the bit array lies with the caller. + * + * @param hash_fn_count The number of hash functions used by this bloom filter. + * @param bit_array_size_in_bytes Size of the bit array. + * @param bit_array A pointer to the memory region that is used to store bit array. + * @param is_initialized A boolean that indicates whether to zero-out the region + * before use or not. + **/ + BloomFilterBlocked(const std::uint8_t hash_fn_count, + const std::uint64_t bit_array_size_in_bytes, + std::uint8_t *bit_array, + const bool is_initialized) + : hash_fn_count_(hash_fn_count), + array_size_in_bytes_(getNearestAllowedSize(bit_array_size_in_bytes, true)), + is_bit_array_owner_(false), + bit_array_(bit_array) { // Owned by the calling method. + if (!is_initialized) { + reset(); + } + } + + /** + * @brief Constructor. + * @note When a bloom filter proto is passed as an initializer, + * then the bit_array is owned and managed by this class. + * + * @param bloom_filter_proto The protobuf representation of a + * bloom filter configuration. + **/ + explicit BloomFilterBlocked(const serialization::BloomFilter &bloom_filter_proto) + : hash_fn_count_(bloom_filter_proto.number_of_hashes()), + array_size_in_bytes_(bloom_filter_proto.bloom_filter_size()), + is_bit_array_owner_(true), + bit_array_(new std::uint8_t[array_size_in_bytes_]) { + reset(); + } + + /** + * @brief Destructor. + **/ + ~BloomFilterBlocked() { + if (is_bit_array_owner_) { + bit_array_.reset(); + } else { + bit_array_.release(); + } + } + + static bool ProtoIsValid(const serialization::BloomFilter &bloom_filter_proto) { + return bloom_filter_proto.IsInitialized(); + } + + /** + * @brief Zeros out the contents of the bit array. + **/ + inline void reset() { + // Initialize the bit_array with all zeros. + std::fill_n(bit_array_.get(), array_size_in_bytes_, 0x00); + inserted_element_count_ = 0; + } + + /** + * @brief Get the number of hash functions used in this bloom filter. + * + * @return Returns the number of hash functions. + **/ + inline std::uint8_t getNumberOfHashes() const { + return hash_fn_count_; + } + + /** + * @brief Get the size of the bit array in bytes for this bloom filter. + * + * @return Returns the bit array size (in bytes). + **/ + inline std::uint64_t getBitArraySize() const { + return array_size_in_bytes_; + } + + /** + * @brief Get the constant pointer to the bit array for this bloom filter + * + * @return Returns constant pointer to the bit array. + **/ + inline const std::uint8_t* getBitArray() const { + return bit_array_.get(); + } + + template <typename T> + void insert(const T &value) { + insert(reinterpret_cast<const std::uint8_t *>(&value), sizeof(T)); + } + + /** + * @brief Inserts a given value into the bloom filter in a thread-safe manner. + * + * @param key_begin A pointer to the value being inserted. + * @param length Size of the value being inserted in bytes. + */ + inline void insert(const std::uint8_t *key_begin, const std::size_t length) { + SpinSharedMutexExclusiveLock<false> exclusive_writer_lock(bloom_filter_insert_mutex_); + insertUnSafe(key_begin, length); + } + + template <typename T> + void insertUnSafe(const T &value) { + insertUnSafe(reinterpret_cast<const std::uint8_t *>(&value), sizeof(T)); + } + + /** + * @brief Inserts a given value into the bloom filter. + * @Warning This is a faster thread-unsafe version of the insert() function. + * The caller needs to ensure the thread safety. + * + * @param key_begin A pointer to the value being inserted. + * @param length Size of the value being inserted in bytes. + */ + inline void insertUnSafe(const std::uint8_t *key_begin, const std::size_t length) { + Position first_pos = getFirstPosition(key_begin, length); + setBitAtPosition(first_pos); + Position other_pos; + for (std::uint8_t i = 1; i <hash_fn_count_; ++i) { + other_pos = getOtherPosition(key_begin, length, first_pos, i); + setBitAtPosition(other_pos); + } + ++inserted_element_count_; + } + + template <typename T> + bool contains(const T &value) { + return contains(reinterpret_cast<const std::uint8_t *>(&value), sizeof(T)); + } + + /** + * @brief Test membership of a given value in the bloom filter. + * If true is returned, then a value may or may not be present in the bloom filter. + * If false is returned, a value is certainly not present in the bloom filter. + * + * @note The membersip test does not require any locks, because the assumption is that + * the bloom filter will only be used after it has been built. + * + * @param key_begin A pointer to the value being tested for membership. + * @param length Size of the value being inserted in bytes. + */ + inline bool contains( + const std::uint8_t *__restrict__ key_begin, + const std::size_t length) const { + Position first_pos = getFirstPosition(key_begin, length); + if (!getBitAtPosition(first_pos)) { + return false; + } + Position other_pos; + for (std::uint8_t i = 1; i < hash_fn_count_; ++i) { + other_pos = getOtherPosition(key_begin, length, first_pos, i); + if (!getBitAtPosition(other_pos)) { + return false; + } + } + return true; + } + + /** + * @brief Perform a bitwise-OR of the given Bloom filter with this bloom filter. + * Essentially, it does a union of this bloom filter with the passed bloom filter. + * + * @param bloom_filter A const pointer to the bloom filter object to do bitwise-OR with. + */ + inline void bitwiseOr(const BloomFilterBlocked *bloom_filter) { + SpinSharedMutexExclusiveLock<false> exclusive_writer_lock(bloom_filter_insert_mutex_); + for (std::size_t byte_index = 0; byte_index < bloom_filter->getBitArraySize(); ++byte_index) { + (bit_array_.get())[byte_index] |= bloom_filter->getBitArray()[byte_index]; + } + } + + /** + * @brief Return the number of elements currently inserted into bloom filter. + * + * @return The number of elements inserted into bloom filter. + **/ + inline std::uint32_t element_count() const { + return inserted_element_count_; + } + + protected: + Position getFirstPosition(const std::uint8_t *begin, std::size_t length) const { + Position pos; + pos.hash = hash_identity(begin, length); + return pos; + } + + Position getOtherPosition( + const std::uint8_t *begin, + std::size_t length, + const Position first_pos, + const std::uint8_t index) const { + Position pos; + pos.hash = hash_multiplicative(begin, length, hash_fn_[index-1]); + pos.cache_line_pos.line_num = first_pos.cache_line_pos.line_num; + return pos; + } + + void fillPosition( + const std::uint8_t *begin, + std::size_t length, + const std::uint8_t index, + Position positions[]) const { + if (index == 0) + positions[0].hash = hash_identity(begin, length); + else { + positions[index].hash = hash_multiplicative(begin, length, hash_fn_[index-1]); + positions[index].cache_line_pos.line_num = positions[0].cache_line_pos.line_num; + } + } + + void setBitAtPosition(const Position &pos) { + (bit_array_.get())[pos.byte_pos.byte_num] |= (1 << pos.byte_pos.index_in_byte); + } + + bool getBitAtPosition(const Position &pos) const { + return (bit_array_.get())[pos.byte_pos.byte_num] & (1 << pos.byte_pos.index_in_byte); + } + + inline std::uint32_t hash_identity( + const std::uint8_t *__restrict__ begin, + std::size_t length) const { + std::uint32_t hash; + if (length >= 4) + hash = *reinterpret_cast<const std::uint32_t*> (begin); + else + std::memcpy(&hash, begin, length); + return hash % (array_size_in_bytes_ * kNumBitsPerByte); + } + + inline std::uint32_t hash_multiplicative( + const std::uint8_t *__restrict__ begin, + std::size_t length, + const std::uint64_t multiplier) const { + std::uint32_t hash = 0; + std::size_t bytes_hashed = 0; + if (length >= 4) { + while (bytes_hashed < length) { + auto val = *reinterpret_cast<const std::uint32_t *>(begin + bytes_hashed); + hash += (multiplier * val) >> 24; + bytes_hashed += 4; + } + } + while (bytes_hashed < length) { + std::uint8_t val = *(begin + bytes_hashed); + hash += (multiplier * val) >> 24; + bytes_hashed++; + } + return hash;// % (array_size_in_bytes_ * kNumBitsPerByte); + } + + private: + const std::uint32_t hash_fn_count_; + const std::uint64_t array_size_in_bytes_; + std::uint32_t inserted_element_count_; + const bool is_bit_array_owner_; + + static constexpr std::uint64_t kKnuthGoldenRatioNumber = 2654435761; + const std::uint64_t hash_fn_[kMaxNumHashFns] = { // hash_fn_[i] is 2**(i+1) - 1 + 0x00000001 * kKnuthGoldenRatioNumber, // 0x00000003, 0x00000007, 0x0000000f, + // 0x0000001f * kKnuthGoldenRatioNumber, // 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff * kKnuthGoldenRatioNumber, // 0x000003ff, 0x000007ff, 0x00000fff, + // 0x00001fff * kKnuthGoldenRatioNumber, // 0x00003fff, 0x00007fff, 0x0000ffff, + 0x0001ffff * kKnuthGoldenRatioNumber, // 0x0003ffff, 0x0007ffff, 0x000fffff, + // 0x001fffff * kKnuthGoldenRatioNumber, // 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff * kKnuthGoldenRatioNumber, // 0x03ffffff, 0x07ffffff, 0x0fffffff, + // 0x1fffffff * kKnuthGoldenRatioNumber // 0x3fffffff, 0x7fffffff, 0xffffffff + }; + + alignas(kCacheLineBytes) std::unique_ptr<std::uint8_t> bit_array_; + alignas(kCacheLineBytes) mutable SpinSharedMutex<false> bloom_filter_insert_mutex_; + + DISALLOW_COPY_AND_ASSIGN(BloomFilterBlocked); +}; + /** * @brief A simple Bloom Filter implementation with basic primitives * based on Partow's Bloom Filter implementation. **/ -class BloomFilter { +class BloomFilterOriginal { public: static const uint32_t kNumBitsPerByte = 8; @@ -57,21 +405,17 @@ class BloomFilter { * @note When no bit_array is being passed to the constructor, * then the bit_array is owned and managed by this class. * - * @param random_seed A random_seed that generates unique hash functions. * @param hash_fn_count The number of hash functions used by this bloom filter. * @param bit_array_size_in_bytes Size of the bit array. **/ - BloomFilter(const std::uint64_t random_seed, - const std::size_t hash_fn_count, + BloomFilterOriginal(const std::size_t hash_fn_count, const std::uint64_t bit_array_size_in_bytes) - : random_seed_(random_seed), - hash_fn_count_(hash_fn_count), + : hash_fn_count_(hash_fn_count), array_size_in_bytes_(bit_array_size_in_bytes), array_size_(array_size_in_bytes_ * kNumBitsPerByte), bit_array_(new std::uint8_t[array_size_in_bytes_]), is_bit_array_owner_(true) { reset(); - generate_unique_hash_fn(); } /** @@ -79,20 +423,17 @@ class BloomFilter { * @note When a bit_array is passed as an argument to the constructor, * then the ownership of the bit array lies with the caller. * - * @param random_seed A random_seed that generates unique hash functions. * @param hash_fn_count The number of hash functions used by this bloom filter. * @param bit_array_size_in_bytes Size of the bit array. * @param bit_array A pointer to the memory region that is used to store bit array. * @param is_initialized A boolean that indicates whether to zero-out the region * before use or not. **/ - BloomFilter(const std::uint64_t random_seed, - const std::size_t hash_fn_count, + BloomFilterOriginal(const std::size_t hash_fn_count, const std::uint64_t bit_array_size_in_bytes, std::uint8_t *bit_array, const bool is_initialized) - : random_seed_(random_seed), - hash_fn_count_(hash_fn_count), + : hash_fn_count_(hash_fn_count), array_size_in_bytes_(bit_array_size_in_bytes), array_size_(bit_array_size_in_bytes * kNumBitsPerByte), bit_array_(bit_array), // Owned by the calling method. @@ -100,7 +441,6 @@ class BloomFilter { if (!is_initialized) { reset(); } - generate_unique_hash_fn(); } /** @@ -111,21 +451,19 @@ class BloomFilter { * @param bloom_filter_proto The protobuf representation of a * bloom filter configuration. **/ - explicit BloomFilter(const serialization::BloomFilter &bloom_filter_proto) - : random_seed_(bloom_filter_proto.bloom_filter_seed()), - hash_fn_count_(bloom_filter_proto.number_of_hashes()), + explicit BloomFilterOriginal(const serialization::BloomFilter &bloom_filter_proto) + : hash_fn_count_(bloom_filter_proto.number_of_hashes()), array_size_in_bytes_(bloom_filter_proto.bloom_filter_size()), array_size_(array_size_in_bytes_ * kNumBitsPerByte), bit_array_(new std::uint8_t[array_size_in_bytes_]), is_bit_array_owner_(true) { reset(); - generate_unique_hash_fn(); } /** * @brief Destructor. **/ - ~BloomFilter() { + ~BloomFilterOriginal() { if (is_bit_array_owner_) { bit_array_.reset(); } else { @@ -147,15 +485,6 @@ class BloomFilter { } /** - * @brief Get the random seed that was used to initialize this bloom filter. - * - * @return Returns the random seed. - **/ - inline std::uint64_t getRandomSeed() const { - return random_seed_; - } - - /** * @brief Get the number of hash functions used in this bloom filter. * * @return Returns the number of hash functions. @@ -198,7 +527,7 @@ class BloomFilter { // Determine all the bit positions that are required to be set. for (std::size_t i = 0; i < hash_fn_count_; ++i) { - compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit); + compute_indices(hash_multiplicative(key_begin, length, hash_fn_[i]), &bit_index, &bit); modified_bit_positions.push_back(std::make_pair(bit_index, bit)); } @@ -243,7 +572,7 @@ class BloomFilter { std::size_t bit = 0; for (std::size_t i = 0; i < hash_fn_count_; ++i) { - compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit); + compute_indices(hash_multiplicative(key_begin, length, hash_fn_[i]), &bit_index, &bit); (bit_array_.get())[bit_index / kNumBitsPerByte] |= (1 << bit); } @@ -265,7 +594,7 @@ class BloomFilter { std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < hash_fn_count_; ++i) { - compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit); + compute_indices(hash_multiplicative(key_begin, length, hash_fn_[i]), &bit_index, &bit); if (((bit_array_.get())[bit_index / kNumBitsPerByte] & (1 << bit)) != (1 << bit)) { return false; } @@ -279,7 +608,7 @@ class BloomFilter { * * @param bloom_filter A const pointer to the bloom filter object to do bitwise-OR with. */ - inline void bitwiseOr(const BloomFilter *bloom_filter) { + inline void bitwiseOr(const BloomFilterOriginal *bloom_filter) { SpinSharedMutexExclusiveLock<false> exclusive_writer_lock(bloom_filter_insert_mutex_); for (std::size_t byte_index = 0; byte_index < bloom_filter->getBitArraySize(); ++byte_index) { (bit_array_.get())[byte_index] |= bloom_filter->getBitArray()[byte_index]; @@ -301,95 +630,28 @@ class BloomFilter { *bit = *bit_index % kNumBitsPerByte; } - void generate_unique_hash_fn() { - hash_fn_.reserve(hash_fn_count_); - const std::uint32_t predef_hash_fn_count = 128; - static const std::uint32_t predef_hash_fn[predef_hash_fn_count] = { - 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, - 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, - 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, - 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, - 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, - 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, - 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, - 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, - 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, - 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, - 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, - 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, - 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, - 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, - 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, - 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, - 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, - 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, - 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, - 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, - 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, - 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, - 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, - 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, - 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, - 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, - 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, - 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, - 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, - 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, - 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, - 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 - }; - if (hash_fn_count_ <= predef_hash_fn_count) { - std::copy(predef_hash_fn, predef_hash_fn + hash_fn_count_, hash_fn_.begin()); - for (std::uint32_t i = 0; i < hash_fn_.size(); ++i) { - hash_fn_[i] = hash_fn_[i] * hash_fn_[(i + 3) % hash_fn_count_] + static_cast<std::uint32_t>(random_seed_); + inline std::uint32_t hash_multiplicative( + const std::uint8_t *begin, + std::size_t remaining_length, + const std::uint64_t multiplier) const { + std::uint32_t hash = 0; + std::size_t bytes_hashed = 0; + if (remaining_length >= 4) { + while (bytes_hashed < remaining_length) { + auto val = *reinterpret_cast<const std::uint32_t *>(begin + bytes_hashed); + hash += (multiplier * val) >> 32; + bytes_hashed += 4; } - } else { - LOG(FATAL) << "Requested number of hash functions is too large."; } - } - - inline std::uint32_t hash_ap(const std::uint8_t *begin, std::size_t remaining_length, std::uint32_t hash) const { - const std::uint8_t *itr = begin; - std::uint32_t loop = 0; - while (remaining_length >= 8) { - const std::uint32_t &i1 = *(reinterpret_cast<const std::uint32_t*>(itr)); itr += sizeof(std::uint32_t); - const std::uint32_t &i2 = *(reinterpret_cast<const std::uint32_t*>(itr)); itr += sizeof(std::uint32_t); - hash ^= (hash << 7) ^ i1 * (hash >> 3) ^ (~((hash << 11) + (i2 ^ (hash >> 5)))); - remaining_length -= 8; - } - if (remaining_length) { - if (remaining_length >= 4) { - const std::uint32_t &i = *(reinterpret_cast<const std::uint32_t*>(itr)); - if (loop & 0x01) { - hash ^= (hash << 7) ^ i * (hash >> 3); - } else { - hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); - } - ++loop; - remaining_length -= 4; - itr += sizeof(std::uint32_t); - } - if (remaining_length >= 2) { - const std::uint16_t &i = *(reinterpret_cast<const std::uint16_t*>(itr)); - if (loop & 0x01) { - hash ^= (hash << 7) ^ i * (hash >> 3); - } else { - hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); - } - ++loop; - remaining_length -= 2; - itr += sizeof(std::uint16_t); - } - if (remaining_length) { - hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop; - } + while (bytes_hashed < remaining_length) { + std::uint8_t val = *(begin + bytes_hashed); + hash += (multiplier * val) >> 32; + bytes_hashed++; } return hash; } private: - const std::uint64_t random_seed_; - std::vector<std::uint32_t> hash_fn_; const std::uint32_t hash_fn_count_; std::uint64_t array_size_in_bytes_; std::uint64_t array_size_; @@ -397,9 +659,21 @@ class BloomFilter { std::uint32_t inserted_element_count_; const bool is_bit_array_owner_; + static constexpr std::uint64_t kKnuthGoldenRatioNumber = 2654435761; + static constexpr std::size_t kMaxNumHashFns = 8; + const std::uint64_t hash_fn_[kMaxNumHashFns] = { // hash_fn_[i] is 2**(i+1) - 1 + 0x00000001 * kKnuthGoldenRatioNumber, // 0x00000003, 0x00000007, 0x0000000f, + 0x0000001f * kKnuthGoldenRatioNumber, // 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff * kKnuthGoldenRatioNumber, // 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff * kKnuthGoldenRatioNumber, // 0x00003fff, 0x00007fff, 0x0000ffff, + 0x0001ffff * kKnuthGoldenRatioNumber, // 0x0003ffff, 0x0007ffff, 0x000fffff, + 0x001fffff * kKnuthGoldenRatioNumber, // 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff * kKnuthGoldenRatioNumber, // 0x03ffffff, 0x07ffffff, 0x0fffffff, + 0x1fffffff * kKnuthGoldenRatioNumber // 0x3fffffff, 0x7fffffff, 0xffffffff + }; alignas(kCacheLineBytes) mutable SpinSharedMutex<false> bloom_filter_insert_mutex_; - DISALLOW_COPY_AND_ASSIGN(BloomFilter); + DISALLOW_COPY_AND_ASSIGN(BloomFilterOriginal); }; /** @} */ http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/BloomFilter.proto ---------------------------------------------------------------------- diff --git a/utility/BloomFilter.proto b/utility/BloomFilter.proto index 8dd9163..b5d14a9 100644 --- a/utility/BloomFilter.proto +++ b/utility/BloomFilter.proto @@ -21,10 +21,8 @@ message BloomFilter { // The default values were determined from empirical experiments. // These values control the amount of false positivity that // is expected from Bloom Filter. - // - Default seed for initializing family of hashes = 0xA5A5A5A55A5A5A5A. // - Default bloom filter size = 10 KB. // - Default number of hash functions used in bloom filter = 5. - optional fixed64 bloom_filter_seed = 1 [default = 0xA5A5A5A55A5A5A5A]; - optional uint32 bloom_filter_size = 2 [default = 10000]; - optional uint32 number_of_hashes = 3 [default = 5]; + optional uint32 bloom_filter_size = 1 [default = 10000]; + optional uint32 number_of_hashes = 2 [default = 5]; } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/BloomFilterAdapter.hpp ---------------------------------------------------------------------- diff --git a/utility/BloomFilterAdapter.hpp b/utility/BloomFilterAdapter.hpp new file mode 100644 index 0000000..f094307 --- /dev/null +++ b/utility/BloomFilterAdapter.hpp @@ -0,0 +1,142 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#ifndef QUICKSTEP_UTILITY_BLOOM_FILTER_ADAPTER_HPP +#define QUICKSTEP_UTILITY_BLOOM_FILTER_ADAPTER_HPP + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "catalog/CatalogTypedefs.hpp" +#include "utility/BloomFilter.hpp" +#include "utility/Macros.hpp" + +#include "glog/logging.h" + +namespace quickstep { + +/** \addtogroup Utility + * @{ + */ + +class BloomFilterAdapter { + public: + BloomFilterAdapter(const std::vector<const BloomFilter*> &bloom_filters, + const std::vector<attribute_id> &attribute_ids, + const std::vector<std::size_t> &attr_sizes) { + DCHECK_EQ(bloom_filters.size(), attribute_ids.size()); + DCHECK_EQ(bloom_filters.size(), attr_sizes.size()); + + bloom_filter_entries_.reserve(bloom_filters.size()); + for (std::size_t i = 0; i < bloom_filters.size(); ++i) { + bloom_filter_entries_.emplace_back( + new BloomFilterEntry( + bloom_filters[i], attribute_ids[i], attr_sizes[i])); + } + } + + ~BloomFilterAdapter() { + for (auto &entry : bloom_filter_entries_) { + delete entry; + } + } + + template <bool adapt_filters, typename ValueAccessorT> + inline std::size_t bulkProbe(const ValueAccessorT *accessor, + std::vector<tuple_id> &batch, + const std::size_t batch_size) { + std::size_t out_size = batch_size; + for (auto &entry : bloom_filter_entries_) { + out_size = bulkProbeBloomFilterEntry<adapt_filters>(*entry, accessor, batch, out_size); + } + adaptEntryOrder(); + return out_size; + } + + private: + struct BloomFilterEntry { + BloomFilterEntry(const BloomFilter *in_bloom_filter, + const attribute_id &in_attribute_id, + const std::size_t &in_attribute_size) + : bloom_filter(in_bloom_filter), + attribute_id(in_attribute_id), + attribute_size(in_attribute_size), + miss(0), + cnt(0) { + } + + static bool isBetterThan(const BloomFilterEntry *a, + const BloomFilterEntry *b) { + return a->miss_rate > b->miss_rate; + } + + const BloomFilter *bloom_filter; + const attribute_id attribute_id; + const std::size_t attribute_size; + std::uint32_t miss; + std::uint32_t cnt; + float miss_rate; + }; + + template <bool adapt_filters, typename ValueAccessorT> + inline std::size_t bulkProbeBloomFilterEntry( + BloomFilterEntry &entry, + const ValueAccessorT *accessor, + std::vector<tuple_id> &batch, + const std::size_t in_size) { + std::size_t out_size = 0; + const BloomFilter *bloom_filter = entry.bloom_filter; + + for (std::size_t t = 0; t < in_size; ++t) { + const tuple_id tid = batch[t]; + const auto value = static_cast<const std::uint8_t*>( + accessor->getUntypedValueAtAbsolutePosition(entry.attribute_id, tid)); + if (bloom_filter->contains(value, entry.attribute_size)) { + batch[out_size] = tid; + ++out_size; + } + } + if (adapt_filters) { + entry.cnt += in_size; + entry.miss += (in_size - out_size); + } + return out_size; + } + + inline void adaptEntryOrder() { + for (auto &entry : bloom_filter_entries_) { + entry->miss_rate = static_cast<float>(entry->miss) / entry->cnt; + } + std::sort(bloom_filter_entries_.begin(), + bloom_filter_entries_.end(), + BloomFilterEntry::isBetterThan); + } + + std::vector<BloomFilterEntry *> bloom_filter_entries_; + + DISALLOW_COPY_AND_ASSIGN(BloomFilterAdapter); +}; + +/** @} */ + +} // namespace quickstep + +#endif // QUICKSTEP_UTILITY_BLOOM_FILTER_ADAPTER_HPP http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/utility/CMakeLists.txt b/utility/CMakeLists.txt index 2d3db8f..5883470 100644 --- a/utility/CMakeLists.txt +++ b/utility/CMakeLists.txt @@ -159,6 +159,7 @@ add_library(quickstep_utility_Alignment ../empty_src.cpp Alignment.hpp) add_library(quickstep_utility_BitManipulation ../empty_src.cpp BitManipulation.hpp) add_library(quickstep_utility_BitVector ../empty_src.cpp BitVector.hpp) add_library(quickstep_utility_BloomFilter ../empty_src.cpp BloomFilter.hpp) +add_library(quickstep_utility_BloomFilterAdapter ../empty_src.cpp BloomFilterAdapter.hpp) add_library(quickstep_utility_BloomFilter_proto ${quickstep_utility_BloomFilter_proto_srcs} ${quickstep_utility_BloomFilter_proto_hdrs}) @@ -166,6 +167,9 @@ add_library(quickstep_utility_CalculateInstalledMemory CalculateInstalledMemory. add_library(quickstep_utility_Cast ../empty_src.cpp Cast.hpp) add_library(quickstep_utility_CheckSnprintf ../empty_src.cpp CheckSnprintf.hpp) add_library(quickstep_utility_DAG ../empty_src.cpp DAG.hpp) +add_library(quickstep_utility_DAGVisualizer DAGVisualizer.cpp DAGVisualizer.hpp) +add_library(quickstep_utility_DisjointTreeForest ../empty_src.cpp DisjointTreeForest.hpp) +add_library(quickstep_utility_EventProfiler EventProfiler.cpp EventProfiler.hpp) add_library(quickstep_utility_EqualsAnyConstant ../empty_src.cpp EqualsAnyConstant.hpp) add_library(quickstep_utility_Glob Glob.cpp Glob.hpp) add_library(quickstep_utility_HashPair ../empty_src.cpp HashPair.hpp) @@ -216,6 +220,10 @@ target_link_libraries(quickstep_utility_BloomFilter quickstep_threading_SpinSharedMutex quickstep_utility_BloomFilter_proto quickstep_utility_Macros) +target_link_libraries(quickstep_utility_BloomFilterAdapter + quickstep_catalog_CatalogTypedefs + quickstep_utility_BloomFilter + quickstep_utility_Macros) target_link_libraries(quickstep_utility_BloomFilter_proto ${PROTOBUF_LIBRARY}) target_link_libraries(quickstep_utility_CalculateInstalledMemory @@ -225,6 +233,8 @@ target_link_libraries(quickstep_utility_CheckSnprintf target_link_libraries(quickstep_utility_DAG glog quickstep_utility_Macros) +target_link_libraries(quickstep_utility_EventProfiler + quickstep_threading_Mutex) target_link_libraries(quickstep_utility_Glob glog) target_link_libraries(quickstep_utility_MemStream @@ -243,6 +253,12 @@ target_link_libraries(quickstep_utility_PlanVisualizer quickstep_queryoptimizer_physical_TopLevelPlan quickstep_utility_Macros quickstep_utility_StringUtil) +target_link_libraries(quickstep_utility_DAGVisualizer + quickstep_queryoptimizer_QueryPlan + quickstep_utility_EventProfiler + quickstep_utility_Macros + quickstep_utility_StringUtil) +target_link_libraries(quickstep_utility_DisjointTreeForest) target_link_libraries(quickstep_utility_PtrList quickstep_utility_Macros) target_link_libraries(quickstep_utility_PtrMap @@ -297,11 +313,15 @@ target_link_libraries(quickstep_utility quickstep_utility_BitManipulation quickstep_utility_BitVector quickstep_utility_BloomFilter + quickstep_utility_BloomFilterAdapter quickstep_utility_BloomFilter_proto quickstep_utility_CalculateInstalledMemory quickstep_utility_Cast quickstep_utility_CheckSnprintf quickstep_utility_DAG + quickstep_utility_DAGVisualizer + quickstep_utility_DisjointTreeForest + quickstep_utility_EventProfiler quickstep_utility_EqualsAnyConstant quickstep_utility_Glob quickstep_utility_HashPair http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/DAGVisualizer.cpp ---------------------------------------------------------------------- diff --git a/utility/DAGVisualizer.cpp b/utility/DAGVisualizer.cpp new file mode 100644 index 0000000..e62f948 --- /dev/null +++ b/utility/DAGVisualizer.cpp @@ -0,0 +1,167 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#include "utility/DAGVisualizer.hpp" +#include "utility/EventProfiler.hpp" + +#include <cmath> +#include <cstddef> +#include <iomanip> +#include <memory> +#include <sstream> +#include <string> +#include <unordered_map> +#include <vector> + +#include "query_optimizer/QueryPlan.hpp" +#include "utility/EventProfiler.hpp" +#include "utility/StringUtil.hpp" + +#include "glog/logging.h" + +namespace quickstep { + +std::string DAGVisualizer::toDOT() { + std::set<std::string> no_display_op_names = + { "DestroyHashOperator", "DropTableOperator" }; + + const auto &dag = plan_.getQueryPlanDAG(); + const std::size_t num_nodes = dag.size(); + + std::vector<double> time_elapsed(num_nodes, 0); + std::vector<double> time_percentage(num_nodes, 0); + std::vector<double> time_start(num_nodes, std::numeric_limits<double>::max()); + std::vector<double> time_end(num_nodes, 0); + const auto &zero_time = relop_profiler.zero_time(); + for (const auto &container : relop_profiler.containers()) { + for (const auto &line : container.second.events) { + const std::size_t relop_index = line.first; + for (const auto &event : line.second) { + time_elapsed[relop_index] += + std::chrono::duration<double>(event.end_time - event.start_time).count(); + time_start[relop_index] = + std::min(time_start[relop_index], + std::chrono::duration<double>(event.start_time - zero_time).count()); + time_end[relop_index] = + std::max(time_end[relop_index], + std::chrono::duration<double>(event.end_time - zero_time).count()); + } + } + } + const std::size_t num_threads = relop_profiler.containers().size(); + double total_time_elapsed = 0; + double max_percentage = 0; + for (std::size_t i = 0; i < time_elapsed.size(); ++i) { + time_elapsed[i] /= num_threads; + total_time_elapsed += time_elapsed[i]; + } + for (std::size_t i = 0; i < time_elapsed.size(); ++i) { + time_percentage[i] = time_elapsed[i] / total_time_elapsed; + max_percentage = std::max(max_percentage, time_percentage[i]); + } + + std::vector<bool> display_ops(num_nodes, false); + for (std::size_t node_index = 0; node_index < num_nodes; ++node_index) { + const auto &node = dag.getNodePayload(node_index); + const std::string relop_name = node.getName(); + if (no_display_op_names.find(relop_name) == no_display_op_names.end()) { + display_ops[node_index] = true; + + nodes_.emplace_back(); + NodeInfo &node_info = nodes_.back(); + node_info.id = node_index; + + std::string hue = + std::to_string(std::sqrt(time_percentage[node_index] / max_percentage)); + node_info.color = hue + " " + hue + " 1.0"; + + node_info.labels.emplace_back( + "[" + std::to_string(node.getOperatorIndex()) + "] " + relop_name); + node_info.labels.emplace_back( + std::to_string(std::lround(time_elapsed[node_index] * 1000)) + + "ms (" + PercentageToString(time_percentage[node_index] * 100) + "%)"); + node_info.labels.emplace_back( + "span: [" + + std::to_string(std::lround(time_start[node_index] * 1000)) + "ms, " + + std::to_string(std::lround(time_end[node_index] * 1000)) + "ms]"); + } + } + for (std::size_t node_index = 0; node_index < num_nodes; ++node_index) { + if (display_ops[node_index]) { + for (const auto &link : dag.getDependents(node_index)) { + if (display_ops[link.first]) { + edges_.emplace_back(); + EdgeInfo &edge_info = edges_.back(); + edge_info.src_node_id = node_index; + edge_info.dst_node_id = link.first; + edge_info.is_pipeline_breaker = link.second; + } + } + } + } + + // Format output graph + std::ostringstream graph_oss; + graph_oss << "digraph g {\n"; + graph_oss << " rankdir=BT\n"; + graph_oss << " node [penwidth=2]\n"; + graph_oss << " edge [fontsize=16 fontcolor=gray penwidth=2]\n\n"; + + // Format nodes + for (const NodeInfo &node_info : nodes_) { + graph_oss << " " << node_info.id << " [ "; + if (!node_info.labels.empty()) { + graph_oss << "label=\"" + << EscapeSpecialChars(JoinToString(node_info.labels, " ")) + << "\" "; + } + if (!node_info.color.empty()) { + graph_oss << "style=filled fillcolor=\"" << node_info.color << "\" "; + } + graph_oss << "]\n"; + } + graph_oss << "\n"; + + // Format edges + for (const EdgeInfo &edge_info : edges_) { + graph_oss << " " << edge_info.src_node_id << " -> " + << edge_info.dst_node_id << " [ "; + if (edge_info.is_pipeline_breaker) { + graph_oss << "style=dashed "; + } + if (!edge_info.labels.empty()) { + graph_oss << "label=\"" + << EscapeSpecialChars(JoinToString(edge_info.labels, " ")) + << "\" "; + } + graph_oss << "]\n"; + } + + graph_oss << "}\n"; + + return graph_oss.str(); +} + +std::string DAGVisualizer::PercentageToString(double percentage) { + std::ostringstream oss; + oss << static_cast<std::uint32_t>(percentage) << "."; + int digits = std::lround(percentage * 10000) % 100; + oss << digits / 10 << digits % 10; + return oss.str(); +} + +} // namespace quickstep http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/DAGVisualizer.hpp ---------------------------------------------------------------------- diff --git a/utility/DAGVisualizer.hpp b/utility/DAGVisualizer.hpp new file mode 100644 index 0000000..5c81d22 --- /dev/null +++ b/utility/DAGVisualizer.hpp @@ -0,0 +1,85 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#ifndef QUICKSTEP_UTILITY_DAG_VISUALIZER_HPP_ +#define QUICKSTEP_UTILITY_DAG_VISUALIZER_HPP_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "utility/Macros.hpp" + +namespace quickstep { + +class QueryPlan; + +/** \addtogroup Utility + * @{ + */ + +/** + * @brief A visualizer that converts an execution plan DAG into a graph in + * DOT format. Note that DOT is a plain text graph description language. + * + * @note This utility tool can be further extended to be more generic. + */ +class DAGVisualizer { + public: + DAGVisualizer(const QueryPlan &plan) + : plan_(plan) {} + + ~DAGVisualizer() {} + + std::string toDOT(); + + private: + static std::string PercentageToString(double percentage); + + /** + * @brief Information of a graph node. + */ + struct NodeInfo { + std::size_t id; + std::vector<std::string> labels; + std::string color; + }; + + /** + * @brief Information of a graph edge. + */ + struct EdgeInfo { + std::size_t src_node_id; + std::size_t dst_node_id; + std::vector<std::string> labels; + bool is_pipeline_breaker; + }; + + const QueryPlan &plan_; + + std::vector<NodeInfo> nodes_; + std::vector<EdgeInfo> edges_; + + DISALLOW_COPY_AND_ASSIGN(DAGVisualizer); +}; + +/** @} */ + +} // namespace quickstep + +#endif /* QUICKSTEP_UTILITY_DAG_VISUALIZER_HPP_ */ http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/DisjointTreeForest.hpp ---------------------------------------------------------------------- diff --git a/utility/DisjointTreeForest.hpp b/utility/DisjointTreeForest.hpp new file mode 100644 index 0000000..f5722ba --- /dev/null +++ b/utility/DisjointTreeForest.hpp @@ -0,0 +1,116 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#ifndef QUICKSTEP_UTILITY_DISJOINT_TREE_FOREST_HPP_ +#define QUICKSTEP_UTILITY_DISJOINT_TREE_FOREST_HPP_ + +#include <cstddef> +#include <limits> +#include <utility> +#include <unordered_map> + +namespace quickstep { + +/** \addtogroup Utility + * @{ + */ + +/** + * @brief A.k.a. union-find set. + */ +template <typename ElementT, + class MapperT = std::unordered_map<ElementT, std::size_t>> +class DisjointTreeForest { + public: + inline bool hasElement(const ElementT &element) const { + return elements_map_.find(element) != elements_map_.end(); + } + + inline void makeSet(const ElementT &element) { + if (!hasElement(element)) { + std::size_t loc = nodes_.size(); + nodes_.emplace_back(0, loc); + elements_map_.emplace(element, loc); + } + } + + inline std::size_t find(const ElementT &element) { + const std::size_t node_id = elements_map_.at(element); + std::size_t root_id = node_id; + std::size_t parent_id; + while ((parent_id = nodes_[root_id].parent) != root_id) { + root_id = parent_id; + } + compress_path(node_id, root_id); + return root_id; + } + + inline void merge(const ElementT &element1, const ElementT &element2) { + std::size_t root_id1 = find(element1); + std::size_t root_id2 = find(element2); + if (root_id1 != root_id2) { + Node &n1 = nodes_[root_id1]; + Node &n2 = nodes_[root_id2]; + if (n1.rank > n2.rank) { + n2.parent = root_id1; + } else if (n1.rank < n2.rank) { + n1.parent = root_id2; + } else { + n1.parent = root_id2; + n2.rank += 1; + } + } + } + + inline bool isConnected(const ElementT &element1, const ElementT &element2) { + return find(element1) == find(element2); + } + + private: + struct Node { + Node(const std::size_t rank_in, const std::size_t parent_in) + : rank(rank_in), parent(parent_in) { + } + std::size_t rank; + std::size_t parent; + }; + + inline void compress_path(const std::size_t leaf_node_id, const std::size_t root_node_id) { + std::size_t node_id = leaf_node_id; + std::size_t max_rank = 0; + while (node_id != root_node_id) { + const Node &node = nodes_[node_id]; + max_rank = std::max(max_rank, node.rank); + + const std::size_t parent_id = node.parent; + nodes_[node_id].parent = root_node_id; + node_id = parent_id; + } + nodes_[root_node_id].rank = max_rank + 1; + } + + std::vector<Node> nodes_; + MapperT elements_map_; + + static constexpr std::size_t kInvalid = std::numeric_limits<std::size_t>::max(); +}; + +/** @} */ + +} // namespace quickstep + +#endif // QUICKSTEP_UTILITY_DISJOINT_TREE_FOREST_HPP_ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/97d8dca8/utility/EventProfiler.cpp ---------------------------------------------------------------------- diff --git a/utility/EventProfiler.cpp b/utility/EventProfiler.cpp new file mode 100644 index 0000000..728ebff --- /dev/null +++ b/utility/EventProfiler.cpp @@ -0,0 +1,29 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#include "utility/EventProfiler.hpp" + +#include <cstddef> +#include <string> +#include <vector> + +namespace quickstep { + +EventProfiler<int, std::size_t> simple_profiler; +EventProfiler<std::size_t> relop_profiler; + +} // namespace quickstep