Repository: incubator-quickstep Updated Branches: refs/heads/adaptive-bloom-filters 319d557bf -> 7d868b0dd
Updates Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/9cc47e56 Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/9cc47e56 Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/9cc47e56 Branch: refs/heads/adaptive-bloom-filters Commit: 9cc47e561d893a108fc2ad686413e6b251de64cb Parents: 319d557 Author: Jianqiao Zhu <jianq...@cs.wisc.edu> Authored: Thu Jul 28 17:29:43 2016 -0400 Committer: Jianqiao Zhu <jianq...@cs.wisc.edu> Committed: Thu Jul 28 17:29:43 2016 -0400 ---------------------------------------------------------------------- cli/QuickstepCli.cpp | 18 +++ query_optimizer/ExecutionGenerator.cpp | 76 +++------- query_optimizer/ExecutionHeuristics.cpp | 128 ++++++++--------- query_optimizer/ExecutionHeuristics.hpp | 28 ++-- query_optimizer/PhysicalGenerator.cpp | 1 - query_optimizer/physical/HashJoin.cpp | 18 +++ query_optimizer/physical/HashJoin.hpp | 70 ++++++++- query_optimizer/rules/AttachBloomFilters.cpp | 143 +++++++++++++++---- query_optimizer/rules/AttachBloomFilters.hpp | 16 ++- .../StarSchemaHashJoinOrderOptimization.cpp | 32 ++++- .../StarSchemaHashJoinOrderOptimization.hpp | 14 +- storage/HashTable.hpp | 35 ++--- storage/HashTable.proto | 10 +- storage/HashTableFactory.hpp | 16 +-- utility/BloomFilterAdapter.hpp | 34 ++--- 15 files changed, 403 insertions(+), 236 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/cli/QuickstepCli.cpp ---------------------------------------------------------------------- diff --git a/cli/QuickstepCli.cpp b/cli/QuickstepCli.cpp index ddabf31..8031dd3 100644 --- a/cli/QuickstepCli.cpp +++ b/cli/QuickstepCli.cpp @@ -219,6 +219,23 @@ void addPrimaryKeyInfoForTPCHTables(quickstep::CatalogDatabase *database) { } } +void addPrimaryKeyInfoForSSBTables(quickstep::CatalogDatabase *database) { + const std::vector<std::pair<std::string, std::vector<std::string>>> rel_pkeys = { + { "supplier", { "s_suppkey" } }, + { "customer", { "c_custkey" } }, + { "part", { "p_partkey" } }, + { "ddate", { "d_datekey" } } + }; + for (const auto &rel_pair : rel_pkeys) { + CatalogRelation *rel = database->getRelationByNameMutable(rel_pair.first); + std::vector<quickstep::attribute_id> attrs; + for (const auto &pkey : rel_pair.second) { + attrs.emplace_back(rel->getAttributeByName(pkey)->getID()); + } + rel->getConstraintsMutable()->setPrimaryKey(attrs); + } +} + int main(int argc, char* argv[]) { google::InitGoogleLogging(argv[0]); gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -327,6 +344,7 @@ int main(int argc, char* argv[]) { } // addPrimaryKeyInfoForTPCHTables(query_processor->getDefaultDatabase()); +// addPrimaryKeyInfoForSSBTables(query_processor->getDefaultDatabase()); // std::string proto_str; // google::protobuf::TextFormat::PrintToString( // query_processor->getDefaultDatabase()->getProto(), &proto_str); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/ExecutionGenerator.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/ExecutionGenerator.cpp b/query_optimizer/ExecutionGenerator.cpp index f84c371..fe6b6e7 100644 --- a/query_optimizer/ExecutionGenerator.cpp +++ b/query_optimizer/ExecutionGenerator.cpp @@ -599,8 +599,10 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { std::vector<attribute_id> probe_attribute_ids; std::vector<attribute_id> build_attribute_ids; - std::vector<attribute_id> probe_original_attribute_ids; - std::vector<attribute_id> build_original_attribute_ids; + const P::BloomFilterConfig &bloom_filter_config = + physical_plan->bloom_filter_config(); + std::vector<attribute_id> probe_side_bloom_filter_attribute_ids; + std::vector<attribute_id> build_side_bloom_filter_attribute_ids; const CatalogRelation *referenced_stored_probe_relation = nullptr; const CatalogRelation *referenced_stored_build_relation = nullptr; @@ -613,18 +615,6 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { const std::vector<E::AttributeReferencePtr> &left_join_attributes = physical_plan->left_join_attributes(); for (const E::AttributeReferencePtr &left_join_attribute : left_join_attributes) { - // Try to determine the original stored relation referenced in the Hash Join. - referenced_stored_probe_relation = - optimizer_context_->catalog_database()->getRelationByName(left_join_attribute->relation_name()); - if (referenced_stored_probe_relation == nullptr) { - // Hash Join optimizations are not possible, if the referenced relation cannot be determined. - skip_hash_join_optimization = true; - } else { - const attribute_id probe_operator_attribute_id = - referenced_stored_probe_relation->getAttributeByName(left_join_attribute->attribute_name())->getID(); - probe_original_attribute_ids.emplace_back(probe_operator_attribute_id); - } - const CatalogAttribute *probe_catalog_attribute = attribute_substitution_map_[left_join_attribute->id()]; probe_attribute_ids.emplace_back(probe_catalog_attribute->getID()); @@ -637,18 +627,6 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { const std::vector<E::AttributeReferencePtr> &right_join_attributes = physical_plan->right_join_attributes(); for (const E::AttributeReferencePtr &right_join_attribute : right_join_attributes) { - // Try to determine the original stored relation referenced in the Hash Join. - referenced_stored_build_relation = - optimizer_context_->catalog_database()->getRelationByName(right_join_attribute->relation_name()); - if (referenced_stored_build_relation == nullptr) { - // Hash Join optimizations are not possible, if the referenced relation cannot be determined. - skip_hash_join_optimization = true; - } else { - const attribute_id build_operator_attribute_id = - referenced_stored_build_relation->getAttributeByName(right_join_attribute->attribute_name())->getID(); - build_original_attribute_ids.emplace_back(build_operator_attribute_id); - } - const CatalogAttribute *build_catalog_attribute = attribute_substitution_map_[right_join_attribute->id()]; build_attribute_ids.emplace_back(build_catalog_attribute->getID()); @@ -658,6 +636,20 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { } } + for (const auto &bf : bloom_filter_config.probe_side_bloom_filters) { + const CatalogAttribute *probe_bf_catalog_attribute + = attribute_substitution_map_[bf.attribute->id()]; + probe_side_bloom_filter_attribute_ids.emplace_back( + probe_bf_catalog_attribute->getID()); + } + + for (const auto &bf : bloom_filter_config.build_side_bloom_filters) { + const CatalogAttribute *build_bf_catalog_attribute + = attribute_substitution_map_[bf.attribute->id()]; + build_side_bloom_filter_attribute_ids.emplace_back( + build_bf_catalog_attribute->getID()); + } + // Remember key types for call to SimplifyHashTableImplTypeProto() below. std::vector<const Type*> key_types; for (std::vector<E::AttributeReferencePtr>::size_type attr_idx = 0; @@ -672,34 +664,7 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { key_types.push_back(&left_attribute_type); } - std::size_t probe_cardinality = cost_model_->estimateCardinality(probe_physical); std::size_t build_cardinality = cost_model_->estimateCardinality(build_physical); - // For inner join, we may swap the probe table and the build table. - if (physical_plan->join_type() == P::HashJoin::JoinType::kInnerJoin) { - const bool left_unique_join_attrs = - probe_physical->impliesUniqueAttributes(left_join_attributes); - const bool right_unique_join_attrs = - build_physical->impliesUniqueAttributes(right_join_attributes); - - bool swap_probe_build; - if (left_unique_join_attrs != right_unique_join_attrs) { - swap_probe_build = left_unique_join_attrs; - } else { - // Choose the smaller table as the inner build table, - // and the other one as the outer probe table. - swap_probe_build = (probe_cardinality < build_cardinality); - } - - if (swap_probe_build) { - // Switch the probe and build physical nodes. - std::swap(probe_physical, build_physical); - std::swap(probe_cardinality, build_cardinality); - std::swap(probe_attribute_ids, build_attribute_ids); - std::swap(any_probe_attributes_nullable, any_build_attributes_nullable); - std::swap(probe_original_attribute_ids, build_original_attribute_ids); - std::swap(referenced_stored_probe_relation, referenced_stored_build_relation); - } - } // Convert the residual predicate proto. QueryContext::predicate_id residual_predicate_index = QueryContext::kInvalidPredicateId; @@ -861,8 +826,9 @@ void ExecutionGenerator::convertHashJoin(const P::HashJoinPtr &physical_plan) { join_operator_index, referenced_stored_build_relation, referenced_stored_probe_relation, - std::move(build_original_attribute_ids), - std::move(probe_original_attribute_ids), + bloom_filter_config, + std::move(build_side_bloom_filter_attribute_ids), + std::move(probe_side_bloom_filter_attribute_ids), join_hash_table_index, star_schema_cost_model_->estimateCardinality(build_physical)); } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/ExecutionHeuristics.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/ExecutionHeuristics.cpp b/query_optimizer/ExecutionHeuristics.cpp index dec078c..b407453 100644 --- a/query_optimizer/ExecutionHeuristics.cpp +++ b/query_optimizer/ExecutionHeuristics.cpp @@ -25,6 +25,8 @@ #include "catalog/CatalogTypedefs.hpp" #include "query_execution/QueryContext.pb.h" #include "query_optimizer/QueryPlan.hpp" +#include "query_optimizer/physical/Physical.hpp" +#include "query_optimizer/physical/HashJoin.hpp" #include "utility/Macros.hpp" #include "glog/logging.h" @@ -32,91 +34,79 @@ namespace quickstep { namespace optimizer { +namespace E = ::quickstep::optimizer::expressions; +namespace P = ::quickstep::optimizer::physical; + static const std::size_t kNumBitsPerByte = 8; DEFINE_double(bloom_num_bits_per_tuple, kNumBitsPerByte, "Number of bits per tuple used to size the Bloom filter."); -DEFINE_int32(bloom_num_hash_fns, 3, +DEFINE_int32(bloom_num_hash_fns, 1, "Number of hash functions used in the Bloom filter."); void ExecutionHeuristics::optimizeExecutionPlan(QueryPlan *query_plan, serialization::QueryContext *query_context_proto) { - // Currently this only optimizes left deep joins using bloom filters. - // It uses a simple algorithm to discover the left deep joins. - // It starts with the first hash join in the plan and keeps on iterating - // over the next hash joins, till a probe on a different relation id is found. - // The set of hash joins found in this way forms a chain and can be recognized - // as a left deep join. It becomes a candidate for optimization. - - // The optimization is done by modifying each of the build operators in the chain - // to generate a bloom filter on the build key during their hash table creation. - // The leaf-level probe operator is then modified to query all the bloom - // filters generated from all the build operators in the chain. These - // bloom filters are queried to test the membership of the probe key - // just prior to probing the hash table. - - QueryPlan::DAGNodeIndex origin_node = 0; - while (origin_node < hash_joins_.size() - 1) { - std::vector<std::size_t> chained_nodes; - chained_nodes.push_back(origin_node); - for (std::size_t i = origin_node + 1; i < hash_joins_.size(); ++i) { - const relation_id checked_relation_id = hash_joins_[origin_node].referenced_stored_probe_relation_->getID(); - const relation_id expected_relation_id = hash_joins_[i].referenced_stored_probe_relation_->getID(); - if (checked_relation_id == expected_relation_id) { - chained_nodes.push_back(i); - } else { - break; - } + std::map<std::pair<E::ExprId, P::PhysicalPtr>, + std::pair<QueryContext::bloom_filter_id, QueryPlan::DAGNodeIndex>> bloom_filter_map; + for (const auto &info : hash_joins_) { + auto *hash_table_proto = + query_context_proto->mutable_join_hash_tables(info.join_hash_table_id_); + const auto &bloom_filter_config = info.bloom_filter_config_; + + for (std::size_t i = 0; i < info.build_side_bloom_filter_ids_.size(); ++i) { + const QueryContext::bloom_filter_id bloom_filter_id = query_context_proto->bloom_filters_size(); + serialization::BloomFilter *bloom_filter_proto = query_context_proto->add_bloom_filters(); + setBloomFilterProperties(bloom_filter_proto, info.estimated_build_relation_cardinality_); + + const auto &build_side_bf = + bloom_filter_config.build_side_bloom_filters[i]; + bloom_filter_map.emplace( + std::make_pair(build_side_bf.attribute->id(), + bloom_filter_config.builder), + std::make_pair(bloom_filter_id, info.build_operator_index_)); + + auto *build_side_bloom_filter = hash_table_proto->add_build_side_bloom_filters(); + build_side_bloom_filter->set_bloom_filter_id(bloom_filter_id); + build_side_bloom_filter->set_attr_id(info.build_side_bloom_filter_ids_[i]); + + std::cout << "Build " << build_side_bf.attribute->toString() + << " @" << bloom_filter_config.builder << "\n"; } + } - // Only chains of length greater than one are suitable candidates for semi-join optimization. - if (chained_nodes.size() > 1) { - std::unordered_map<QueryContext::bloom_filter_id, std::vector<attribute_id>> probe_bloom_filter_info; - for (const std::size_t node : chained_nodes) { - // Provision for a new bloom filter to be used by the build operator. - const QueryContext::bloom_filter_id bloom_filter_id = query_context_proto->bloom_filters_size(); - serialization::BloomFilter *bloom_filter_proto = query_context_proto->add_bloom_filters(); - - // Modify the bloom filter properties based on the statistics of the relation. - setBloomFilterProperties(bloom_filter_proto, hash_joins_[node].referenced_stored_build_relation_); - - // Add build-side bloom filter information to the corresponding hash table proto. - query_context_proto->mutable_join_hash_tables(hash_joins_[node].join_hash_table_id_) - ->add_build_side_bloom_filter_id(bloom_filter_id); - - probe_bloom_filter_info.insert(std::make_pair(bloom_filter_id, hash_joins_[node].probe_attributes_)); - } - - // Add probe-side bloom filter information to the corresponding hash table proto for each build-side bloom filter. - for (const std::pair<QueryContext::bloom_filter_id, std::vector<attribute_id>> - &bloom_filter_info : probe_bloom_filter_info) { - auto *probe_side_bloom_filter = - query_context_proto->mutable_join_hash_tables(hash_joins_[origin_node].join_hash_table_id_) - ->add_probe_side_bloom_filters(); - probe_side_bloom_filter->set_probe_side_bloom_filter_id(bloom_filter_info.first); - for (const attribute_id &probe_attribute_id : bloom_filter_info.second) { - probe_side_bloom_filter->add_probe_side_attr_ids(probe_attribute_id); - } - } - - // Add node dependencies from chained build nodes to origin node probe. - for (std::size_t i = 1; i < chained_nodes.size(); ++i) { // Note: It starts from index 1. - query_plan->addDirectDependency(hash_joins_[origin_node].join_operator_index_, - hash_joins_[origin_node + i].build_operator_index_, - true /* is_pipeline_breaker */); - } + for (const auto &info : hash_joins_) { + auto *hash_table_proto = + query_context_proto->mutable_join_hash_tables(info.join_hash_table_id_); + const auto &bloom_filter_config = info.bloom_filter_config_; + + for (std::size_t i = 0; i < info.probe_side_bloom_filter_ids_.size(); ++i) { + auto *probe_side_bloom_filter = hash_table_proto->add_probe_side_bloom_filters(); + const auto &probe_side_bf = + bloom_filter_config.probe_side_bloom_filters[i]; + std::cout << "Probe " << probe_side_bf.attribute->toString() + << " @" << probe_side_bf.builder << "\n"; + + const auto &build_side_info = + bloom_filter_map.at( + std::make_pair(probe_side_bf.source_attribute->id(), + probe_side_bf.builder)); + probe_side_bloom_filter->set_bloom_filter_id(build_side_info.first); + probe_side_bloom_filter->set_attr_id(info.probe_side_bloom_filter_ids_[i]); + std::cout << "Probe attr_id = " << info.probe_side_bloom_filter_ids_[i] << "\n"; + + query_plan->addDirectDependency(info.join_operator_index_, + build_side_info.second, + true /* is_pipeline_breaker */); } - - // Update the origin node. - origin_node = chained_nodes.back() + 1; } } void ExecutionHeuristics::setBloomFilterProperties(serialization::BloomFilter *bloom_filter_proto, - const CatalogRelation *relation) { - const std::size_t cardinality = relation->getStatistics().getNumTuples(); + const std::size_t cardinality) { bloom_filter_proto->set_bloom_filter_size( - static_cast<std::size_t>(FLAGS_bloom_num_bits_per_tuple * cardinality) / kNumBitsPerByte); + BloomFilter::getNearestAllowedSize( + (FLAGS_bloom_num_bits_per_tuple * cardinality) / kNumBitsPerByte)); + std::cout << "bf size = " << bloom_filter_proto->bloom_filter_size() << "\n"; bloom_filter_proto->set_number_of_hashes(FLAGS_bloom_num_hash_fns); } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/ExecutionHeuristics.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/ExecutionHeuristics.hpp b/query_optimizer/ExecutionHeuristics.hpp index 2106a1b..8af1b4a 100644 --- a/query_optimizer/ExecutionHeuristics.hpp +++ b/query_optimizer/ExecutionHeuristics.hpp @@ -25,6 +25,7 @@ #include "query_execution/QueryContext.hpp" #include "query_execution/QueryContext.pb.h" #include "query_optimizer/QueryPlan.hpp" +#include "query_optimizer/physical/HashJoin.hpp" #include "utility/Macros.hpp" #include "glog/logging.h" @@ -65,16 +66,18 @@ class ExecutionHeuristics { const QueryPlan::DAGNodeIndex join_operator_index, const CatalogRelation *referenced_stored_build_relation, const CatalogRelation *referenced_stored_probe_relation, - std::vector<attribute_id> &&build_attributes, - std::vector<attribute_id> &&probe_attributes, + const physical::BloomFilterConfig &bloom_filter_config, + std::vector<attribute_id> &&build_side_bloom_filter_ids, + std::vector<attribute_id> &&probe_side_bloom_filter_ids, const QueryContext::join_hash_table_id join_hash_table_id, const std::size_t estimated_build_relation_cardinality) : build_operator_index_(build_operator_index), join_operator_index_(join_operator_index), referenced_stored_build_relation_(referenced_stored_build_relation), referenced_stored_probe_relation_(referenced_stored_probe_relation), - build_attributes_(std::move(build_attributes)), - probe_attributes_(std::move(probe_attributes)), + bloom_filter_config_(bloom_filter_config), + build_side_bloom_filter_ids_(std::move(build_side_bloom_filter_ids)), + probe_side_bloom_filter_ids_(std::move(probe_side_bloom_filter_ids)), join_hash_table_id_(join_hash_table_id), estimated_build_relation_cardinality_(estimated_build_relation_cardinality) { } @@ -83,8 +86,9 @@ class ExecutionHeuristics { const QueryPlan::DAGNodeIndex join_operator_index_; const CatalogRelation *referenced_stored_build_relation_; const CatalogRelation *referenced_stored_probe_relation_; - const std::vector<attribute_id> build_attributes_; - const std::vector<attribute_id> probe_attributes_; + const physical::BloomFilterConfig &bloom_filter_config_; + const std::vector<attribute_id> build_side_bloom_filter_ids_; + const std::vector<attribute_id> probe_side_bloom_filter_ids_; const QueryContext::join_hash_table_id join_hash_table_id_; const std::size_t estimated_build_relation_cardinality_; }; @@ -112,16 +116,18 @@ class ExecutionHeuristics { const QueryPlan::DAGNodeIndex join_operator_index, const CatalogRelation *referenced_stored_build_relation, const CatalogRelation *referenced_stored_probe_relation, - std::vector<attribute_id> &&build_attributes, - std::vector<attribute_id> &&probe_attributes, + const physical::BloomFilterConfig &bloom_filter_config, + std::vector<attribute_id> &&build_side_bloom_filter_ids, + std::vector<attribute_id> &&probe_side_bloom_filter_ids, const QueryContext::join_hash_table_id join_hash_table_id, const std::size_t estimated_build_relation_cardinality) { hash_joins_.push_back(HashJoinInfo(build_operator_index, join_operator_index, referenced_stored_build_relation, referenced_stored_probe_relation, - std::move(build_attributes), - std::move(probe_attributes), + bloom_filter_config, + std::move(build_side_bloom_filter_ids), + std::move(probe_side_bloom_filter_ids), join_hash_table_id, estimated_build_relation_cardinality)); } @@ -144,7 +150,7 @@ class ExecutionHeuristics { * @param relation The catalog relation on which bloom filter is being built. **/ void setBloomFilterProperties(serialization::BloomFilter *bloom_filter_proto, - const CatalogRelation *relation); + const std::size_t cardinality); std::size_t estimated_build_relation_cardinality() const { return estimated_build_relation_cardinality_; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/PhysicalGenerator.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/PhysicalGenerator.cpp b/query_optimizer/PhysicalGenerator.cpp index 50e1d86..f73a546 100644 --- a/query_optimizer/PhysicalGenerator.cpp +++ b/query_optimizer/PhysicalGenerator.cpp @@ -114,7 +114,6 @@ P::PhysicalPtr PhysicalGenerator::optimizePlan() { quickstep::PlanVisualizer plan_visualizer; std::cerr << "\n" << plan_visualizer.visualize(physical_plan_) << "\n"; } - exit(0); #ifdef QUICKSTEP_DEBUG Validate(physical_plan_); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/physical/HashJoin.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/physical/HashJoin.cpp b/query_optimizer/physical/HashJoin.cpp index 7bea86a..f0e72e8 100644 --- a/query_optimizer/physical/HashJoin.cpp +++ b/query_optimizer/physical/HashJoin.cpp @@ -115,6 +115,24 @@ void HashJoin::getFieldStringItems( container_child_fields->push_back(CastSharedPtrVector<OptimizerTreeBase>(left_join_attributes_)); container_child_field_names->push_back("right_join_attributes"); container_child_fields->push_back(CastSharedPtrVector<OptimizerTreeBase>(right_join_attributes_)); + + if (!bloom_filter_config_.build_side_bloom_filters.empty()) { + container_child_field_names->push_back("build_side_bloom_filters"); + container_child_fields->emplace_back(); + auto &container = container_child_fields->back(); + for (const auto& bf : bloom_filter_config_.build_side_bloom_filters) { + container.emplace_back(bf.attribute); + } + } + + if (!bloom_filter_config_.probe_side_bloom_filters.empty()) { + container_child_field_names->push_back("probe_side_bloom_filters"); + container_child_fields->emplace_back(); + auto &container = container_child_fields->back(); + for (const auto& bf : bloom_filter_config_.probe_side_bloom_filters) { + container.emplace_back(bf.attribute); + } + } } } // namespace physical http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/physical/HashJoin.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/physical/HashJoin.hpp b/query_optimizer/physical/HashJoin.hpp index 55c9588..cacb08b 100644 --- a/query_optimizer/physical/HashJoin.hpp +++ b/query_optimizer/physical/HashJoin.hpp @@ -48,6 +48,56 @@ namespace physical { class HashJoin; typedef std::shared_ptr<const HashJoin> HashJoinPtr; +struct BloomFilterConfig { + struct BuildSide { + BuildSide(const expressions::AttributeReferencePtr &attribute_in) + : attribute(attribute_in) { + } + expressions::AttributeReferencePtr attribute; + }; + struct ProbeSide { + ProbeSide(const expressions::AttributeReferencePtr &attribute_in, + const expressions::AttributeReferencePtr &source_attribute_in, + const physical::PhysicalPtr &builder_in) + : attribute(attribute_in), + source_attribute(source_attribute_in), + builder(builder_in) { + } + expressions::AttributeReferencePtr attribute; + expressions::AttributeReferencePtr source_attribute; + PhysicalPtr builder; + }; + BloomFilterConfig() {} + BloomFilterConfig(const PhysicalPtr &builder_in) + : builder(builder_in) { + } + BloomFilterConfig(const PhysicalPtr &builder_in, + const std::vector<BuildSide> &build_side_bloom_filters_in, + const std::vector<ProbeSide> &probe_side_bloom_filters_in) + : builder(builder_in), + build_side_bloom_filters(build_side_bloom_filters_in), + probe_side_bloom_filters(probe_side_bloom_filters_in) { + } + void addBuildSideBloomFilter(const expressions::AttributeReferencePtr &attribute_in) { + for (const auto &build_bf : build_side_bloom_filters) { + if (attribute_in == build_bf.attribute) { + return; + } + } + build_side_bloom_filters.emplace_back(attribute_in); + } + void addProbeSideBloomFilter(const expressions::AttributeReferencePtr &attribute_in, + const expressions::AttributeReferencePtr &source_attribute_in, + const physical::PhysicalPtr &builder_in) { + probe_side_bloom_filters.emplace_back(attribute_in, + source_attribute_in, + builder_in); + } + PhysicalPtr builder; + std::vector<BuildSide> build_side_bloom_filters; + std::vector<ProbeSide> probe_side_bloom_filters; +}; + /** * @brief Physical hash join node. */ @@ -115,7 +165,8 @@ class HashJoin : public BinaryJoin { right_join_attributes_, residual_predicate_, project_expressions(), - join_type_); + join_type_, + bloom_filter_config_); } std::vector<expressions::AttributeReferencePtr> getReferencedAttributes() const override; @@ -127,6 +178,10 @@ class HashJoin : public BinaryJoin { bool impliesUniqueAttributes( const std::vector<expressions::AttributeReferencePtr> &attributes) const override; + const BloomFilterConfig &bloom_filter_config() const { + return bloom_filter_config_; + } + /** * @brief Creates a physical HashJoin. The left/right operand does not correspond to * probe/build operand. @@ -147,7 +202,8 @@ class HashJoin : public BinaryJoin { const std::vector<expressions::AttributeReferencePtr> &right_join_attributes, const expressions::PredicatePtr &residual_predicate, const std::vector<expressions::NamedExpressionPtr> &project_expressions, - const JoinType join_type) { + const JoinType join_type, + const BloomFilterConfig bloom_filter_config = BloomFilterConfig()) { return HashJoinPtr( new HashJoin(left, right, @@ -155,7 +211,8 @@ class HashJoin : public BinaryJoin { right_join_attributes, residual_predicate, project_expressions, - join_type)); + join_type, + bloom_filter_config)); } protected: @@ -175,18 +232,21 @@ class HashJoin : public BinaryJoin { const std::vector<expressions::AttributeReferencePtr> &right_join_attributes, const expressions::PredicatePtr &residual_predicate, const std::vector<expressions::NamedExpressionPtr> &project_expressions, - const JoinType join_type) + const JoinType join_type, + const BloomFilterConfig &bloom_filter_config) : BinaryJoin(left, right, project_expressions), left_join_attributes_(left_join_attributes), right_join_attributes_(right_join_attributes), residual_predicate_(residual_predicate), - join_type_(join_type) { + join_type_(join_type), + bloom_filter_config_(bloom_filter_config) { } std::vector<expressions::AttributeReferencePtr> left_join_attributes_; std::vector<expressions::AttributeReferencePtr> right_join_attributes_; expressions::PredicatePtr residual_predicate_; JoinType join_type_; + BloomFilterConfig bloom_filter_config_; DISALLOW_COPY_AND_ASSIGN(HashJoin); }; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/rules/AttachBloomFilters.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/AttachBloomFilters.cpp b/query_optimizer/rules/AttachBloomFilters.cpp index b3cf0da..e3bdc36 100644 --- a/query_optimizer/rules/AttachBloomFilters.cpp +++ b/query_optimizer/rules/AttachBloomFilters.cpp @@ -70,16 +70,29 @@ P::PhysicalPtr AttachBloomFilters::apply(const P::PhysicalPtr &input) { std::cerr << "********\n"; } - return input; + return visitAndAttach(input); } -void AttachBloomFilters::visitProducer(const P::PhysicalPtr &node, int depth) { +void AttachBloomFilters::visitProducer(const P::PhysicalPtr &node, const int depth) { for (const P::PhysicalPtr &child : node->children()) { visitProducer(child, depth+1); } std::vector<BloomFilterInfo> bloom_filters; + if (node->getPhysicalType() == P::PhysicalType::kHashJoin) { + const P::HashJoinPtr &hash_join = + std::static_pointer_cast<const P::HashJoin>(node); + const P::PhysicalPtr &build_node = hash_join->right(); + double selectivity = cost_model_->estimateSelectivity(build_node); + if (selectivity < 1.0) { + auto &build_node_info = producers_[build_node]; + for (const auto &attr : hash_join->right_join_attributes()) { + build_node_info.emplace_back(node, attr, depth, selectivity, false); + } + } + } + const std::vector<E::AttributeReferencePtr> output_attributes( node->getOutputAttributes()); std::unordered_set<E::ExprId> output_attribute_ids; @@ -111,12 +124,12 @@ void AttachBloomFilters::visitProducer(const P::PhysicalPtr &node, int depth) { } // Self-produced bloom filters - double selectivity = cost_model_->estimateSelectivity(node); - if (selectivity < 1.0) { - for (const auto &attr : output_attributes) { - bloom_filters.emplace_back(node, attr, depth, selectivity, false); - } - } +// double selectivity = cost_model_->estimateSelectivity(node); +// if (selectivity < 1.0) { +// for (const auto &attr : output_attributes) { +// bloom_filters.emplace_back(node, attr, depth, selectivity, false); +// } +// } producers_.emplace(node, std::move(bloom_filters)); } @@ -149,41 +162,62 @@ void AttachBloomFilters::visitConsumer(const P::PhysicalPtr &node) { } } - // Bloom filters from siblings via HashJoin + // Bloom filters from build side to probe side via HashJoin if (node->getPhysicalType() == P::PhysicalType::kHashJoin) { const P::HashJoinPtr hash_join = std::static_pointer_cast<const P::HashJoin>(node); - std::vector<P::PhysicalPtr> children = - { hash_join->left(), hash_join->right() }; + const P::PhysicalPtr &producer_child = hash_join->right(); + const P::PhysicalPtr &consumer_child = hash_join->left(); std::unordered_map<E::ExprId, E::AttributeReferencePtr> join_attribute_pairs; for (std::size_t i = 0; i < hash_join->left_join_attributes().size(); ++i) { - const E::AttributeReferencePtr left_join_attribute = + const E::AttributeReferencePtr probe_join_attribute = hash_join->left_join_attributes()[i]; - const E::AttributeReferencePtr right_join_attribute = + const E::AttributeReferencePtr build_join_attribute = hash_join->right_join_attributes()[i]; - join_attribute_pairs.emplace(left_join_attribute->id(), - right_join_attribute); - join_attribute_pairs.emplace(right_join_attribute->id(), - left_join_attribute); + join_attribute_pairs.emplace(build_join_attribute->id(), + probe_join_attribute); } - for (int side = 0; side < 2; ++side) { - const P::PhysicalPtr &producer_child = children[side]; - const P::PhysicalPtr &consumer_child = children[1-side]; - - auto &consumer_bloom_filters = consumers_[consumer_child]; - for (const auto &info : producers_[producer_child]) { - const auto pair_it = join_attribute_pairs.find(info.attribute->id()); - if (pair_it != join_attribute_pairs.end()) { - consumer_bloom_filters.emplace_back(info.source, - pair_it->second, - info.depth, - info.selectivity, - true, - info.attribute); + + auto &consumer_bloom_filters = consumers_[consumer_child]; + for (const auto &info : producers_[producer_child]) { + const auto pair_it = join_attribute_pairs.find(info.attribute->id()); + if (pair_it != join_attribute_pairs.end()) { + consumer_bloom_filters.emplace_back(info.source, + pair_it->second, + info.depth, + info.selectivity, + true, + info.attribute); + } + } + + // Decide attaches + if (cost_model_->estimateCardinality(consumer_child) > 200000000 && + !consumer_bloom_filters.empty()) { + std::map<E::AttributeReferencePtr, const BloomFilterInfo*> filters; + for (const auto &info : consumer_bloom_filters) { + auto it = filters.find(info.attribute); + if (it == filters.end()) { + filters.emplace(info.attribute, &info); + } else { + if (BloomFilterInfo::isBetterThan(&info, it->second)) { + it->second = &info; + } } } + + auto &probe_attaches = getBloomFilterConfig(node); + for (const auto &pair : filters) { + auto &build_attaches = getBloomFilterConfig(pair.second->source); + build_attaches.addBuildSideBloomFilter( + pair.second->source_attribute); + probe_attaches.addProbeSideBloomFilter( + pair.first, + pair.second->source_attribute, + pair.second->source); + } } } @@ -192,5 +226,52 @@ void AttachBloomFilters::visitConsumer(const P::PhysicalPtr &node) { } } +P::PhysicalPtr AttachBloomFilters::visitAndAttach(const physical::PhysicalPtr &node) { + std::vector<P::PhysicalPtr> new_children; + bool has_changed = false; + for (const auto &child : node->children()) { + P::PhysicalPtr new_child = visitAndAttach(child); + if (new_child != child) { + has_changed = true; + } + new_children.emplace_back(new_child); + } + + if (node->getPhysicalType() == P::PhysicalType::kHashJoin) { + const auto attach_it = attaches_.find(node); + if (attach_it != attaches_.end()) { + for (const auto& item : attach_it->second.probe_side_bloom_filters) { + std::cout << "Attach probe from " << item.builder + << " to " << node << "\n"; + } + + const P::HashJoinPtr hash_join = + std::static_pointer_cast<const P::HashJoin>(node); + return P::HashJoin::Create( + new_children[0], + new_children[1], + hash_join->left_join_attributes(), + hash_join->right_join_attributes(), + hash_join->residual_predicate(), + hash_join->project_expressions(), + hash_join->join_type(), + attach_it->second); + } + } + + if (has_changed) { + return node->copyWithNewChildren(new_children); + } + + return node; +} + +P::BloomFilterConfig& AttachBloomFilters::getBloomFilterConfig(const physical::PhysicalPtr &node) { + if (attaches_.find(node) == attaches_.end()) { + attaches_.emplace(node, node); + } + return attaches_[node]; +} + } // namespace optimizer } // namespace quickstep http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/rules/AttachBloomFilters.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/AttachBloomFilters.hpp b/query_optimizer/rules/AttachBloomFilters.hpp index 69b50e6..e4437f7 100644 --- a/query_optimizer/rules/AttachBloomFilters.hpp +++ b/query_optimizer/rules/AttachBloomFilters.hpp @@ -32,6 +32,7 @@ #include "query_optimizer/expressions/NamedExpression.hpp" #include "query_optimizer/expressions/Predicate.hpp" #include "query_optimizer/physical/Physical.hpp" +#include "query_optimizer/physical/HashJoin.hpp" #include "query_optimizer/rules/Rule.hpp" #include "utility/Macros.hpp" @@ -76,6 +77,14 @@ class AttachBloomFilters : public Rule<physical::Physical> { : source_attribute_in) { } + static bool isBetterThan(const BloomFilterInfo *a, + const BloomFilterInfo *b) { + if (a->selectivity == b->selectivity) { + return a->depth > b->depth; + } else { + return a->selectivity < b->selectivity; + } + } physical::PhysicalPtr source; expressions::AttributeReferencePtr attribute; int depth; @@ -84,14 +93,19 @@ class AttachBloomFilters : public Rule<physical::Physical> { expressions::AttributeReferencePtr source_attribute; }; - void visitProducer(const physical::PhysicalPtr &node, int depth); + void visitProducer(const physical::PhysicalPtr &node, const int depth); void visitConsumer(const physical::PhysicalPtr &node); + physical::PhysicalPtr visitAndAttach(const physical::PhysicalPtr &node); + + physical::BloomFilterConfig &getBloomFilterConfig(const physical::PhysicalPtr &node); + std::unique_ptr<cost::StarSchemaSimpleCostModel> cost_model_; std::map<physical::PhysicalPtr, std::vector<BloomFilterInfo>> producers_; std::map<physical::PhysicalPtr, std::vector<BloomFilterInfo>> consumers_; + std::map<physical::PhysicalPtr, physical::BloomFilterConfig> attaches_; DISALLOW_COPY_AND_ASSIGN(AttachBloomFilters); }; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp index e198149..42a7402 100644 --- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp +++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp @@ -223,6 +223,21 @@ physical::PhysicalPtr StarSchemaHashJoinOrderOptimization::generatePlan( std::unique_ptr<JoinPair> new_join( new JoinPair(probe_table_info, build_table_info)); if (best_join == nullptr || new_join->isBetterThan(*best_join)) { +// if (best_join != nullptr) { +// std::cerr << "(" << best_join->probe->estimated_selectivity +// << ", " << best_join->probe->estimated_cardinality << ")" +// << " -- " +// << "(" << best_join->build->estimated_selectivity +// << ", " << best_join->build->estimated_cardinality << ")" +// << "\n"; +// std::cerr << "REPLACED WITH\n"; +// } +// std::cerr << "(" << new_join->probe->estimated_selectivity +// << ", " << new_join->probe->estimated_cardinality << ")" +// << " -- " +// << "(" << new_join->build->estimated_selectivity +// << ", " << new_join->build->estimated_cardinality << ")" +// << "\n****\n"; best_join.reset(new_join.release()); } } @@ -242,6 +257,11 @@ physical::PhysicalPtr StarSchemaHashJoinOrderOptimization::generatePlan( CHECK(selected_probe_table_info != nullptr); CHECK(selected_build_table_info != nullptr); +// std::cerr << selected_probe_table_info->estimated_selectivity +// << " -- " +// << selected_build_table_info->estimated_selectivity +// << "\n"; + // std::cerr << selected_probe_table_info->estimated_num_output_attributes // << " -- " // << selected_build_table_info->estimated_num_output_attributes @@ -278,10 +298,10 @@ physical::PhysicalPtr StarSchemaHashJoinOrderOptimization::generatePlan( if (remaining_tables.size() > 0) { P::PhysicalPtr output = - P::HashJoin::Create(build_child, - probe_child, - build_attributes, + P::HashJoin::Create(probe_child, + build_child, probe_attributes, + build_attributes, nullptr, output_attributes, P::HashJoin::JoinType::kInnerJoin); @@ -310,10 +330,10 @@ physical::PhysicalPtr StarSchemaHashJoinOrderOptimization::generatePlan( } } } else { - return P::HashJoin::Create(build_child, - probe_child, - build_attributes, + return P::HashJoin::Create(probe_child, + build_child, probe_attributes, + build_attributes, residual_predicate, project_expressions, P::HashJoin::JoinType::kInnerJoin); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp index 6ad300c..a0e34ce 100644 --- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp +++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp @@ -107,13 +107,13 @@ class StarSchemaHashJoinOrderOptimization : public Rule<physical::Physical> { return rhs_has_large_output; } - const bool lhs_has_small_build = - !lhs_has_large_output && lhs.build->estimated_cardinality < 0x1000; - const bool rhs_has_small_build = - !rhs_has_large_output && rhs.build->estimated_cardinality < 0x1000; - if (lhs_has_small_build != rhs_has_small_build) { - return lhs_has_small_build; - } +// const bool lhs_has_small_build = +// !lhs_has_large_output && lhs.build->estimated_cardinality < 0x1000; +// const bool rhs_has_small_build = +// !rhs_has_large_output && rhs.build->estimated_cardinality < 0x1000; +// if (lhs_has_small_build != rhs_has_small_build) { +// return lhs_has_small_build; +// } if (lhs.probe->estimated_cardinality != rhs.probe->estimated_cardinality) { return lhs.probe->estimated_cardinality < rhs.probe->estimated_cardinality; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/storage/HashTable.hpp ---------------------------------------------------------------------- diff --git a/storage/HashTable.hpp b/storage/HashTable.hpp index 13cd83c..2c526c2 100644 --- a/storage/HashTable.hpp +++ b/storage/HashTable.hpp @@ -41,6 +41,7 @@ #include "types/TypedValue.hpp" #include "utility/BloomFilter.hpp" #include "utility/BloomFilterAdapter.hpp" +#include "utility/EventProfiler.hpp" #include "utility/HashPair.hpp" #include "utility/Macros.hpp" @@ -1047,8 +1048,8 @@ class HashTable : public HashTableBase<resizable, * @param probe_attribute_ids The vector of attribute ids to use for probing * the bloom filter. **/ - inline void addProbeSideAttributeIds(std::vector<attribute_id> &&probe_attribute_ids) { - probe_attribute_ids_.push_back(probe_attribute_ids); + inline void addProbeSideAttributeIds(const attribute_id &probe_attribute_id) { + probe_attribute_ids_.push_back(probe_attribute_id); } protected: @@ -1336,7 +1337,7 @@ class HashTable : public HashTableBase<resizable, bool has_probe_side_bloom_filter_ = false; BloomFilter *build_bloom_filter_; std::vector<const BloomFilter*> probe_bloom_filters_; - std::vector<std::vector<attribute_id>> probe_attribute_ids_; + std::vector<attribute_id> probe_attribute_ids_; DISALLOW_COPY_AND_ASSIGN(HashTable); }; @@ -2257,23 +2258,16 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_ // Find (and cache) the size of each attribute in the probe lists. // NOTE(nav): This code uses the accessor to get the size, // and hence only works if there's at least one tuple. - std::vector<std::vector<std::size_t>> attr_size_vectors; - attr_size_vectors.reserve(probe_attribute_ids_.size()); - for (const auto &probe_attr_vector : probe_attribute_ids_) { - std::vector<std::size_t> attr_sizes; - attr_sizes.reserve(probe_attr_vector.size()); - for (const auto &probe_attr : probe_attr_vector) { - // Using the function below is the easiest way to get attribute size - // here. The template parameter check_null is set to false to ensure - // that we get the size even if the first attr value is null. - auto val_and_size = accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, probe_attr); - attr_sizes.push_back(val_and_size.second); - } - attr_size_vectors.push_back(attr_sizes); + std::vector<std::size_t> attr_size_vector; + attr_size_vector.reserve(probe_attribute_ids_.size()); + for (const auto &probe_attr : probe_attribute_ids_) { + auto val_and_size = + accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, probe_attr); + attr_size_vector.push_back(val_and_size.second); } bloom_filter_adapter.reset(new BloomFilterAdapter( - probe_bloom_filters_, probe_attribute_ids_, attr_size_vectors)); + probe_bloom_filters_, probe_attribute_ids_, attr_size_vector)); // We want to have large batch sizes for cache efficiency while probeing, // but small batch sizes to ensure that the adaptation logic kicks in @@ -2286,6 +2280,9 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_ std::uint32_t num_tuples_left = accessor->getNumTuples(); std::vector<tuple_id> batch(num_tuples_left); + auto *container = simple_profiler.getContainer(); + auto *line = container->getEventLine(0); + do { const std::uint32_t batch_size = batch_size_try < num_tuples_left ? batch_size_try : num_tuples_left; @@ -2294,12 +2291,16 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_ batch.push_back(accessor->getCurrentPosition()); } + line->emplace_back(); std::size_t num_hits; if (FLAGS_adapt_bloom_filters) { num_hits = bloom_filter_adapter->bulkProbe<true>(accessor, batch); } else { num_hits = bloom_filter_adapter->bulkProbe<false>(accessor, batch); } + line->back().setPayload(num_hits+0); + line->back().endEvent(); +// std::size_t num_hits = batch_size; for (std::size_t i = 0; i < num_hits; ++i){ const tuple_id probe_tid = batch[i]; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/storage/HashTable.proto ---------------------------------------------------------------------- diff --git a/storage/HashTable.proto b/storage/HashTable.proto index 7f00f29..0cf9f5e 100644 --- a/storage/HashTable.proto +++ b/storage/HashTable.proto @@ -34,10 +34,10 @@ message HashTable { required HashTableImplType hash_table_impl_type = 1; repeated Type key_types = 2; required uint64 estimated_num_entries = 3; - repeated uint32 build_side_bloom_filter_id = 4; - message ProbeSideBloomFilter { - required uint32 probe_side_bloom_filter_id = 1; - repeated uint32 probe_side_attr_ids = 2; + message BloomFilterReference { + required uint32 bloom_filter_id = 1; + required uint32 attr_id = 2; } - repeated ProbeSideBloomFilter probe_side_bloom_filters = 6; + repeated BloomFilterReference build_side_bloom_filters = 4; + repeated BloomFilterReference probe_side_bloom_filters = 5; } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/storage/HashTableFactory.hpp ---------------------------------------------------------------------- diff --git a/storage/HashTableFactory.hpp b/storage/HashTableFactory.hpp index 34baaeb..00a09c1 100644 --- a/storage/HashTableFactory.hpp +++ b/storage/HashTableFactory.hpp @@ -318,9 +318,11 @@ class HashTableFactory { // individual implementations of the hash table constructors. // Check if there are any build side bloom filter defined on the hash table. - if (proto.build_side_bloom_filter_id_size() > 0) { + if (proto.build_side_bloom_filters_size() > 0) { + CHECK_EQ(1u, proto.build_side_bloom_filters_size()); hash_table->enableBuildSideBloomFilter(); - hash_table->setBuildSideBloomFilter(bloom_filters[proto.build_side_bloom_filter_id(0)].get()); + hash_table->setBuildSideBloomFilter( + bloom_filters[proto.build_side_bloom_filters(0).bloom_filter_id()].get()); } // Check if there are any probe side bloom filters defined on the hash table. @@ -330,15 +332,11 @@ class HashTableFactory { for (int j = 0; j < proto.probe_side_bloom_filters_size(); ++j) { // Add the pointer to the probe bloom filter within the list of probe bloom filters to use. const auto probe_side_bloom_filter = proto.probe_side_bloom_filters(j); - hash_table->addProbeSideBloomFilter(bloom_filters[probe_side_bloom_filter.probe_side_bloom_filter_id()].get()); + hash_table->addProbeSideBloomFilter( + bloom_filters[probe_side_bloom_filter.bloom_filter_id()].get()); // Add the attribute ids corresponding to this probe bloom filter. - std::vector<attribute_id> probe_attribute_ids; - for (int k = 0; k < probe_side_bloom_filter.probe_side_attr_ids_size(); ++k) { - const attribute_id probe_attribute_id = probe_side_bloom_filter.probe_side_attr_ids(k); - probe_attribute_ids.push_back(probe_attribute_id); - } - hash_table->addProbeSideAttributeIds(std::move(probe_attribute_ids)); + hash_table->addProbeSideAttributeIds(probe_side_bloom_filter.attr_id()); } } http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/9cc47e56/utility/BloomFilterAdapter.hpp ---------------------------------------------------------------------- diff --git a/utility/BloomFilterAdapter.hpp b/utility/BloomFilterAdapter.hpp index fbd6957..a160353 100644 --- a/utility/BloomFilterAdapter.hpp +++ b/utility/BloomFilterAdapter.hpp @@ -40,8 +40,8 @@ namespace quickstep { class BloomFilterAdapter { public: BloomFilterAdapter(const std::vector<const BloomFilter*> &bloom_filters, - const std::vector<std::vector<attribute_id>> attribute_ids, - const std::vector<std::vector<std::size_t>> attr_sizes) { + const std::vector<attribute_id> &attribute_ids, + const std::vector<std::size_t> &attr_sizes) { DCHECK_EQ(bloom_filters.size(), attribute_ids.size()); DCHECK_EQ(bloom_filters.size(), attr_sizes.size()); @@ -64,7 +64,7 @@ class BloomFilterAdapter { std::vector<tuple_id> &batch) { std::size_t out_size = batch.size(); for (auto &entry : bloom_filter_entries_) { - out_size = bulkProbeBloomFilterEntry(*entry, accessor, batch, out_size); + out_size = bulkProbeBloomFilterEntry<adapt_filters>(*entry, accessor, batch, out_size); } adaptEntryOrder(); return out_size; @@ -73,11 +73,11 @@ class BloomFilterAdapter { private: struct BloomFilterEntry { BloomFilterEntry(const BloomFilter *in_bloom_filter, - const std::vector<attribute_id> &in_attribute_ids, - const std::vector<std::size_t> &in_attribute_sizes) + const attribute_id &in_attribute_id, + const std::size_t &in_attribute_size) : bloom_filter(in_bloom_filter), - attribute_ids(in_attribute_ids), - attribute_sizes(in_attribute_sizes), + attribute_id(in_attribute_id), + attribute_size(in_attribute_size), miss(0), cnt(0) { } @@ -88,8 +88,8 @@ class BloomFilterAdapter { } const BloomFilter *bloom_filter; - const std::vector<attribute_id> attribute_ids; - const std::vector<std::size_t> attribute_sizes; + const attribute_id attribute_id; + const std::size_t attribute_size; std::uint32_t miss; std::uint32_t cnt; float miss_rate; @@ -105,16 +105,12 @@ class BloomFilterAdapter { const BloomFilter *bloom_filter = entry.bloom_filter; for (std::size_t t = 0; t < in_size; ++t) { - tuple_id tid = batch[t]; - for (std::size_t a = 0; a < entry.attribute_ids.size(); ++a) { - const attribute_id &attr_id = entry.attribute_ids[a]; - const std::size_t size = entry.attribute_sizes[a]; - const auto value = static_cast<const std::uint8_t*>( - accessor->getUntypedValueAtAbsolutePosition(attr_id, tid)); - if (bloom_filter->contains(value, size)) { - batch[out_size] = tid; - ++out_size; - } + const tuple_id tid = batch[t]; + const auto value = static_cast<const std::uint8_t*>( + accessor->getUntypedValueAtAbsolutePosition(entry.attribute_id, tid)); + if (bloom_filter->contains(value, entry.attribute_size)) { + batch[out_size] = tid; + ++out_size; } } if (adapt_filters) {