Github user zuyu commented on a diff in the pull request:
https://github.com/apache/incubator-quickstep/pull/171#discussion_r98385945
--- Diff: relational_operators/HashJoinOperator.cpp ---
@@ -504,123 +616,61 @@ void HashInnerJoinWorkOrder::execute() {
// hash join is below a reasonable threshold so that we don't blow up
// temporary memory requirements to an unreasonable degree.
if (residual_predicate_ != nullptr) {
- std::pair<std::vector<tuple_id>, std::vector<tuple_id>>
filtered_matches;
+ PairOfVectors filtered_matches;
+
for (std::size_t i = 0; i < build_tids.size(); ++i) {
if (residual_predicate_->matchesForJoinedTuples(*build_accessor,
build_relation_id,
build_tids[i],
*probe_accessor,
probe_relation_id,
probe_tids[i])) {
- filtered_matches.first.push_back(build_tids[i]);
- filtered_matches.second.push_back(probe_tids[i]);
+ filtered_matches.first.emplace_back(build_tids[i]);
+ filtered_matches.second.emplace_back(probe_tids[i]);
}
}
build_block_entry.second = std::move(filtered_matches);
}
- // TODO(chasseur): If all the output expressions are ScalarAttributes,
- // we could implement a similar fast-path to
StorageBlock::selectSimple()
- // that avoids a copy.
- //
// TODO(chasseur): See TODO in NestedLoopsJoinOperator.cpp about
limiting
// the size of materialized temporary results. In common usage, this
// probably won't be an issue for hash-joins, but in the worst case a
hash
// join can still devolve into a cross-product.
- //
- // NOTE(chasseur): We could also create one big
ColumnVectorsValueAccessor
- // and accumulate all the results across multiple block pairs into it
- // before inserting anything into output blocks, but this would require
- // some significant API extensions to the expressions system for a
dubious
- // benefit (probably only a real performance win when there are very
few
- // matching tuples in each individual inner block but very many inner
- // blocks with at least one match).
-
- // We now create ordered value accessors for both build and probe side,
- // using the joined tuple TIDs. Note that we have to use this
Lambda-based
- // invocation method here because the accessors don't have a virtual
- // function that creates such an
OrderedTupleIdSequenceAdapterValueAccessor.
- std::unique_ptr<ValueAccessor> ordered_build_accessor,
ordered_probe_accessor;
- InvokeOnValueAccessorNotAdapter(
- build_accessor.get(),
- [&](auto *accessor) -> void { // NOLINT(build/c++11)
- ordered_build_accessor.reset(
-
accessor->createSharedOrderedTupleIdSequenceAdapter(build_tids));
- });
-
- if (probe_accessor->isTupleIdSequenceAdapter()) {
- InvokeOnTupleIdSequenceAdapterValueAccessor(
- probe_accessor.get(),
- [&](auto *accessor) -> void { // NOLINT(build/c++11)
- ordered_probe_accessor.reset(
-
accessor->createSharedOrderedTupleIdSequenceAdapter(probe_tids));
- });
- } else {
- InvokeOnValueAccessorNotAdapter(
- probe_accessor.get(),
- [&](auto *accessor) -> void { // NOLINT(build/c++11)
- ordered_probe_accessor.reset(
-
accessor->createSharedOrderedTupleIdSequenceAdapter(probe_tids));
- });
- }
-
// We also need a temp value accessor to store results of any scalar
expressions.
ColumnVectorsValueAccessor temp_result;
+ if (!non_trivial_expressions.empty()) {
+ // The getAllValuesForJoin function below needs joined tuple IDs as a
+ // vector of pair of (build-tuple-ID, probe-tuple-ID), and we have a
pair
+ // of (build-tuple-IDs-vector, probe-tuple-IDs-vector). So we'll
have to
+ // zip our two vectors together.
+ VectorOfPairs zipped_joined_tuple_ids;
+ zipped_joined_tuple_ids.reserve(build_tids.size());
+ for (std::size_t i = 0; i < build_tids.size(); ++i) {
+ zipped_joined_tuple_ids.emplace_back(build_tids[i], probe_tids[i]);
+ }
- // Create a map of ValueAccessors and what attributes we want to pick
from them
- std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>>
accessor_attribute_map;
- const std::vector<ValueAccessor *> accessors{
- ordered_build_accessor.get(), ordered_probe_accessor.get(),
&temp_result};
- const unsigned int build_index = 0, probe_index = 1, temp_index = 2;
- for (auto &accessor : accessors) {
- accessor_attribute_map.push_back(std::make_pair(
- accessor,
- std::vector<attribute_id>(selection_.size(),
kInvalidCatalogId)));
- }
-
- attribute_id dest_attr = 0;
- std::vector<std::pair<tuple_id, tuple_id>> zipped_joined_tuple_ids;
-
- for (auto &selection_cit : selection_) {
- // If the Scalar (column) is not an attribute in build/probe blocks,
then
- // insert it into a ColumnVectorsValueAccessor.
- if (selection_cit->getDataSource() !=
Scalar::ScalarDataSource::kAttribute) {
- // Current destination attribute maps to the column we'll create
now.
- accessor_attribute_map[temp_index].second[dest_attr] =
temp_result.getNumColumns();
-
- if (temp_result.getNumColumns() == 0) {
- // The getAllValuesForJoin function below needs joined tuple IDs
as
- // a vector of pair of (build-tuple-ID, probe-tuple-ID), and we
have
- // a pair of (build-tuple-IDs-vector, probe-tuple-IDs-vector). So
- // we'll have to zip our two vectors together. We do this inside
- // the loop because most queries don't exercise this code since
- // they don't have scalar expressions with attributes from both
- // build and probe relations (other expressions would have been
- // pushed down to before the join).
- zipped_joined_tuple_ids.reserve(build_tids.size());
- for (std::size_t i = 0; i < build_tids.size(); ++i) {
-
zipped_joined_tuple_ids.push_back(std::make_pair(build_tids[i], probe_tids[i]));
- }
- }
- temp_result.addColumn(
- selection_cit
- ->getAllValuesForJoin(build_relation_id,
build_accessor.get(),
- probe_relation_id,
probe_accessor.get(),
- zipped_joined_tuple_ids));
- } else {
- auto scalar_attr = static_cast<const ScalarAttribute
*>(selection_cit.get());
- const attribute_id attr_id = scalar_attr->getAttribute().getID();
- if (scalar_attr->getAttribute().getParent().getID() ==
build_relation_id) {
- accessor_attribute_map[build_index].second[dest_attr] = attr_id;
- } else {
- accessor_attribute_map[probe_index].second[dest_attr] = attr_id;
- }
+ for (const Scalar *scalar : non_trivial_expressions) {
+
temp_result.addColumn(scalar->getAllValuesForJoin(build_relation_id,
+
build_accessor.get(),
+
probe_relation_id,
+ probe_accessor,
+
zipped_joined_tuple_ids));
}
- ++dest_attr;
}
+ // We now create ordered value accessors for both build and probe side,
+ // using the joined tuple TIDs.
--- End diff --
Change from `TIDs` to `IDs`? I guess `TID` means `tuple id`.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---