[
https://issues.apache.org/jira/browse/ARROW-14679?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17468957#comment-17468957
]
Weston Pace commented on ARROW-14679:
-------------------------------------
Forgot to add the unit test:
{code}
// Checks an inner hash join of two three-column int32 inputs keyed on
// "lkey" (left) and "rkey" (right). Both inputs carry a column named "shared";
// the expected output schema has every left field name prefixed with "l_" and
// every right field name prefixed with "r_".
TEST(HashJoin, Prefix) {
  // Left input: a single batch of three rows (lkey, shared, ldistinct).
  BatchesWithSchema input_left;
  input_left.batches = {ExecBatchFromJSON({int32(), int32(), int32()}, R"([
[1, 4, 7],
[2, 5, 8],
[3, 6, 9]
])")};
  input_left.schema = schema(
      {field("lkey", int32()), field("shared", int32()), field("ldistinct", int32())});

  // Right input: a single batch of three rows (rkey, shared, rdistinct).
  BatchesWithSchema input_right;
  input_right.batches = {ExecBatchFromJSON({int32(), int32(), int32()}, R"([
[1, 10, 13],
[2, 11, 14],
[3, 12, 15]
])")};
  input_right.schema = schema(
      {field("rkey", int32()), field("shared", int32()), field("rdistinct", int32())});

  // Expected result: the three left columns followed by the three right
  // columns, one output row per matching key value (1, 2, 3).
  BatchesWithSchema expected;
  expected.batches = {
      ExecBatchFromJSON({int32(), int32(), int32(), int32(), int32(), int32()},
                        R"([
[1, 4, 7, 1, 10, 13],
[2, 5, 8, 2, 11, 14],
[3, 6, 9, 3, 12, 15]
])")};
  expected.schema = schema({field("l_lkey", int32()), field("l_shared", int32()),
                            field("l_ldistinct", int32()), field("r_rkey", int32()),
                            field("r_shared", int32()), field("r_rdistinct", int32())});

  ExecContext exec_ctx;
  ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(&exec_ctx));
  AsyncGenerator<util::optional<ExecBatch>> sink_gen;

  // One "source" node per join input.
  ExecNode* left_source;
  ExecNode* right_source;
  ASSERT_OK_AND_ASSIGN(
      left_source,
      MakeExecNode("source", plan.get(), {},
                   SourceNodeOptions{input_left.schema,
                                     input_left.gen(/*parallel=*/true,
                                                    /*slow=*/false)}));
  ASSERT_OK_AND_ASSIGN(
      right_source,
      MakeExecNode("source", plan.get(), {},
                   SourceNodeOptions{input_right.schema,
                                     input_right.gen(/*parallel=*/true,
                                                     /*slow=*/false)}));

  // NOTE(review): literal(true) is presumably an always-true join filter, and
  // the trailing "l_" / "r_" the affixes applied to left / right output field
  // names (they match the expected schema above) -- confirm against the
  // HashJoinNodeOptions constructor.
  HashJoinNodeOptions join_opts{JoinType::INNER,
                                /*left_keys=*/{"lkey"},
                                /*right_keys=*/{"rkey"}, literal(true), "l_", "r_"};

  ASSERT_OK_AND_ASSIGN(
      auto hashjoin,
      MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts));

  // The sink node handle itself is not needed; batches are read via sink_gen.
  ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin},
                                                 SinkNodeOptions{&sink_gen}));

  ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen));

  // Check both the produced rows and the schema reported by the join node.
  AssertExecBatchesEqual(expected.schema, expected.batches, result);
  AssertSchemaEqual(expected.schema, hashjoin->output_schema());
}
{code}
> [R] [C++] Handle suffix argument in joins
> -----------------------------------------
>
> Key: ARROW-14679
> URL: https://issues.apache.org/jira/browse/ARROW-14679
> Project: Apache Arrow
> Issue Type: Improvement
> Components: C++, R
> Reporter: Jonathan Keane
> Assignee: Vibhatha Lakmal Abeykoon
> Priority: Major
> Labels: pull-request-available, query-engine
> Fix For: 7.0.0
>
> Time Spent: 20m
> Remaining Estimate: 0h
>
> If there is a name collision, we need to do something
> https://github.com/apache/arrow/blob/a3746040d8a3ddb84bab6c7ca4771b6c120e3444/r/R/dplyr-join.R#L31
> A few notes:
> * arrow doesn't seem to actually be able to apply the prefixes (I'm getting
> errors when trying), I couldn't tell if there were tests of this — I couldn't
> find any, so I'm not sure if I'm calling this wrong or if it's not working at
> all.
> * arrow always appends the affixes (whereas dplyr only adds them if there is
> a name collision)
> * arrow only supports prefixes (can we configure this, or ask the clients to
> provide new names?). In the tests I wrote I've worked around this, but it
> would be nice to be able to match dplyr/allow things other than prefixes.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)