icexelloss commented on code in PR #34311: URL: https://github.com/apache/arrow/pull/34311#discussion_r1128114556
########## cpp/src/arrow/compute/kernels/hash_aggregate_test.cc: ########## @@ -135,22 +142,96 @@ Result<Datum> NaiveGroupBy(std::vector<Datum> arguments, std::vector<Datum> keys return Take(struct_arr, sorted_indices); } +Result<Datum> MakeGroupByOutput(const std::vector<ExecBatch>& output_batches, + const std::shared_ptr<Schema> output_schema, + size_t num_aggregates, size_t num_keys, bool naive) { + ArrayVector out_arrays(num_aggregates + num_keys); + for (size_t i = 0; i < out_arrays.size(); ++i) { + std::vector<std::shared_ptr<Array>> arrays(output_batches.size()); + for (size_t j = 0; j < output_batches.size(); ++j) { + arrays[j] = output_batches[j].values[i].make_array(); + } + if (arrays.empty()) { + ARROW_ASSIGN_OR_RAISE( + out_arrays[i], + MakeArrayOfNull(output_schema->field(static_cast<int>(i))->type(), + /*length=*/0)); + } else { + ARROW_ASSIGN_OR_RAISE(out_arrays[i], Concatenate(arrays)); + } + } + + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr<Array> struct_arr, + StructArray::Make(std::move(out_arrays), output_schema->fields())); + + bool need_sort = !naive; + for (size_t i = num_aggregates; need_sort && i < out_arrays.size(); i++) { + if (output_schema->field(static_cast<int>(i))->type()->id() == Type::DICTIONARY) { + need_sort = false; + } + } + if (!need_sort) { + return struct_arr; + } + + // The exec plan may reorder the output rows. The tests are all setup to expect ouptut + // in ascending order of keys. So we need to sort the result by the key columns. To do + // that we create a table using the key columns, calculate the sort indices from that + // table (sorting on all fields) and then use those indices to calculate our result. + std::vector<std::shared_ptr<Field>> key_fields; + std::vector<std::shared_ptr<Array>> key_columns; + std::vector<SortKey> sort_keys; + for (std::size_t i = 0; i < num_keys; i++) { + const std::shared_ptr<Array>& arr = out_arrays[i + num_aggregates]; + key_columns.push_back(arr); + key_fields.push_back(field("name_does_not_matter", arr->type())); + sort_keys.emplace_back(static_cast<int>(i)); + } + std::shared_ptr<Schema> key_schema = schema(std::move(key_fields)); + std::shared_ptr<Table> key_table = Table::Make(std::move(key_schema), key_columns); + SortOptions sort_options(std::move(sort_keys)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> sort_indices, + SortIndices(key_table, sort_options)); + + return Take(struct_arr, sort_indices); +} + Result<Datum> RunGroupBy(const BatchesWithSchema& input, const std::vector<std::string>& key_names, - const std::vector<Aggregate>& aggregates, bool use_threads) { + const std::vector<std::string>& segment_key_names, + const std::vector<Aggregate>& aggregates, ExecContext* ctx, + bool use_threads, bool segmented = false, bool naive = false) { + // When segment_keys is non-empty the `segmented` flag is always true; otherwise (when + // empty), it may still be set to true. In this case, the tester restructures (without + // changing the data of) the result of RunGroupBy from `std::vector<ExecBatch>` + // (output_batches) to `std::vector<ArrayVector>` (out_arrays), which have the structure + // typical of the case of a non-empty segment_keys (with multiple arrays per column, one + // array per segment) but only one array per column (because, technically, there is only + // one segment in this case). Thus, this case focuses on the structure of the result. + // + // The `naive` flag means that the output is expected to be like that of `NaiveGroupBy`, + // which in particular doesn't require sorting. The reason for the naive flag is that Review Comment: "doesn't require sorting" means that the output of `NaiveGroupBy` is unsorted or does this mean sth else? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org