bkietz commented on a change in pull request #9323:
URL: https://github.com/apache/arrow/pull/9323#discussion_r581371432
##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -77,6 +78,39 @@ class TestPartitioning : public ::testing::Test {
ASSERT_OK_AND_ASSIGN(partitioning_, factory_->Finish(actual));
}
+ void AssertPartition(const std::shared_ptr<Partitioning> partitioning,
+ const std::shared_ptr<RecordBatch> full_batch,
+ const RecordBatchVector& expected_batches,
+ const std::vector<Expression>& expected_expressions) {
+ ASSERT_OK_AND_ASSIGN(auto partition_results,
partitioning->Partition(full_batch));
+ std::shared_ptr<RecordBatch> rest = full_batch;
Review comment:
Unused:
```suggestion
```
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -147,7 +151,9 @@ Result<Expression> KeyValuePartitioning::ConvertKey(const
Key& key) const {
std::shared_ptr<Scalar> converted;
- if (field->type()->id() == Type::DICTIONARY) {
+ if (!key.value.has_value()) {
+ return is_null(field_ref(field->name()));
+ } else if (field->type()->id() == Type::DICTIONARY) {
Review comment:
nit:
```suggestion
}
if (field->type()->id() == Type::DICTIONARY) {
```
##########
File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc
##########
@@ -542,6 +547,12 @@ TEST_F(TestHashKernel, UniqueDecimal) {
{true, false, true, true}, expected,
{1, 0, 1});
}
+TEST_F(TestHashKernel, UniqueNull) {
+ CheckUnique<NullType, std::nullptr_t>(null(), {nullptr, nullptr}, {false,
true},
+ {nullptr}, {false});
+ CheckUnique<NullType, std::nullptr_t>(null(), {}, {}, {}, {});
Review comment:
```suggestion
CheckUnique(ArrayFromJSON(null(), "[null, null, null]"),
ArrayFromJSON(null(), "[null]"));
CheckUnique(ArrayFromJSON(null(), "[]"), ArrayFromJSON(null(), "[]"));
```
##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -282,9 +348,16 @@ TEST_F(TestPartitioning, HivePartitioningFormat) {
equal(field_ref("alpha"), literal(0))),
"alpha=0/beta=3.25");
AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0");
Review comment:
Since a null valued partition key is semantically equivalent to an
absent one, we should ensure they format identically. I've created ARROW-11762
for follow up
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -625,8 +650,27 @@ class StructDictionary {
private:
Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) {
+ if (column.type()->id() == Type::DICTIONARY) {
+ if (column.null_count() != 0) {
+ // TODO(ARROW-11732) Optimize this by allowign DictionaryEncode to
transfer a
Review comment:
```suggestion
// TODO(ARROW-11732) Optimize this by allowing DictionaryEncode to
transfer a
```
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -287,8 +303,13 @@ class KeyValuePartitioningFactory : public
PartitioningFactory {
return it_inserted.first->second;
}
- Status InsertRepr(const std::string& name, util::string_view repr) {
- return InsertRepr(GetOrInsertField(name), repr);
+ Status InsertRepr(const std::string& name, util::optional<string_view> repr)
{
+ auto field_index = GetOrInsertField(name);
+ if (repr.has_value()) {
+ return InsertRepr(field_index, *repr);
+ } else {
+ return Status::OK();
+ }
Review comment:
```suggestion
if (repr.has_value()) {
auto field_index = GetOrInsertField(name);
return InsertRepr(field_index, *repr);
}
return Status::OK();
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]