[GitHub] [arrow] westonpace commented on a diff in pull request #33969: GH-33668: [Python] Reading flat dataset with partitioning="hive" results in partition schema equal to dataset schema

via GitHub Mon, 06 Feb 2023 09:53:06 -0800


westonpace commented on code in PR #33969:
URL: https://github.com/apache/arrow/pull/33969#discussion_r1097677559



##########
cpp/src/arrow/dataset/partition.cc:
##########
@@ -827,7 +827,7 @@ class HivePartitioningFactory : public 
KeyValuePartitioningFactory {
   Result<std::shared_ptr<Partitioning>> Finish(
       const std::shared_ptr<Schema>& schema) const override {
     if (dictionaries_.empty()) {
-      return std::make_shared<HivePartitioning>(schema, dictionaries_);
+      return std::make_shared<HivePartitioning>(arrow::schema({}), 
dictionaries_);

Review Comment:
   Nit: Is the `arrow::` prefix necessary here?



##########
python/pyarrow/tests/parquet/test_dataset.py:
##########
@@ -1925,3 +1925,20 @@ def test_write_to_dataset_kwargs_passed(tempdir, 
write_dataset_kwarg):
         pq.write_to_dataset(table, path, **{key: arg})
         _name, _args, kwargs = mock_write_dataset.mock_calls[0]
         assert kwargs[key] == arg
+
+
+def test_partition_schema_validity(tempdir):
+    data = {
+        "a": [1, 2, 1, 2, 3, 4],
+        "b": [10, 20, 30, 40, 50, 60]
+    }
+
+    table = pa.Table.from_pydict(data)
+    pq.write_table(table, tempdir / "table.parquet")

Review Comment:
   A full round-trip test here is ok as it is much simpler to write such a test 
and we don't fully expose `PartitioningFactory` so it wouldn't be possible to 
do the simpler test.



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -5084,10 +5084,8 @@ def test_dataset_partition_with_slash(tmpdir):
     read_table = ds.dataset(
         source=path,
         format='ipc',
-        partitioning='hive',
-        schema=pa.schema([pa.field("exp_id", pa.int32()),
-                          pa.field("exp_meta", pa.utf8())])
-    ).to_table().combine_chunks()
+        schema=dt_table.schema,

Review Comment:
   Does the schema even need to be specified here?



##########
cpp/src/arrow/dataset/partition_test.cc:
##########
@@ -1107,5 +1108,88 @@ TEST(TestStripPrefixAndFilename, Basic) {
                                           "year=2019/month=12/day=01"));
 }
 
+TEST_F(TestPartitioning, HivePartitionReadEvaluation) {

Review Comment:
   This test is too complex.  Could you create a narrower unit test that 
focuses on the needed behavior?  For example, there should be no need to create 
a filesystem and actually write files if your goal is to test the partitioning 
factory.  The partitioning factory takes strings & schemas as input and outputs 
a partitioning object.
   
   For example, you can reproduce the issue with this:
   
   ```diff
   diff --git a/cpp/src/arrow/dataset/partition_test.cc 
b/cpp/src/arrow/dataset/partition_test.cc
   index 3e681a9cb..0e22152e0 100644
   --- a/cpp/src/arrow/dataset/partition_test.cc
   +++ b/cpp/src/arrow/dataset/partition_test.cc
   @@ -83,8 +83,14 @@ class TestPartitioning : public ::testing::Test {
      void AssertInspect(const std::vector<std::string>& paths,
                         const std::vector<std::shared_ptr<Field>>& expected) {
        ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths));
   -    ASSERT_EQ(*actual, Schema(expected));
   -    ASSERT_OK_AND_ASSIGN(partitioning_, factory_->Finish(actual));
   +    Schema expected_schema(expected);
   +    ASSERT_EQ(*actual, expected_schema);
   +    // The dataset will usually have more than just partition fields.  The 
partitioning
   +    // schema should ignore these.
   +    std::vector<std::shared_ptr<Field>> all_fields(expected);
   +    all_fields.push_back(field("field_unrelated_to_partitioning", int8()));
   +    ASSERT_OK_AND_ASSIGN(partitioning_, 
factory_->Finish(schema(std::move(all_fields))));
   +    AssertSchemaEqual(expected_schema, *partitioning_->schema());
      }
    
      void AssertPartition(const std::shared_ptr<Partitioning> partitioning,
   @@ -571,6 +577,11 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) {
      // missing path segments will not cause an error
      AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3", "/gamma=what"},
                    {Int("alpha"), Int("beta"), Str("gamma")});
   +
   +  // No partitioning information should yield an empty partitioning even if
   +  // there are fields in the dataset schema
   +  AssertInspect({"/foo/chunk.parquet", "/bar/chunk.parquet", "/baz"}, {});
   +  AssertInspect({"chunk.parquet"}, {});
    }
    
    TEST_F(TestPartitioning, HiveDictionaryInference) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] westonpace commented on a diff in pull request #33969: GH-33668: [Python] Reading flat dataset with partitioning="hive" results in partition schema equal to dataset schema

Reply via email to