wjones127 commented on code in PR #15179: URL: https://github.com/apache/arrow/pull/15179#discussion_r1061890853
########## cpp/src/parquet/arrow/arrow_reader_writer_test.cc: ########## @@ -4138,6 +4139,74 @@ TEST_P(TestArrowWriteDictionary, Statistics) { INSTANTIATE_TEST_SUITE_P(WriteDictionary, TestArrowWriteDictionary, ::testing::Values(ParquetDataPageVersion::V1, ParquetDataPageVersion::V2)); + +TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { + // Two chunks, with a shared dictionary + std::shared_ptr<::arrow::Table> table; + std::shared_ptr<::arrow::DataType> dict_type = + ::arrow::dictionary(::arrow::int32(), ::arrow::utf8()); + std::shared_ptr<::arrow::Schema> schema = + ::arrow::schema({::arrow::field("values", dict_type)}); + { + // It's important there are no duplicate values in the dictionary, otherwise + // we trigger the WriteDense() code path which side-steps dictionary encoding. + std::shared_ptr<::arrow::Array> test_dictionary = + ArrayFromJSON(::arrow::utf8(), R"(["b", "c", "d", "a"])"); + std::vector<std::shared_ptr<::arrow::Array>> test_indices = { + ArrayFromJSON(::arrow::int32(), + R"([0, null, 3, 0, null, 3])"), // ["b", null, "a", "b", null, "a"] + ArrayFromJSON( + ::arrow::int32(), + R"([0, 3, null, 0, null, 1])")}; // ["b", "a", null, "b", null, "c"] Review Comment: So I like the data as is. The first row group contains the first chunk plus the first three rows of the next chunk. Then the second group contains the last three elements of the second chunk. I've updated the comment so it is accurate again. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org