[arrow] branch main updated: GH-35746: [Parquet][C++][Python] Switch default Parquet version to 2.6 (#36137)

jorisvandenbossche Thu, 06 Jul 2023 06:10:51 -0700

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 22f7e59788 GH-35746: [Parquet][C++][Python] Switch default Parquet version to 2.6 (#36137)
22f7e59788 is described below

commit 22f7e5978892585519dc2c6ad6197a234885d3b6
Author: Anja Kefala <[email protected]>
AuthorDate: Thu Jul 6 06:10:35 2023 -0700

    GH-35746: [Parquet][C++][Python] Switch default Parquet version to 2.6 (#36137)
    
    Change the default parquet version to 2.6.
    
    Discussed in:
    * [ML](https://lists.apache.org/thread/027g366yr3m03hwtpst6sr58b3trwhsm)
    * [Issue](https://github.com/apache/arrow/issues/35746)
    
    * Closes: #35746
    
    Lead-authored-by: anjakefala <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Co-authored-by: mwish <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 34 +++++++++++++++-------
 cpp/src/parquet/arrow/arrow_schema_test.cc        |  7 ++---
 cpp/src/parquet/column_writer_test.cc             |  1 +
 cpp/src/parquet/properties.h                      |  4 +--
 cpp/src/parquet/properties_test.cc                |  2 +-
 python/pyarrow/_parquet.pyx                       |  6 ++--
 python/pyarrow/parquet/core.py                    |  6 ++--
 python/pyarrow/tests/parquet/test_basic.py        |  4 ++-
 python/pyarrow/tests/parquet/test_datetime.py     | 35 ++++++++++++-----------
 python/pyarrow/tests/parquet/test_pandas.py       |  2 +-
 10 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 47570405c9..f82177b86f 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) {
   ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, 
&values));
   std::shared_ptr<Table> table = MakeSimpleTable(values, true);
 
-  // Parquet 2.4 roundtrip should yield an uint32_t column again
+  // Parquet 2.6 roundtrip should yield an uint32_t column again
   this->ResetSink();
   std::shared_ptr<::parquet::WriterProperties> properties =
       ::parquet::WriterProperties::Builder()
-          .version(ParquetVersion::PARQUET_2_4)
+          .version(ParquetVersion::PARQUET_2_6)
           ->build();
   ASSERT_OK_NO_THROW(
       WriteTable(*table, default_memory_pool(), this->sink_, 512, properties));
@@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, 
SingleColumnRequiredChunkedTableRead) {
   ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4));
 }
 
-void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) {
+void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected_micro = false) {
   using ::arrow::ArrayFromVector;
 
   std::vector<bool> is_valid = {true, true, true, false, true, true};
@@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, 
bool expected = false)
   auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO));
 
   std::shared_ptr<::arrow::Schema> schema(
-      new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6}));
+      new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6}));
 
   std::vector<int32_t> t32_values = {1489269000, 1489270000, 1489271000,
                                      1489272000, 1489272000, 1489273000};
@@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* 
out, bool expected = false)
   ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, 
t64_us_values, &a5);
   ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, 
t64_ns_values, &a6);
 
-  *out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6});
+  *out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6});
 }
 
 TEST(TestArrowReadWrite, DateTimeTypes) {
-  std::shared_ptr<Table> table, result;
+  std::shared_ptr<Table> table, result, expected;
 
+  // Parquet 2.6 nanoseconds are preserved
   MakeDateTimeTypesTable(&table);
   ASSERT_NO_FATAL_FAILURE(
       DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, 
&result));
 
-  MakeDateTimeTypesTable(&table, true);  // build expected result
-  ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), 
*result->schema(),
+  MakeDateTimeTypesTable(&expected, false);
+  ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(),
+                                                     *result->schema(),
                                                      
/*check_metadata=*/false));
-  ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result));
+  ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result));
+
+  // Parquet 2.4 nanoseconds are converted to microseconds
+  auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder()
+                                            .version(ParquetVersion::PARQUET_2_4)
+                                            ->build();
+  MakeDateTimeTypesTable(&expected, true);
+  ASSERT_NO_FATAL_FAILURE(
+      CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties));
 }
 
 TEST(TestArrowReadWrite, UseDeprecatedInt96) {
@@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, 
ParquetVersionTimestampDifferences) {
                               field("ts:us", t_us), field("ts:ns", t_ns)});
   auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns});
 
-  auto parquet_version_1_properties = ::parquet::default_writer_properties();
+  auto parquet_version_1_properties = ::parquet::WriterProperties::Builder()
+                                          .version(ParquetVersion::PARQUET_1_0)
+                                          ->build();
   ARROW_SUPPRESS_DEPRECATION_WARNING
   auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder()
                                             
.version(ParquetVersion::PARQUET_2_0)
@@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) {
     auto arrow_writer_props = ArrowWriterProperties::Builder();
     arrow_writer_props.store_schema();
     if (inner_type->id() == ::arrow::Type::UINT32) {
-      writer_props.version(ParquetVersion::PARQUET_2_4);
+      writer_props.version(ParquetVersion::PARQUET_2_6);
     } else if (inner_type->id() == ::arrow::Type::TIMESTAMP) {
       // By default ns is coerced to us, override that
       ::arrow::TimeUnit::type unit =
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc 
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 621f2a0e76..7c608e4424 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
                               /*is_from_converted_type=*/false,
                               /*force_set_converted_type=*/true),
        ParquetType::INT64, -1},
-      // Parquet v1, values converted to microseconds
       {"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO),
-       LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
+       LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS,
                               /*is_from_converted_type=*/false,
                               /*force_set_converted_type=*/true),
        ParquetType::INT64, -1},
@@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
        LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64,
        -1},
       {"timestamp(nanosecond, UTC)", 
::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"),
-       LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64,
+       LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), 
ParquetType::INT64,
        -1},
       {"timestamp(millisecond, CET)", 
::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"),
        LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), 
ParquetType::INT64,
@@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
        LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64,
        -1},
       {"timestamp(nanosecond, CET)", 
::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
-       LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64,
+       LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), 
ParquetType::INT64,
        -1}};
 
   std::vector<std::shared_ptr<Field>> arrow_fields;
diff --git a/cpp/src/parquet/column_writer_test.cc 
b/cpp/src/parquet/column_writer_test.cc
index ed9bb4b407..af9876370e 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -650,6 +650,7 @@ TYPED_TEST(TestPrimitiveWriter, 
DictionaryFallbackVersion1_0) {
 
 TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
   this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
+  this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
 }
 
 TEST(TestWriter, NullValuesBuffer) {
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index f16f6c49d3..0a69659508 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -214,7 +214,7 @@ class PARQUET_EXPORT WriterProperties {
           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
           pagesize_(kDefaultDataPageSize),
-          version_(ParquetVersion::PARQUET_2_4),
+          version_(ParquetVersion::PARQUET_2_6),
           data_page_version_(ParquetDataPageVersion::V1),
           created_by_(DEFAULT_CREATED_BY),
           store_decimal_as_integer_(false),
@@ -296,7 +296,7 @@ class PARQUET_EXPORT WriterProperties {
     }
 
     /// Specify the Parquet file version.
-    /// Default PARQUET_2_4.
+    /// Default PARQUET_2_6.
     Builder* version(ParquetVersion::type version) {
       version_ = version;
       return this;
diff --git a/cpp/src/parquet/properties_test.cc 
b/cpp/src/parquet/properties_test.cc
index 5fd182f679..2ba1b8a604 100644
--- a/cpp/src/parquet/properties_test.cc
+++ b/cpp/src/parquet/properties_test.cc
@@ -44,7 +44,7 @@ TEST(TestWriterProperties, Basics) {
 
   ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
   ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, 
props->dictionary_pagesize_limit());
-  ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version());
+  ASSERT_EQ(ParquetVersion::PARQUET_2_6, props->version());
   ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
   ASSERT_FALSE(props->page_checksum_enabled());
 }
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 781c7e23be..2f53d5fbba 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -710,7 +710,7 @@ cdef class FileMetaData(_Weakrefable):
         """
         Parquet format version used in file (str, such as '1.0', '2.4').
 
-        If version is missing or unparsable, will default to assuming '2.4'.
+        If version is missing or unparsable, will default to assuming '2.6'.
         """
         cdef ParquetVersion version = self._metadata.version()
         if version == ParquetVersion_V1:
@@ -722,9 +722,9 @@ cdef class FileMetaData(_Weakrefable):
         elif version == ParquetVersion_V2_6:
             return '2.6'
         else:
-            warnings.warn('Unrecognized file version, assuming 2.4: {}'
+            warnings.warn('Unrecognized file version, assuming 2.6: {}'
                           .format(version))
-            return '2.4'
+            return '2.6'
 
     @property
     def created_by(self):
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 0675a2c9cc..e7945499be 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -748,7 +748,7 @@ def _sanitize_table(table, new_schema, flavor):
         return table
 
 
-_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
+_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6"
     Determine which Parquet logical types are available for use, whether the
     reduced set from the Parquet 1.x.x format or the expanded logical types
     added in later format versions.
@@ -944,7 +944,7 @@ Examples
 
     def __init__(self, where, schema, filesystem=None,
                  flavor=None,
-                 version='2.4',
+                 version='2.6',
                  use_dictionary=True,
                  compression='snappy',
                  write_statistics=True,
@@ -3060,7 +3060,7 @@ read_pandas.__doc__ = _read_table_docstring.format(
     _DNF_filter_doc, "")
 
 
-def write_table(table, where, row_group_size=None, version='2.4',
+def write_table(table, where, row_group_size=None, version='2.6',
                 use_dictionary=True, compression='snappy',
                 write_statistics=True,
                 use_deprecated_int96_timestamps=None,
diff --git a/python/pyarrow/tests/parquet/test_basic.py 
b/python/pyarrow/tests/parquet/test_basic.py
index 5c0e1a077d..9bc59cbcf9 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -630,7 +630,9 @@ def test_write_error_deletes_incomplete_file(tempdir):
 
     filename = tempdir / 'tmp_file'
     try:
-        _write_table(pdf, filename)
+        # Test relies on writing nanoseconds to raise an error
+        # true for Parquet 2.4
+        _write_table(pdf, filename, version="2.4")
     except pa.ArrowException:
         pass
 
diff --git a/python/pyarrow/tests/parquet/test_datetime.py 
b/python/pyarrow/tests/parquet/test_datetime.py
index bb1107972b..1cad82b839 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -336,9 +336,10 @@ def test_coerce_int96_timestamp_overflow(pq_reader_method, 
tempdir):
     tm.assert_frame_equal(df, df_correct)
 
 
-def test_timestamp_restore_timezone():
+@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
+def test_timestamp_restore_timezone(unit):
     # ARROW-5888, restore timezone from serialized metadata
-    ty = pa.timestamp('ms', tz='America/New_York')
+    ty = pa.timestamp(unit, tz='America/New_York')
     arr = pa.array([1, 2, 3], type=ty)
     t = pa.table([arr], names=['f0'])
     _check_roundtrip(t)
@@ -346,13 +347,13 @@ def test_timestamp_restore_timezone():
 
 def test_timestamp_restore_timezone_nanosecond():
     # ARROW-9634, also restore timezone for nanosecond data that get stored
-    # as microseconds in the parquet file
+    # as microseconds in the parquet file for Parquet ver 2.4 and less
     ty = pa.timestamp('ns', tz='America/New_York')
     arr = pa.array([1000, 2000, 3000], type=ty)
     table = pa.table([arr], names=['f0'])
     ty_us = pa.timestamp('us', tz='America/New_York')
     expected = pa.table([arr.cast(ty_us)], names=['f0'])
-    _check_roundtrip(table, expected=expected)
+    _check_roundtrip(table, expected=expected, version='2.4')
 
 
 @pytest.mark.pandas
@@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences():
     a_us = pa.array(d_us, type=pa.timestamp('us'))
     a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
 
+    all_versions = ['1.0', '2.4', '2.6']
+
     names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
     table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
 
-    # Using Parquet version 1.0, seconds should be coerced to milliseconds
+    # Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds
     # and nanoseconds should be coerced to microseconds by default
     expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
-    _check_roundtrip(table, expected)
+    _check_roundtrip(table, expected, version='1.0')
+    _check_roundtrip(table, expected, version='2.4')
 
-    # Using Parquet version 2.0, seconds should be coerced to milliseconds
+    # Using Parquet version 2.6, seconds should be coerced to milliseconds
     # and nanoseconds should be retained by default
     expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
     _check_roundtrip(table, expected, version='2.6')
 
-    # Using Parquet version 1.0, coercing to milliseconds or microseconds
+    # For either Parquet version coercing to milliseconds or microseconds
     # is allowed
     expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
-    _check_roundtrip(table, expected, coerce_timestamps='ms')
+    for ver in all_versions:
+        _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)
 
-    # Using Parquet version 2.0, coercing to milliseconds or microseconds
-    # is allowed
     expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
-    _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
+    for ver in all_versions:
+        _check_roundtrip(table, expected, version=ver, coerce_timestamps='us')
 
     # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
     # following should pass ...
@@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences():
     # For either Parquet version, coercing to nanoseconds is allowed
     # if Int96 storage is used
     expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
-    _check_roundtrip(table, expected,
-                     use_deprecated_int96_timestamps=True)
-    _check_roundtrip(table, expected, version='2.6',
-                     use_deprecated_int96_timestamps=True)
+    for ver in all_versions:
+        _check_roundtrip(table, expected, version=ver,
+                         use_deprecated_int96_timestamps=True)
 
 
 @pytest.mark.pandas
diff --git a/python/pyarrow/tests/parquet/test_pandas.py 
b/python/pyarrow/tests/parquet/test_pandas.py
index ea99906427..6bd68e08fc 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, 
use_legacy_dataset):
     arrow_table = pa.Table.from_pandas(df)
 
     with filename.open('wb') as f:
-        _write_table(arrow_table, f, version="2.4")
+        _write_table(arrow_table, f, version="2.6")
 
     data = io.BytesIO(filename.read_bytes())

[arrow] branch main updated: GH-35746: [Parquet][C++][Python] Switch default Parquet version to 2.6 (#36137)

Reply via email to