This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 9b584547c7 GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset 
field and no longer write Metadata at end of Chunk (#43428)
9b584547c7 is described below

commit 9b584547c768fb09b2e33b4ad8797cf45c3b3b97
Author: mwish <[email protected]>
AuthorDate: Wed Aug 7 20:01:04 2024 +0800

    GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and no 
longer write Metadata at end of Chunk (#43428)
    
    
    
    ### Rationale for this change
    
    See github issue
    
    ### What changes are included in this PR?
    
    Force `ColumnChunk::file_offset` tobe 0
    
    ### Are these changes tested?
    
    No
    
    ### Are there any user-facing changes?
    
    Might affect user using legacy `ColumnChunk::file_offset`
    
    * GitHub Issue: #43427
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: mwish <[email protected]>
---
 c_glib/test/parquet/test-column-chunk-metadata.rb | 2 +-
 cpp/src/parquet/column_writer.cc                  | 5 -----
 cpp/src/parquet/metadata.cc                       | 6 +++---
 cpp/src/parquet/metadata.h                        | 7 ++++++-
 python/pyarrow/tests/parquet/test_metadata.py     | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/c_glib/test/parquet/test-column-chunk-metadata.rb 
b/c_glib/test/parquet/test-column-chunk-metadata.rb
index f0012f0124..4612e5bf0c 100644
--- a/c_glib/test/parquet/test-column-chunk-metadata.rb
+++ b/c_glib/test/parquet/test-column-chunk-metadata.rb
@@ -77,7 +77,7 @@ class TestParquetColumnChunkMetadata < Test::Unit::TestCase
 
   test("#file_offset") do
     assert do
-      @metadata.file_offset > 0
+      @metadata.file_offset == 0
     end
   end
 
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 90e0102b42..f859ec9653 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter {
                       total_compressed_size_, total_uncompressed_size_, 
has_dictionary,
                       fallback, dict_encoding_stats_, data_encoding_stats_,
                       meta_encryptor_);
-    // Write metadata at end of column chunk
-    metadata_->WriteTo(sink_.get());
   }
 
   /**
@@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter {
                       has_dictionary, fallback, pager_->dict_encoding_stats_,
                       pager_->data_encoding_stats_, pager_->meta_encryptor_);
 
-    // Write metadata at end of column chunk
-    metadata_->WriteTo(in_memory_sink_.get());
-
     // Buffered page writer needs to adjust page offsets.
     pager_->FinishPageIndexes(final_position);
 
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index fe16f5b76b..10c8afaf37 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1536,10 +1536,10 @@ class 
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
               const std::shared_ptr<Encryptor>& encryptor) {
     if (dictionary_page_offset > 0) {
       
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
-      column_chunk_->__set_file_offset(dictionary_page_offset + 
compressed_size);
-    } else {
-      column_chunk_->__set_file_offset(data_page_offset + compressed_size);
     }
+    // The `file_offset` field is deprecated and should be set to 0.
+    // See https://github.com/apache/parquet-format/pull/440 for detail.
+    column_chunk_->__set_file_offset(0);
     column_chunk_->__isset.meta_data = true;
     column_chunk_->meta_data.__set_num_values(num_values);
     if (index_page_offset >= 0) {
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index e02d2e7c85..e46297540b 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData {
 
   bool Equals(const ColumnChunkMetaData& other) const;
 
-  // column chunk
+  // Byte offset of `ColumnMetaData` in `file_path()`.
+  //
+  // Note that the meaning of this field has been inconsistent among 
implementations
+  // so its use has since been deprecated in the Parquet specification. Modern
+  // implementations will set this to `0` to indicate that the 
`ColumnMetaData` is solely
+  // contained in the `ColumnChunk` struct.
   int64_t file_offset() const;
 
   // parameter is only used when a dataset is spread across multiple files
diff --git a/python/pyarrow/tests/parquet/test_metadata.py 
b/python/pyarrow/tests/parquet/test_metadata.py
index 52ab59a961..528cf0110d 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -122,7 +122,7 @@ def test_parquet_metadata_api():
         col_meta = rg_meta.column(ncols + 2)
 
     col_meta = rg_meta.column(0)
-    assert col_meta.file_offset > 0
+    assert col_meta.file_offset == 0
     assert col_meta.file_path == ''  # created from BytesIO
     assert col_meta.physical_type == 'BOOLEAN'
     assert col_meta.num_values == 10000

Reply via email to