This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c24bc29100 GH-49576: [Ruby] Add support for custom metadata in Footer (#49577)
c24bc29100 is described below
commit c24bc29100a141ecc7372e1f786d6505c6337360
Author: Sutou Kouhei <[email protected]>
AuthorDate: Mon Mar 23 06:32:49 2026 +0900
GH-49576: [Ruby] Add support for custom metadata in Footer (#49577)
### Rationale for this change
In the Arrow IPC file format, the Footer can have custom metadata.
### What changes are included in this PR?
* Add `garrow_record_batch_file_reader_get_metadata()`
* Add `garrow_record_batch_file_writer_new_full()`
* Add `ArrowFormat::FileReader#metadata`
* Add an optional `metadata` argument to `ArrowFormat::FileWriter#finish`
* Add a `metadata:` option to `Arrow::Table#save`
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #49576
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
c_glib/arrow-glib/reader.cpp | 29 +++++++++++++++
c_glib/arrow-glib/reader.h | 4 +++
c_glib/arrow-glib/writer.cpp | 42 ++++++++++++++++++++--
c_glib/arrow-glib/writer.h | 9 +++++
c_glib/test/test-file-writer.rb | 32 +++++++++++++++++
ruby/red-arrow-format/Gemfile | 1 +
.../lib/arrow-format/file-reader.rb | 2 ++
.../lib/arrow-format/file-writer.rb | 21 +++++++----
ruby/red-arrow-format/test/helper.rb | 1 +
ruby/red-arrow-format/test/test-reader.rb | 32 +++++++++++++----
ruby/red-arrow-format/test/test-writer.rb | 21 +++++++++++
ruby/red-arrow/lib/arrow/table-saver.rb | 6 ++--
12 files changed, 181 insertions(+), 19 deletions(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 9fe9d9d1b3..f6e0d3064d 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -668,6 +668,35 @@
garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
}
}
+/**
+ * garrow_record_batch_file_reader_get_metadata:
+ * @reader: A #GArrowRecordBatchFileReader.
+ *
+ * Returns: (nullable) (element-type utf8 utf8) (transfer full):
+ * The metadata in the footer.
+ *
+ * Since: 24.0.0
+ */
+GHashTable *
+garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader
*reader)
+{
+ auto arrow_reader = garrow_record_batch_file_reader_get_raw(reader);
+ auto arrow_metadata = arrow_reader->metadata();
+
+ if (!arrow_metadata) {
+ return nullptr;
+ }
+
+ auto metadata = g_hash_table_new(g_str_hash, g_str_equal);
+ const auto n = arrow_metadata->size();
+ for (int64_t i = 0; i < n; ++i) {
+ g_hash_table_insert(metadata,
+ const_cast<gchar *>(arrow_metadata->key(i).c_str()),
+ const_cast<gchar *>(arrow_metadata->value(i).c_str()));
+ }
+ return metadata;
+}
+
struct GArrowFeatherFileReaderPrivate
{
std::shared_ptr<arrow::ipc::feather::Reader> feather_reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 5401aa3bb1..1e896fd09f 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -166,6 +166,10 @@
garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
guint i,
GError **error);
+GARROW_AVAILABLE_IN_24_0
+GHashTable *
+garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader
*reader);
+
#define GARROW_TYPE_FEATHER_FILE_READER (garrow_feather_file_reader_get_type())
GARROW_AVAILABLE_IN_ALL
G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileReader,
diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp
index 4228b60910..0cbd88a769 100644
--- a/c_glib/arrow-glib/writer.cpp
+++ b/c_glib/arrow-glib/writer.cpp
@@ -20,6 +20,8 @@
#include <arrow-glib/array.hpp>
#include <arrow-glib/enums.h>
#include <arrow-glib/error.hpp>
+#include <arrow-glib/internal-hash-table.hpp>
+#include <arrow-glib/ipc-options.hpp>
#include <arrow-glib/record-batch.hpp>
#include <arrow-glib/schema.hpp>
#include <arrow-glib/table.hpp>
@@ -288,16 +290,50 @@ GArrowRecordBatchFileWriter *
garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
GArrowSchema *schema,
GError **error)
+{
+ return garrow_record_batch_file_writer_new_full(sink, schema, nullptr,
nullptr, error);
+}
+
+/**
+ * garrow_record_batch_file_writer_new_full:
+ * @sink: The output of the writer.
+ * @schema: The schema of the writer.
+ * @options: (nullable): The options for serialization.
+ * @metadata: (nullable) (element-type utf8 utf8): The custom metadata in
+ * the footer.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowRecordBatchFileWriter
+ * or %NULL on error.
+ *
+ * Since: 24.0.0
+ */
+GArrowRecordBatchFileWriter *
+garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
+ GArrowSchema *schema,
+ GArrowWriteOptions *options,
+ GHashTable *metadata,
+ GError **error)
{
auto arrow_sink = garrow_output_stream_get_raw(sink);
auto arrow_schema = garrow_schema_get_raw(schema);
+ arrow::ipc::IpcWriteOptions arrow_options =
arrow::ipc::IpcWriteOptions::Defaults();
+ if (options) {
+ arrow_options = *garrow_write_options_get_raw(options);
+ }
+ std::shared_ptr<arrow::KeyValueMetadata> arrow_metadata;
+ if (metadata) {
+ arrow_metadata = garrow_internal_hash_table_to_metadata(metadata);
+ }
+
std::shared_ptr<arrow::ipc::RecordBatchWriter> arrow_writer;
- auto arrow_writer_result = arrow::ipc::MakeFileWriter(arrow_sink,
arrow_schema);
- if (garrow::check(error, arrow_writer_result,
"[record-batch-file-writer][open]")) {
+ auto arrow_writer_result =
+ arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema, arrow_options,
arrow_metadata);
+ if (garrow::check(error, arrow_writer_result,
"[record-batch-file-writer][new]")) {
auto arrow_writer = *arrow_writer_result;
return garrow_record_batch_file_writer_new_raw(&arrow_writer);
} else {
- return NULL;
+ return nullptr;
}
}
diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h
index fc5fe0c2c7..e02da0e30d 100644
--- a/c_glib/arrow-glib/writer.h
+++ b/c_glib/arrow-glib/writer.h
@@ -20,6 +20,7 @@
#pragma once
#include <arrow-glib/array.h>
+#include <arrow-glib/ipc-options.h>
#include <arrow-glib/record-batch.h>
#include <arrow-glib/schema.h>
@@ -94,6 +95,14 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
GArrowSchema *schema,
GError **error);
+GARROW_AVAILABLE_IN_24_0
+GArrowRecordBatchFileWriter *
+garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
+ GArrowSchema *schema,
+ GArrowWriteOptions *options,
+ GHashTable *metadata,
+ GError **error);
+
/**
* GArrowCSVQuotingStyle:
* @GARROW_CSV_QUOTING_STYLE_NEEDED: Only enclose values in quotes which need
them.
diff --git a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb
index 06c9dfa25c..41fd00cee4 100644
--- a/c_glib/test/test-file-writer.rb
+++ b/c_glib/test/test-file-writer.rb
@@ -88,4 +88,36 @@ class TestFileWriter < Test::Unit::TestCase
input.close
end
end
+
+ def test_footer_custom_metadata
+ tempfile = Tempfile.open("arrow-ipc-file-writer")
+ output = Arrow::FileOutputStream.new(tempfile.path, false)
+
+ array = build_boolean_array([true, false, true])
+ field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
+ schema = Arrow::Schema.new([field])
+
+ options = Arrow::WriteOptions.new
+ metadata = {"key1" => "value1", "key2" => "value2"}
+ begin
+ file_writer = Arrow::RecordBatchFileWriter.new(output,
+ schema,
+ options,
+ metadata)
+ file_writer.close
+ assert do
+ file_writer.closed?
+ end
+ ensure
+ output.close
+ end
+
+ input = Arrow::MemoryMappedInputStream.new(tempfile.path)
+ begin
+ file_reader = Arrow::RecordBatchFileReader.new(input)
+ assert_equal(metadata, file_reader.metadata)
+ ensure
+ input.close
+ end
+ end
end
diff --git a/ruby/red-arrow-format/Gemfile b/ruby/red-arrow-format/Gemfile
index 296a7b4435..34c981237c 100644
--- a/ruby/red-arrow-format/Gemfile
+++ b/ruby/red-arrow-format/Gemfile
@@ -26,5 +26,6 @@ gem "red-arrow", path: "../red-arrow"
group :development do
gem "benchmark-driver"
gem "rake"
+ gem "stringio"
gem "test-unit"
end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index 7c749e5fbf..cec3711096 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -35,6 +35,7 @@ module ArrowFormat
FOOTER_SIZE_SIZE = IO::Buffer.size_of(FOOTER_SIZE_FORMAT)
attr_reader :schema
+ attr_reader :metadata
def initialize(input)
case input
when IO
@@ -47,6 +48,7 @@ module ArrowFormat
validate
@footer = read_footer
+ @metadata = read_custom_metadata(@footer.custom_metadata)
@record_batch_blocks = @footer.record_batches || []
@schema = read_schema(@footer.schema)
@dictionaries = read_dictionaries
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
index 27b6b55bbf..2ac4695180 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
@@ -29,26 +29,33 @@ module ArrowFormat
super
end
- def finish
- super
- write_footer
+ def finish(metadata=nil)
+ super()
+ write_footer(metadata)
write_data(MAGIC)
@output
end
private
- def build_footer
+ def build_footer(metadata)
fb_footer = FB::Footer::Data.new
fb_footer.version = FB::MetadataVersion::V5
fb_footer.schema = @fb_schema
fb_footer.dictionaries = @fb_dictionary_blocks
fb_footer.record_batches = @fb_record_batch_blocks
- # fb_footer.custom_metadata = ... # TODO
+ if metadata
+ fb_footer.custom_metadata = metadata.collect do |key, value|
+ fb_key_value = FB::KeyValue::Data.new
+ fb_key_value.key = key
+ fb_key_value.value = value
+ fb_key_value
+ end
+ end
FB::Footer.serialize(fb_footer)
end
- def write_footer
- footer = build_footer
+ def write_footer(metadata)
+ footer = build_footer(metadata)
write_data(footer)
write_data([footer.bytesize].pack("l<"))
end
diff --git a/ruby/red-arrow-format/test/helper.rb
b/ruby/red-arrow-format/test/helper.rb
index 394d92d0dd..29fbfaec4c 100644
--- a/ruby/red-arrow-format/test/helper.rb
+++ b/ruby/red-arrow-format/test/helper.rb
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+require "stringio"
require "tmpdir"
require "test-unit"
diff --git a/ruby/red-arrow-format/test/test-reader.rb
b/ruby/red-arrow-format/test/test-reader.rb
index c1c6b26288..1d3202f4f7 100644
--- a/ruby/red-arrow-format/test/test-reader.rb
+++ b/ruby/red-arrow-format/test/test-reader.rb
@@ -675,18 +675,36 @@ module ReaderTests
end
end
+module FileReaderTests
+ def test_custom_metadata_footer
+ Dir.mktmpdir do |tmp_dir|
+ table = Arrow::Table.new(value: Arrow::Int8Array.new([1, 2, 3]))
+ metadata = {
+ "key1" => "value1",
+ "key2" => "value2",
+ }
+ open_input(table, tmp_dir, metadata: metadata) do |input|
+ reader = reader_class.new(input)
+ assert_equal(metadata, reader.metadata)
+ end
+ ensure
+ GC.start
+ end
+ end
+end
+
module FileInput
- def open_input(table, tmp_dir, &block)
+ def open_input(table, tmp_dir, **options, &block)
path = File.join(tmp_dir, "data.#{file_extension}")
- table.save(path)
+ table.save(path, **options)
File.open(path, "rb", &block)
end
end
module PipeInput
- def open_input(table, tmp_dir, &block)
+ def open_input(table, tmp_dir, **options)
buffer = Arrow::ResizableBuffer.new(4096)
- table.save(buffer, format: format)
+ table.save(buffer, format: format, **options)
IO.pipe do |input, output|
write_thread = Thread.new do
output.write(buffer.data.to_s)
@@ -701,15 +719,16 @@ module PipeInput
end
module StringInput
- def open_input(table, tmp_dir)
+ def open_input(table, tmp_dir, **options)
buffer = Arrow::ResizableBuffer.new(4096)
- table.save(buffer, format: format)
+ table.save(buffer, format: format, **options)
yield(buffer.data.to_s)
end
end
class TestFileReaderFileInput < Test::Unit::TestCase
include ReaderTests
+ include FileReaderTests
include FileInput
def file_extension
@@ -723,6 +742,7 @@ end
class TestFileReaderStringInput < Test::Unit::TestCase
include ReaderTests
+ include FileReaderTests
include StringInput
def format
diff --git a/ruby/red-arrow-format/test/test-writer.rb
b/ruby/red-arrow-format/test/test-writer.rb
index 72776f01ab..55b3c22b7a 100644
--- a/ruby/red-arrow-format/test/test-writer.rb
+++ b/ruby/red-arrow-format/test/test-writer.rb
@@ -924,6 +924,26 @@ module WriterTests
end
end
+module FileWriterTests
+ def test_custom_metadata_footer
+ output = StringIO.new(+"".b)
+ writer = writer_class.new(output)
+ field = ArrowFormat::Field.new("value", ArrowFormat::BooleanType.new)
+ schema = ArrowFormat::Schema.new([field])
+ writer.start(schema)
+ metadata = {
+ "key1" => "value1",
+ "key2" => "value2",
+ }
+ writer.finish(metadata)
+ buffer = Arrow::Buffer.new(output.string)
+ Arrow::BufferInputStream.open(buffer) do |input|
+ reader = Arrow::RecordBatchFileReader.new(input)
+ assert_equal(metadata, reader.metadata)
+ end
+ end
+end
+
module WriterDictionaryDeltaTests
def build_schema(value_type)
index_type = ArrowFormat::Int32Type.singleton
@@ -1513,6 +1533,7 @@ class TestFileWriter < Test::Unit::TestCase
sub_test_case("Basic") do
include WriterTests
+ include FileWriterTests
end
sub_test_case("Dictionary: delta") do
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb
b/ruby/red-arrow/lib/arrow/table-saver.rb
index c33e641438..d456f235e5 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -130,9 +130,9 @@ module Arrow
end
end
- def save_raw(writer_class)
+ def save_raw(writer_class, *args)
open_output_stream do |output|
- writer_class.open(output, @table.schema) do |writer|
+ writer_class.open(output, @table.schema, *args) do |writer|
writer.write_table(@table)
end
end
@@ -144,7 +144,7 @@ module Arrow
# @since 1.0.0
def save_as_arrow_file
- save_raw(RecordBatchFileWriter)
+ save_raw(RecordBatchFileWriter, nil, @options[:metadata])
end
# @deprecated Use `format: :arrow_batch` instead.