[
https://issues.apache.org/jira/browse/PARQUET-1348?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16560772#comment-16560772
]
ASF GitHub Bot commented on PARQUET-1348:
-----------------------------------------
xhochy closed pull request #481: PARQUET-1348: Add ability to write
FileMetaData in arrow FileWriter
URL: https://github.com/apache/parquet-cpp/pull/481
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (it will not otherwise be shown, due to how GitHub handles merged fork PRs):
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 1c2f3225..02b8d528 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -851,6 +851,38 @@ TYPED_TEST(TestParquetIO,
SingleColumnTableOptionalChunkedWrite) {
ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values));
}
+TYPED_TEST(TestParquetIO, FileMetaDataWrite) {
+ std::shared_ptr<Array> values;
+ ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
+ std::shared_ptr<Table> table = MakeSimpleTable(values, false);
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(),
this->sink_,
+ values->length(),
default_writer_properties()));
+
+ std::unique_ptr<FileReader> reader;
+ ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+ const std::shared_ptr<FileMetaData> fileMetaData =
reader->parquet_reader()->metadata();
+ ASSERT_EQ(1, fileMetaData->num_columns());
+ ASSERT_EQ(100, fileMetaData->num_rows());
+
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+
+ std::unique_ptr<FileMetaData> uniqueFileMetaData(fileMetaData.get());
+
+ ASSERT_OK_NO_THROW(FileWriter::WriteMetaData(uniqueFileMetaData,
this->sink_));
+
+ ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+ const std::shared_ptr<FileMetaData> fileMetaDataWritten =
+ reader->parquet_reader()->metadata();
+ ASSERT_EQ(fileMetaData->size(), fileMetaDataWritten->size());
+ ASSERT_EQ(fileMetaData->num_row_groups(),
fileMetaDataWritten->num_row_groups());
+ ASSERT_EQ(fileMetaData->num_rows(), fileMetaDataWritten->num_rows());
+ ASSERT_EQ(fileMetaData->num_columns(), fileMetaDataWritten->num_columns());
+ ASSERT_EQ(fileMetaData->RowGroup(0)->num_rows(),
+ fileMetaDataWritten->RowGroup(0)->num_rows());
+ uniqueFileMetaData.release();
+}
+
using TestInt96ParquetIO = TestParquetIO<::arrow::TimestampType>;
TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index f3ddda90..d1697c34 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -1092,6 +1092,19 @@ Status FileWriter::Open(const ::arrow::Schema& schema,
::arrow::MemoryPool* pool
return Open(schema, pool, wrapper, properties, arrow_properties, writer);
}
+Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>&
fileMetaData,
+ const std::shared_ptr<OutputStream>& sink) {
+ ParquetFileWriter::WriteMetaData(sink, fileMetaData);
+ return Status::OK();
+}
+
+Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>&
fileMetaData,
+ const
std::shared_ptr<::arrow::io::OutputStream>& sink) {
+ auto wrapper = std::make_shared<ArrowOutputStream>(sink);
+ return WriteMetaData(fileMetaData, wrapper);
+}
+
+
namespace {} // namespace
Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
diff --git a/src/parquet/arrow/writer.h b/src/parquet/arrow/writer.h
index 06008d2f..d62d3b0e 100644
--- a/src/parquet/arrow/writer.h
+++ b/src/parquet/arrow/writer.h
@@ -132,6 +132,14 @@ class PARQUET_EXPORT FileWriter {
const std::shared_ptr<ArrowWriterProperties>& arrow_properties,
std::unique_ptr<FileWriter>* writer);
+ static ::arrow::Status WriteMetaData(
+ const std::unique_ptr<FileMetaData>& fileMetaData,
+ const std::shared_ptr<OutputStream>& sink);
+
+ static ::arrow::Status WriteMetaData(
+ const std::unique_ptr<FileMetaData>& fileMetaData,
+ const std::shared_ptr<::arrow::io::OutputStream>& sink);
+
/// \brief Write a Table to Parquet.
::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size);
diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc
index 1e4a09e2..cc34fd0b 100644
--- a/src/parquet/file_writer.cc
+++ b/src/parquet/file_writer.cc
@@ -160,6 +160,20 @@ class FileSerializer : public ParquetFileWriter::Contents {
return result;
}
+ static void WriteMetaData(
+ const std::shared_ptr<OutputStream>& sink,
+ const std::unique_ptr<FileMetaData>& fileMetaData) {
+ // Write MetaData
+ uint32_t metadata_len = static_cast<uint32_t>(sink->Tell());
+
+ fileMetaData->WriteTo(sink.get());
+ metadata_len = static_cast<uint32_t>(sink->Tell()) - metadata_len;
+
+ // Write Footer
+ sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
+ sink->Write(PARQUET_MAGIC, 4);
+ }
+
void Close() override {
if (is_open_) {
if (row_group_writer_) {
@@ -234,17 +248,8 @@ class FileSerializer : public ParquetFileWriter::Contents {
}
void WriteMetaData() {
- // Write MetaData
- uint32_t metadata_len = static_cast<uint32_t>(sink_->Tell());
-
- // Get a FileMetaData
auto metadata = metadata_->Finish();
- metadata->WriteTo(sink_.get());
- metadata_len = static_cast<uint32_t>(sink_->Tell()) - metadata_len;
-
- // Write Footer
- sink_->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
- sink_->Write(PARQUET_MAGIC, 4);
+ WriteMetaData(sink_, metadata);
}
};
@@ -280,6 +285,18 @@ std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
return result;
}
+void ParquetFileWriter::WriteMetaData(
+ const std::shared_ptr<::arrow::io::OutputStream> &sink,
+ const std::unique_ptr<FileMetaData> &fileMetaData) {
+ WriteMetaData(std::make_shared<ArrowOutputStream>(sink), fileMetaData);
+}
+
+void ParquetFileWriter::WriteMetaData(
+ const std::shared_ptr<OutputStream> &sink,
+ const std::unique_ptr<FileMetaData> &fileMetaData) {
+ FileSerializer::WriteMetaData(sink, fileMetaData);
+}
+
const SchemaDescriptor* ParquetFileWriter::schema() const { return
contents_->schema(); }
const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
diff --git a/src/parquet/file_writer.h b/src/parquet/file_writer.h
index 9c28531f..e0d1dae9 100644
--- a/src/parquet/file_writer.h
+++ b/src/parquet/file_writer.h
@@ -133,6 +133,14 @@ class PARQUET_EXPORT ParquetFileWriter {
const std::shared_ptr<WriterProperties>& properties =
default_writer_properties(),
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata =
nullptr);
+ static void WriteMetaData(
+ const std::shared_ptr<::arrow::io::OutputStream> &sink,
+ const std::unique_ptr<FileMetaData> &fileMetaData);
+
+ static void WriteMetaData(
+ const std::shared_ptr<OutputStream> &sink,
+ const std::unique_ptr<FileMetaData> &fileMetaData);
+
void Open(std::unique_ptr<Contents> contents);
void Close();
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [C++] Allow Arrow FileWriter To Write FileMetaData
> --------------------------------------------------
>
> Key: PARQUET-1348
> URL: https://issues.apache.org/jira/browse/PARQUET-1348
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-cpp
> Reporter: Robert Gruener
> Priority: Major
> Labels: pull-request-available
> Fix For: cpp-1.5.0
>
>
> The arrow [FileWriter open
> method|https://github.com/apache/parquet-cpp/blob/master/src/parquet/arrow/writer.h#L111]
> only takes in a schema (which does not include row group information) and
> not the full FileMetaData. This does not allow the summary _metadata file to
> be created, and will need to be changed to write the full file metadata
> object.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)