This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d21a924f30 GH-42102: [C++][Parquet] Add binary that extracts a footer
from a parquet file (#42174)
d21a924f30 is described below
commit d21a924f3012c1e589a3393ebae2c78ee290ba5c
Author: Alkis Evlogimenos <[email protected]>
AuthorDate: Mon Jul 22 17:55:28 2024 +0300
GH-42102: [C++][Parquet] Add binary that extracts a footer from a parquet
file (#42174)
### Rationale for this change
This binary will make it a lot easier for customers to share their parquet
metadata with the community so that we can build a repository of footers that
can be used for advancing the state of metadata in parquet.
### What changes are included in this PR?
Usage, as printed by the binary itself:
```
Usage: parquet-dump-footer
-h|--help Print help and exit
--no-scrub Do not scrub potentially confidential metadata
--debug Output text representation of footer for inspection
--in <uri> Input file (required): must be an URI or an absolute local
path
--out <path> Output file (optional, default stdout)
Dump the footer of a Parquet file to stdout or to a file, optionally with
potentially confidential metadata scrubbed.
```
### Are these changes tested?
Manually on existing parquet files.
### Are there any user-facing changes?
No.
* GitHub Issue: #42102
Lead-authored-by: Alkis Evlogimenos <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/metadata.cc | 65 +++++++++++++++
cpp/src/parquet/metadata.h | 7 ++
cpp/tools/parquet/CMakeLists.txt | 3 +-
cpp/tools/parquet/parquet_dump_footer.cc | 135 +++++++++++++++++++++++++++++++
4 files changed, 209 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index ee83918189..7bab910461 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -21,6 +21,8 @@
#include <cinttypes>
#include <memory>
#include <ostream>
+#include <random>
+#include <sstream>
#include <string>
#include <string_view>
#include <utility>
@@ -29,6 +31,7 @@
#include "arrow/io/memory.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
+#include "arrow/util/pcg_random.h"
#include "parquet/encryption/encryption_internal.h"
#include "parquet/encryption/internal_file_decryptor.h"
#include "parquet/exception.h"
@@ -599,6 +602,49 @@ std::vector<SortingColumn> RowGroupMetaData::sorting_columns() const {
return impl_->sorting_columns();
}
+// Overwrite every character of `*s`, in place, with a randomly drawn uppercase
+// ASCII letter. The length of the string is left unchanged.
+static void Scrub(std::string* s) {
+  // A single generator is shared by all calls, so each call continues the
+  // sequence where the previous one stopped.
+  static ::arrow::random::pcg64 rng;
+  std::uniform_int_distribution<> upper_letter('A', 'Z');
+  for (auto it = s->begin(); it != s->end(); ++it) {
+    *it = upper_letter(rng);
+  }
+}
+
+// Overwrite, in place, the string fields of `md` that may hold confidential
+// information: schema element names, per-column file paths and schema paths,
+// key/value metadata, statistics min/max values, encrypted column metadata and
+// encryption key metadata.
+static void Scrub(format::FileMetaData* md) {
+  for (auto& element : md->schema) {
+    Scrub(&element.name);
+  }
+  for (auto& row_group : md->row_groups) {
+    for (auto& chunk : row_group.columns) {
+      Scrub(&chunk.file_path);
+      if (chunk.__isset.meta_data) {
+        auto& column_md = chunk.meta_data;
+        for (auto& path_part : column_md.path_in_schema) {
+          Scrub(&path_part);
+        }
+        for (auto& entry : column_md.key_value_metadata) {
+          Scrub(&entry.key);
+          Scrub(&entry.value);
+        }
+        auto& stats = column_md.statistics;
+        Scrub(&stats.max_value);
+        Scrub(&stats.min_value);
+        Scrub(&stats.min);
+        Scrub(&stats.max);
+      }
+
+      auto& crypto = chunk.crypto_metadata;
+      if (crypto.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+        auto& column_key = crypto.ENCRYPTION_WITH_COLUMN_KEY;
+        for (auto& path_part : column_key.path_in_schema) {
+          Scrub(&path_part);
+        }
+        Scrub(&column_key.key_metadata);
+      }
+      Scrub(&chunk.encrypted_column_metadata);
+    }
+  }
+  for (auto& entry : md->key_value_metadata) {
+    Scrub(&entry.key);
+    Scrub(&entry.value);
+  }
+  Scrub(&md->footer_signing_key_metadata);
+}
+
// file metadata
class FileMetaData::FileMetaDataImpl {
public:
@@ -821,6 +867,21 @@ class FileMetaData::FileMetaDataImpl {
return out;
}
+ // Serialize a copy of the metadata to a string; the stored metadata itself is
+ // not modified. When `scrub` is true the copy is passed through Scrub()
+ // first. When `debug` is true the result is the text produced by printTo(),
+ // otherwise it is the output of ThriftSerializer.
+ std::string SerializeUnencrypted(bool scrub, bool debug) const {
+   auto md_copy = *metadata_;
+   if (scrub) {
+     Scrub(&md_copy);
+   }
+   if (!debug) {
+     std::string thrift_bytes;
+     ThriftSerializer serializer;
+     serializer.SerializeToString(&md_copy, &thrift_bytes);
+     return thrift_bytes;
+   }
+   std::ostringstream text;
+   md_copy.printTo(text);
+   return text.str();
+ }
+
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor>
file_decryptor) {
file_decryptor_ = std::move(file_decryptor);
}
@@ -992,6 +1053,10 @@ std::shared_ptr<FileMetaData> FileMetaData::Subset(
return impl_->Subset(row_groups);
}
+// Forwards to the implementation object. The second parameter is named `debug`
+// to match the declaration in metadata.h and the implementation it calls.
+std::string FileMetaData::SerializeUnencrypted(bool scrub, bool debug) const {
+  return impl_->SerializeUnencrypted(scrub, debug);
+}
+
void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
const std::shared_ptr<Encryptor>& encryptor) const {
return impl_->WriteTo(dst, encryptor);
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 9fc30df58e..e02d2e7c85 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -396,6 +396,13 @@ class PARQUET_EXPORT FileMetaData {
/// FileMetaData.
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups)
const;
+ /// \brief Serialize metadata unencrypted as string
+ ///
+ /// \param[in] scrub whether to remove sensitive information from the metadata.
+ /// \param[in] debug whether to serialize the metadata as Thrift (if false) or
+ /// debug text (if true).
+ std::string SerializeUnencrypted(bool scrub, bool debug) const;
+
private:
friend FileMetaDataBuilder;
friend class SerializedFile;
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index 81ab49421d..e05645da28 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -16,7 +16,7 @@
# under the License.
if(PARQUET_BUILD_EXECUTABLES)
- set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan)
+ set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader
parquet-scan)
foreach(TOOL ${PARQUET_TOOLS})
string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL})
@@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES)
install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endforeach(TOOL)
+ target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES})
add_dependencies(parquet ${PARQUET_TOOLS})
endif()
diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc
new file mode 100644
index 0000000000..c7a4b78fdd
--- /dev/null
+++ b/cpp/tools/parquet/parquet_dump_footer.cc
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <string>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/metadata.h"
+
+namespace parquet {
+namespace {
+// Load a uint32_t from the bytes at `p` via SafeLoadAs and convert it from
+// little-endian to host byte order.
+uint32_t ReadLE32(const void* p) {
+  const auto* bytes = static_cast<const uint8_t*>(p);
+  const uint32_t raw = ::arrow::util::SafeLoadAs<uint32_t>(bytes);
+  return ::arrow::bit_util::FromLittleEndian(raw);
+}
+
+// Convert `v` to little-endian byte order and append its bytes to `*out`.
+void AppendLE32(uint32_t v, std::string* out) {
+  const uint32_t le_value = ::arrow::bit_util::ToLittleEndian(v);
+  const char* le_bytes = reinterpret_cast<const char*>(&le_value);
+  out->append(le_bytes, sizeof(le_value));
+}
+
+// Read the footer of the Parquet file `in`, re-serialize it and write the
+// result to `out`.
+//
+// \param in input file, passed to FileSystemFromUriOrPath
+// \param scrub forwarded to FileMetaData::SerializeUnencrypted
+// \param json forwarded as the second argument of SerializeUnencrypted; when
+//   false, the serialized footer is followed by its length (4 bytes,
+//   little-endian) and the "PAR1" magic, when true only the serialized text is
+//   written
+// \param out output path; when empty the result goes to stdout
+// \return 0 on success; 3 or 5 if the file is too short; 4 if the file does
+//   not end with "PAR1"; 6 if writing the output fails
+int DoIt(const std::string& in, bool scrub, bool json, const std::string& out) {
+  // Size of the trailer checked below: 4-byte metadata length + 4-byte magic.
+  constexpr int64_t kTrailerLen = 8;
+
+  std::string path;
+  auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie();
+  auto file = fs->OpenInputFile(path).ValueOrDie();
+  const int64_t file_len = file->GetSize().ValueOrDie();
+  if (file_len < kTrailerLen) {
+    std::cerr << "File too short: " << in << "\n";
+    return 3;
+  }
+  // First do an opportunistic read of up to 1 MiB to try and get the entire
+  // footer.
+  const int64_t tail_len = std::min(file_len, int64_t{1} << 20);
+  std::string tail;
+  tail.resize(tail_len);
+  char* data = tail.data();
+  file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie();
+  if (ReadLE32(data + tail_len - 4) != ReadLE32("PAR1")) {
+    std::cerr << "Not a Parquet file: " << in << "\n";
+    return 4;
+  }
+  uint32_t metadata_len = ReadLE32(data + tail_len - 8);
+  // Widen to 64 bits before adding the trailer: in 32-bit arithmetic a length
+  // close to UINT32_MAX would wrap around and pass the size checks.
+  const int64_t footer_len = int64_t{metadata_len} + kTrailerLen;
+  if (footer_len > file_len) {
+    // The declared metadata plus trailer does not fit in the file; reading it
+    // would require a negative file offset.
+    std::cerr << "File too short: " << in << "\n";
+    return 5;
+  }
+  if (footer_len <= tail_len) {
+    // The footer is entirely in the initial read. Trim to size.
+    tail = tail.substr(tail_len - footer_len);
+  } else {
+    // The footer is larger than the initial read, read again the exact size.
+    tail.resize(footer_len);
+    file->ReadAt(file_len - footer_len, footer_len, tail.data()).ValueOrDie();
+  }
+  auto md = FileMetaData::Make(tail.data(), &metadata_len);
+  std::string ser = md->SerializeUnencrypted(scrub, json);
+  if (!json) {
+    AppendLE32(static_cast<uint32_t>(ser.size()), &ser);
+    ser.append("PAR1", 4);
+  }
+  std::optional<std::fstream> fout;
+  // Binary mode: the serialized footer is raw bytes and must not go through
+  // newline translation on platforms that distinguish text and binary files.
+  if (!out.empty()) fout.emplace(out, std::ios::out | std::ios::binary);
+  std::ostream& os = fout ? *fout : std::cout;
+  // Flush explicitly so that an error which only shows up when the buffered
+  // data reaches its destination is reported as well.
+  if (!os.write(ser.data(), ser.size()) || !os.flush()) {
+    std::cerr << "Failed to write to output file: " << out << "\n";
+    return 6;
+  }
+
+  return 0;
+}
+} // namespace
+} // namespace parquet
+
+// Print the usage text to stderr. Always returns 1, which main() returns
+// directly as the process exit status.
+static int PrintHelp() {
+  std::cerr << R"(Usage: parquet-dump-footer
+ -h|--help Print help and exit
+ --no-scrub Do not scrub potentially confidential metadata
+ --debug Output text representation of footer for inspection
+ --in <uri> Input file (required): must be an URI or an absolute local path
+ --out <path> Output file (optional, default stdout)
+
+ Dump the footer of a Parquet file to stdout or to a file, optionally with
+ potentially confidential metadata scrubbed.
+)";
+  return 1;
+}
+
+// Parse the command line and run the footer dump.
+//
+// Returns the value of PrintHelp() for -h/--help, for an unknown option, for
+// an option that is missing its value, and when --in was not given. Otherwise
+// returns the value of parquet::DoIt().
+int main(int argc, char** argv) {
+  bool scrub = true;
+  bool debug = false;
+  std::string in;
+  std::string out;
+  for (int i = 1; i < argc; i++) {
+    const char* arg = argv[i];
+    if (!std::strcmp(arg, "-h") || !std::strcmp(arg, "--help")) {
+      return PrintHelp();
+    } else if (!std::strcmp(arg, "--no-scrub")) {
+      scrub = false;
+    } else if (!std::strcmp(arg, "--debug") || !std::strcmp(arg, "--json")) {
+      // --debug is the spelling shown in the usage text; --json was the only
+      // spelling accepted previously and is kept as an alias.
+      debug = true;
+    } else if (!std::strcmp(arg, "--in")) {
+      if (i + 1 >= argc) return PrintHelp();
+      in = argv[++i];
+    } else if (!std::strcmp(arg, "--out")) {
+      if (i + 1 >= argc) return PrintHelp();
+      out = argv[++i];
+    } else {
+      // Unknown option.
+      return PrintHelp();
+    }
+  }
+  if (in.empty()) return PrintHelp();
+
+  return parquet::DoIt(in, scrub, debug, out);
+}