Tom-Newton commented on code in PR #38780:
URL: https://github.com/apache/arrow/pull/38780#discussion_r1398395091
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -461,6 +463,224 @@ class ObjectInputFile final : public io::RandomAccessFile {
   int64_t content_length_ = kNoSize;
   std::shared_ptr<const KeyValueMetadata> metadata_;
 };
+
+Status CreateEmptyBlockBlob(
+    std::shared_ptr<Azure::Storage::Blobs::BlockBlobClient> block_blob_client) {
+  std::string s = "";
+  try {
+    block_blob_client->UploadFrom(
+        const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(s.data())), s.size());
+  } catch (const Azure::Storage::StorageException& exception) {
+    return internal::ExceptionToStatus(
+        "UploadFrom failed for '" + block_blob_client->GetUrl() +
+            "' with an unexpected Azure error. There is no existing blob at this "
+            "location or the existing blob must be replaced so ObjectAppendStream must "
+            "create a new empty block blob.",
+        exception);
+  }
+  return Status::OK();
+}
+
+Result<Azure::Storage::Blobs::Models::GetBlockListResult> GetBlockList(
+    std::shared_ptr<Azure::Storage::Blobs::BlockBlobClient> block_blob_client) {
+  try {
+    return block_blob_client->GetBlockList().Value;
+  } catch (Azure::Storage::StorageException& exception) {
+    return internal::ExceptionToStatus(
+        "GetBlockList failed for '" + block_blob_client->GetUrl() +
+            "' with an unexpected Azure error. Cannot write to a file without first "
+            "fetching the existing block list.",
+        exception);
+  }
+}
+
+Azure::Storage::Metadata ArrowMetadataToAzureMetadata(
+    const std::shared_ptr<const KeyValueMetadata>& arrow_metadata) {
+  Azure::Storage::Metadata azure_metadata;
+  for (auto key_value : arrow_metadata->sorted_pairs()) {
+    azure_metadata[key_value.first] = key_value.second;
+  }
+  return azure_metadata;
+}
+
+Status CommitBlockList(
+    std::shared_ptr<Azure::Storage::Blobs::BlockBlobClient> block_blob_client,
+    const std::vector<std::string>& block_ids,
+    const Azure::Storage::Metadata& metadata) {
+  Azure::Storage::Blobs::CommitBlockListOptions options;
+  options.Metadata = metadata;
+  try {
+    // CommitBlockList puts all block_ids in the latest element. That means in the
+    // case of overlapping block_ids the newly staged block ids will always replace
+    // the previously committed blocks.
+    // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body
+    block_blob_client->CommitBlockList(block_ids, options);
+  } catch (const Azure::Storage::StorageException& exception) {
+    return internal::ExceptionToStatus(
+        "CommitBlockList failed for '" + block_blob_client->GetUrl() +
+            "' with an unexpected Azure error. Committing is required to flush an "
+            "output/append stream.",
+        exception);
+  }
+  return Status::OK();
+}
+
+class ObjectAppendStream final : public io::OutputStream {
+ public:
+  ObjectAppendStream(
+      std::shared_ptr<Azure::Storage::Blobs::BlockBlobClient> block_blob_client,
+      const io::IOContext& io_context, const AzureLocation& location,
+      const std::shared_ptr<const KeyValueMetadata>& metadata,
+      const AzureOptions& options, int64_t size = kNoSize)
+      : block_blob_client_(std::move(block_blob_client)),
+        io_context_(io_context),
+        location_(location),
+        content_length_(size) {
+    if (metadata && metadata->size() != 0) {
+      metadata_ = ArrowMetadataToAzureMetadata(metadata);
+    } else if (options.default_metadata && options.default_metadata->size() != 0) {
+      metadata_ = ArrowMetadataToAzureMetadata(options.default_metadata);
+    }
Review Comment:
Closing/flushing an append stream will always completely replace the old
metadata. This is covered by the `AzuriteFileSystemTest, TestWriteMetadata`
test, which I largely copied from `gcsfs_test.cc`.
I don't feel strongly about this, but I think replacing is a reasonable choice.
If we merged instead, there would be no way to remove metadata keys through the
Arrow filesystem. Replacing is also simpler to implement than merging.
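To make the behaviour concrete, here is a rough, untested sketch of what a caller
would observe through the generic `FileSystem::OpenOutputStream(path, metadata)`
overload. The `fs` setup, the path and the key names are placeholders for
illustration, not part of this diff:

```cpp
#include <memory>

#include "arrow/filesystem/filesystem.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/key_value_metadata.h"

// `fs` is any FileSystem backed by Azure blob storage (placeholder setup).
arrow::Status WriteTwice(std::shared_ptr<arrow::fs::FileSystem> fs) {
  // First write attaches {"a": "1", "b": "2"} to the blob.
  auto first = arrow::key_value_metadata({"a", "b"}, {"1", "2"});
  ARROW_ASSIGN_OR_RAISE(auto out1,
                        fs->OpenOutputStream("container/file.txt", first));
  ARROW_RETURN_NOT_OK(out1->Write("hello"));
  ARROW_RETURN_NOT_OK(out1->Close());

  // Second write attaches only {"a": "3"}. Flush/Close commits the block list
  // with exactly this metadata, so "b" is dropped rather than merged in.
  auto second = arrow::key_value_metadata({"a"}, {"3"});
  ARROW_ASSIGN_OR_RAISE(auto out2,
                        fs->OpenOutputStream("container/file.txt", second));
  ARROW_RETURN_NOT_OK(out2->Write("world"));
  ARROW_RETURN_NOT_OK(out2->Close());
  // The blob's metadata is now {"a": "3"} only.
  return arrow::Status::OK();
}
```

So the second `Close()` leaves only the keys passed on the second open, which is
the replace semantics described above.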