Tom-Newton commented on code in PR #38505:
URL: https://github.com/apache/arrow/pull/38505#discussion_r1382616550
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -78,18 +81,17 @@ struct AzurePath {
"Expected an Azure object path of the form 'container/path...', got
a URI: '",
s, "'");
}
- const auto src = internal::RemoveTrailingSlash(s);
Review Comment:
This was preventing `GetFileInfo` from working on directories. The other
filesystems did not have this.
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -216,23 +227,223 @@ class TestAzureFileSystem : public ::testing::Test {
void UploadLines(const std::vector<std::string>& lines, const char*
path_to_file,
int total_size) {
// TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
- auto blob_client =
service_client_->GetBlobContainerClient(PreexistingContainerName())
- .GetBlockBlobClient(path_to_file);
+ auto blob_client =
+
blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient(path_to_file);
std::string all_lines = std::accumulate(lines.begin(), lines.end(),
std::string(""));
blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(all_lines.data()),
total_size);
}
};
-TEST_F(TestAzureFileSystem, OpenInputStreamString) {
+class AzuriteFileSystemTest : public AzureFileSystemTest {
+ Result<AzureOptions> MakeOptions() {
+ EXPECT_THAT(GetAzuriteEnv(), NotNull());
+ ARROW_EXPECT_OK(GetAzuriteEnv()->status());
+ AzureOptions options;
+ options.backend = AzureBackend::Azurite;
+ ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(
+ GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key()));
+ return options;
+ }
+};
+
+class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest {
+ Result<AzureOptions> MakeOptions() override {
+ AzureOptions options;
+ if (char* account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME"))
{
+ char* account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY");
+ EXPECT_THAT(account_key, NotNull());
+ ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name,
account_key));
+ return options;
+ }
+ return Status::Cancelled(
+ "Connection details not provided for a real flat namespace "
+ "account.");
+ }
+};
+
+class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest {
+ Result<AzureOptions> MakeOptions() override {
+ AzureOptions options;
+ if (char* account_name =
std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME")) {
+ char* account_key =
std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY");
+ EXPECT_THAT(account_key, NotNull());
+ ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name,
account_key));
+ return options;
+ }
+ return Status::Cancelled(
+ "Connection details not provided for a real hierachical namespace "
+ "account.");
+ }
+};
+
+TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) {
+ auto hierarchical_namespace = internal::HierarchicalNamespaceDetector();
+ ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_));
+ ASSERT_OK_AND_EQ(false,
hierarchical_namespace.Enabled(PreexistingContainerName()));
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) {
+ auto hierarchical_namespace = internal::HierarchicalNamespaceDetector();
+ ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_));
+ ASSERT_OK_AND_EQ(true,
hierarchical_namespace.Enabled(PreexistingContainerName()));
+}
+
+TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) {
+ auto hierarchical_namespace = internal::HierarchicalNamespaceDetector();
+ ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_));
+ ASSERT_OK_AND_EQ(false,
hierarchical_namespace.Enabled(PreexistingContainerName()));
+}
+
+TEST_F(AzuriteFileSystemTest,
DetectHierarchicalNamespaceFailsWithMissingContainer) {
+ auto hierarchical_namespace = internal::HierarchicalNamespaceDetector();
+ ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_));
+ ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container"));
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) {
+ arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory);
+
+ // URI
+ ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://"));
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) {
+ arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(),
FileType::Directory);
+
+ arrow::fs::AssertFileInfo(fs_.get(), "non-existent-container",
FileType::NotFound);
+
+ // URI
+ ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" +
PreexistingContainerName()));
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) {
+ // Adds detailed tests to handle cases of different edge cases
+ // with directory naming conventions (e.g. with and without slashes).
+ constexpr auto kObjectName =
"test-object-dir/some_other_dir/another_dir/foo";
+ // TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient(kObjectName)
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ // 0 is immediately after "/" lexicographically, ensure that this doesn't
+ // cause unexpected issues.
+ // TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient("test-object-dir/some_other_dir0")
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient(std::string(kObjectName) + "0")
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName,
FileType::File);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/",
+ FileType::NotFound);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() +
"test-object-dir/some_other_dir",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(),
+ PreexistingContainerPath() +
"test-object-dir/some_other_dir/",
+ FileType::Directory);
+
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di",
+ FileType::NotFound);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() +
"test-object-dir/some_other_di",
+ FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObjectWithNestedStructure) {
+ // Adds detailed tests to handle cases of different edge cases
+ // with directory naming conventions (e.g. with and without slashes).
+ constexpr auto kObjectName =
"test-object-dir/some_other_dir/another_dir/foo";
+ // TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient(kObjectName)
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ // 0 is immediately after "/" lexicographically, ensure that this doesn't
+ // cause unexpected issues.
+ // TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient("test-object-dir/some_other_dir0")
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ blob_service_client_->GetBlobContainerClient(PreexistingContainerName())
+ .GetBlockBlobClient(std::string(kObjectName) + "0")
+ .UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
strlen(kLoremIpsum));
+
+ datalake_service_client_->GetFileSystemClient(PreexistingContainerName())
+ .GetDirectoryClient("test-empty-object-dir")
+ .Create();
+
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName,
FileType::File);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/",
+ FileType::NotFound);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() +
"test-object-dir/some_other_dir",
+ FileType::Directory);
+ AssertFileInfo(fs_.get(),
+ PreexistingContainerPath() +
"test-object-dir/some_other_dir/",
+ FileType::Directory);
+
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di",
+ FileType::NotFound);
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() +
"test-object-dir/some_other_di",
+ FileType::NotFound);
+
+ AssertFileInfo(fs_.get(), PreexistingContainerPath() +
"test-empty-object-dir",
+ FileType::Directory);
Review Comment:
Ideally I would have liked to add an assertion here which confirms that with
the hierarchical namespace there are no calls to `ListBlobs`. That would
require patching an Azure container client, which I didn't know how to do. If
anyone has any suggestions, that would be appreciated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]