This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new cf80bd1135 GH-38618: [C++] S3FileSystem: fix regression in deleting 
explicitly created sub-directories (#38845)
cf80bd1135 is described below

commit cf80bd1135bbd9cee7c0ae3e6370f93270cba250
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Dec 5 18:23:15 2023 +0100

    GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created 
sub-directories (#38845)
    
    ### Rationale for this change
    
    See https://github.com/apache/arrow/issues/38618#issuecomment-1821252024 
and below for the analysis. When deleting the dir contents, we use a 
GetFileInfo with recursive FileSelector to list all objects to delete, but when 
doing that the file paths for directories don't end in a trailing `/`, so for 
deleting explicitly created directories we need to add the `kSep` here as well 
to properly delete the object.
    
    ### Are these changes tested?
    
    I tested them manually with an actual S3 bucket. The problem is that MinIO 
doesn't have the same problem, and so it's not actually tested with the test I 
added using our MinIO testing setup.
    
    ### Are there any user-facing changes?
    
    Fixes the regression
    * Closes: #38618
    
    Lead-authored-by: Joris Van den Bossche <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 cpp/src/arrow/filesystem/s3fs.cc | 11 ++++++++++-
 python/pyarrow/tests/test_fs.py  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 511448cb2f..62bec9b23b 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -2409,7 +2409,16 @@ class S3FileSystem::Impl : public 
std::enable_shared_from_this<S3FileSystem::Imp
                   std::vector<std::string> file_paths;
                   for (const auto& file_info : file_infos) {
                     DCHECK_GT(file_info.path().size(), bucket.size());
-                    file_paths.push_back(file_info.path().substr(bucket.size() 
+ 1));
+                    auto file_path = file_info.path().substr(bucket.size() + 
1);
+                    if (file_info.IsDirectory()) {
+                      // The selector returns FileInfo objects for directories 
with a
+                      // a path that never ends in a trailing slash, but for 
AWS the file
+                      // needs to have a trailing slash to recognize it as 
directory
+                      // (https://github.com/apache/arrow/issues/38618)
+                      DCHECK_OK(internal::AssertNoTrailingSlash(file_path));
+                      file_path = file_path + kSep;
+                    }
+                    file_paths.push_back(std::move(file_path));
                   }
                   scheduler->AddSimpleTask(
                       [=, file_paths = std::move(file_paths)] {
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 1002e13471..59c9c44942 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -760,6 +760,38 @@ def test_delete_dir(fs, pathfn):
         fs.delete_dir(d)
 
 
+def test_delete_dir_with_explicit_subdir(fs, pathfn):
+    # GH-38618: regression with AWS failing to delete directories,
+    # depending on whether they were created explicitly. Note that
+    # Minio doesn't reproduce the issue, so this test is not a regression
+    # test in itself.
+    skip_fsspec_s3fs(fs)
+
+    d = pathfn('directory/')
+    nd = pathfn('directory/nested/')
+
+    # deleting dir with explicit subdir
+    fs.create_dir(d)
+    fs.create_dir(nd)
+    fs.delete_dir(d)
+    dir_info = fs.get_file_info(d)
+    assert dir_info.type == FileType.NotFound
+
+    # deleting dir with blob in explicit subdir
+    d = pathfn('directory2')
+    nd = pathfn('directory2/nested')
+    f = pathfn('directory2/nested/target-file')
+
+    fs.create_dir(d)
+    fs.create_dir(nd)
+    with fs.open_output_stream(f) as s:
+        s.write(b'data')
+
+    fs.delete_dir(d)
+    dir_info = fs.get_file_info(d)
+    assert dir_info.type == FileType.NotFound
+
+
 def test_delete_dir_contents(fs, pathfn):
     skip_fsspec_s3fs(fs)
 

Reply via email to