rok commented on code in PR #49372:
URL: https://github.com/apache/arrow/pull/49372#discussion_r2880038402


##########
python/pyarrow/fs.py:
##########
@@ -82,6 +82,32 @@ def __getattr__(name):
     )
 
 
+def _is_likely_uri(path):

Review Comment:
   So I think calling the C++ function would be preferable. The change to 
expose the functionallity could look like:
   ```diff
   diff --git i/cpp/src/arrow/filesystem/filesystem.h 
w/cpp/src/arrow/filesystem/filesystem.h
   index 3a47eb62f5..85ee3ff114 100644
   --- i/cpp/src/arrow/filesystem/filesystem.h
   +++ w/cpp/src/arrow/filesystem/filesystem.h
   @@ -23,6 +23,7 @@
    #include <iosfwd>
    #include <memory>
    #include <string>
   +#include <string_view>
    #include <utility>
    #include <vector>
   
   @@ -529,6 +530,12 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem {
    /// The user is responsible for synchronization of calls to this function.
    void EnsureFinalized();
   
   +/// \brief Return whether a path string is likely a URI.
   +///
   +/// This heuristic is conservative and may return false for malformed URIs.
   +ARROW_EXPORT
   +bool IsLikelyUri(std::string_view path);
   +
    /// \defgroup filesystem-factories Functions for creating FileSystem 
instances
    ///
    /// @{
   diff --git i/cpp/src/arrow/filesystem/path_util.cc 
w/cpp/src/arrow/filesystem/path_util.cc
   index dc82afd07e..cdc1807781 100644
   --- i/cpp/src/arrow/filesystem/path_util.cc
   +++ w/cpp/src/arrow/filesystem/path_util.cc
   @@ -28,8 +28,28 @@
    #include "arrow/util/uri.h"
   
    namespace arrow {
   -
    namespace fs {
   +
   +bool IsLikelyUri(std::string_view v) {
   +  if (v.empty() || v[0] == '/') {
   +    return false;
   +  }
   +  const auto pos = v.find_first_of(':');
   +  if (pos == v.npos) {
   +    return false;
   +  }
   +  if (pos < 2) {
   +    // One-letter URI schemes don't officially exist, perhaps a Windows 
drive letter?
   +    return false;
   +  }
   +  if (pos > 36) {
   +    // The largest IANA-registered URI scheme is 
"microsoft.windows.camera.multipicker"
   +    // with 36 characters.
   +    return false;
   +  }
   +  return ::arrow::util::IsValidUriScheme(v.substr(0, pos));
   +}
   +
    namespace internal {
   
    // XXX How does this encode Windows UNC paths?
   @@ -337,26 +357,6 @@ bool IsEmptyPath(std::string_view v) {
      return true;
    }
   
   diff --git i/cpp/src/arrow/filesystem/path_util.h 
w/cpp/src/arrow/filesystem/path_util.h
   index d49d9d2efa..2f767fc7dc 100644
   --- i/cpp/src/arrow/filesystem/path_util.h
   +++ w/cpp/src/arrow/filesystem/path_util.h
   @@ -27,6 +27,10 @@
   
    namespace arrow {
    namespace fs {
   +
   +ARROW_EXPORT
   +bool IsLikelyUri(std::string_view s);
   +
    namespace internal {
   
    constexpr char kSep = '/';
   @@ -159,8 +163,7 @@ std::string ToSlashes(std::string_view s);
    ARROW_EXPORT
    bool IsEmptyPath(std::string_view s);
   
   -ARROW_EXPORT
   -bool IsLikelyUri(std::string_view s);
   +inline bool IsLikelyUri(std::string_view s) { return 
::arrow::fs::IsLikelyUri(s); }
   
    class ARROW_EXPORT Globber {
     public:
   diff --git i/python/pyarrow/_fs.pyx w/python/pyarrow/_fs.pyx
   index 0739b6acba..0b8ab7480f 100644
   --- i/python/pyarrow/_fs.pyx
   +++ w/python/pyarrow/_fs.pyx
   @@ -84,6 +84,14 @@ def _file_type_to_string(ty):
        return f"{ty.__class__.__name__}.{ty._name_}"
   
   
   +def _is_likely_uri(path):
   +    cdef c_string c_path
   +    if not isinstance(path, str):
   +        raise TypeError("Path must be a string")
   +    c_path = tobytes(path)
   +    return CIsLikelyUri(cpp_string_view(c_path))
   +
   +
    cdef class FileInfo(_Weakrefable):
        """
        FileSystem entry info.
   diff --git i/python/pyarrow/fs.py w/python/pyarrow/fs.py
   index 41308cd60c..b0acb97d94 100644
   --- i/python/pyarrow/fs.py
   +++ w/python/pyarrow/fs.py
   @@ -31,6 +31,7 @@ from pyarrow._fs import (  # noqa
        _MockFileSystem,
        FileSystemHandler,
        PyFileSystem,
   +    _is_likely_uri,
        _copy_files,
        _copy_files_selector,
    )
   @@ -82,32 +83,6 @@ def __getattr__(name):
        )
   
   diff --git i/python/pyarrow/includes/libarrow_fs.pxd 
w/python/pyarrow/includes/libarrow_fs.pxd
   index af01c47c8c..48604972d2 100644
   --- i/python/pyarrow/includes/libarrow_fs.pxd
   +++ w/python/pyarrow/includes/libarrow_fs.pxd
   @@ -92,6 +92,7 @@ cdef extern from "arrow/filesystem/api.h" namespace 
"arrow::fs" nogil:
        CResult[shared_ptr[CFileSystem]] CFileSystemFromUriOrPath \
            "arrow::fs::FileSystemFromUriOrPath"(const c_string& uri,
                                                 c_string* out_path)
   +    c_bool CIsLikelyUri "arrow::fs::IsLikelyUri"(cpp_string_view path)
   
        cdef cppclass CFileSystemGlobalOptions \
                "arrow::fs::FileSystemGlobalOptions":
   diff --git i/python/pyarrow/tests/test_fs.py 
w/python/pyarrow/tests/test_fs.py
   index 537ceb9999..31f3f8a8ca 100644
   --- i/python/pyarrow/tests/test_fs.py
   +++ w/python/pyarrow/tests/test_fs.py
   @@ -1715,6 +1715,8 @@ def test_is_likely_uri():
        assert not _is_likely_uri("C:/Users/foo")
        assert not _is_likely_uri("3bucket://key")       # scheme starts with 
digit
        assert not _is_likely_uri("-scheme://key")        # scheme starts with 
dash
   +    assert not _is_likely_uri("schéme://bucket/key")  # non-ASCII in scheme
   +    assert not _is_likely_uri("漢字://bucket/key")     # non-ASCII in scheme
   ```
   
   Does that make sense?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to