westonpace commented on code in PR #12625:
URL: https://github.com/apache/arrow/pull/12625#discussion_r850030115
##########
cpp/src/arrow/dataset/discovery.h:
##########
@@ -248,6 +248,18 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public
DatasetFactory {
std::shared_ptr<fs::FileSystem> filesystem, const
std::vector<fs::FileInfo>& files,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+ /// \brief Build a FileSystemDatasetFactory from an explicit list of
+ /// file selectors.
+ ///
+ /// \param[in] filesystem passed to FileSystemDataset
+ /// \param[in] selectors used to crawl and search files
+ /// \param[in] format passed to FileSystemDataset
+ /// \param[in] options see FileSystemFactoryOptions for more information.
Review Comment:
```suggestion
/// \param[in] options controlling factory operation
```
##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -122,18 +122,30 @@ ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const FileInfo&);
/// \brief File selector for filesystem APIs
struct ARROW_EXPORT FileSelector {
- /// The directory in which to select files.
- /// If the path exists but doesn't point to a directory, this should be an
error.
+ /// The path or glob in which to select files.
Review Comment:
```suggestion
/// \brief The path or glob in which to select files.
```
##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -122,18 +122,30 @@ ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const FileInfo&);
/// \brief File selector for filesystem APIs
struct ARROW_EXPORT FileSelector {
- /// The directory in which to select files.
- /// If the path exists but doesn't point to a directory, this should be an
error.
+ /// The path or glob in which to select files.
std::string base_dir;
+ // Selector type of file selector
+ SelectorType type;
/// The behavior if `base_dir` isn't found in the filesystem. If false,
/// an error is returned. If true, an empty selection is returned.
bool allow_not_found;
- /// Whether to recurse into subdirectories.
- bool recursive;
/// The maximum number of subdirectories to recurse into.
int32_t max_recursion;
- FileSelector() : allow_not_found(false), recursive(false),
max_recursion(INT32_MAX) {}
+ FileSelector()
+ : type(SelectorType::Directory), allow_not_found(false),
max_recursion(0) {}
+
+ FileSelector(std::string base_dir, SelectorType type)
+ : base_dir(base_dir), type(type), allow_not_found(false),
max_recursion(0) {}
+
+ bool recursive() const { return (type == SelectorType::Directory &&
max_recursion); }
Review Comment:
Isn't a glob selector recursive if it has `**`?
##########
cpp/src/arrow/dataset/discovery.h:
##########
@@ -248,6 +248,18 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public
DatasetFactory {
std::shared_ptr<fs::FileSystem> filesystem, const
std::vector<fs::FileInfo>& files,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+ /// \brief Build a FileSystemDatasetFactory from an explicit list of
+ /// file selectors.
+ ///
+ /// \param[in] filesystem passed to FileSystemDataset
+ /// \param[in] selectors used to crawl and search files
+ /// \param[in] format passed to FileSystemDataset
Review Comment:
```suggestion
/// \param[in] format of the files, describes how to read and write files
##########
cpp/src/arrow/python/filesystem.cc:
##########
@@ -98,6 +98,10 @@ Result<std::vector<FileInfo>>
PyFileSystem::GetFileInfo(const FileSelector& sele
return infos;
}
+Result<std::vector<FileInfo>> PyFileSystem::GetFileInfoGlob(const
FileSelector& select) {
+ return Status::NotImplemented("Glob file with PyFileSystem");
Review Comment:
```suggestion
return Status::NotImplemented("Glob selector with PyFileSystem");
```
##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -122,18 +122,30 @@ ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const FileInfo&);
/// \brief File selector for filesystem APIs
struct ARROW_EXPORT FileSelector {
- /// The directory in which to select files.
- /// If the path exists but doesn't point to a directory, this should be an
error.
+ /// The path or glob in which to select files.
std::string base_dir;
+ // Selector type of file selector
Review Comment:
```suggestion
/// brief type of file selector
```
##########
cpp/src/arrow/filesystem/mockfs.cc:
##########
@@ -587,6 +588,10 @@ Result<FileInfoVector> MockFileSystem::GetFileInfo(const
FileSelector& selector)
return results;
}
+Result<FileInfoVector> MockFileSystem::GetFileInfoGlob(const FileSelector&
selector) {
+ return Status::NotImplemented("Glob file with MockFileSystem");
Review Comment:
```suggestion
return Status::NotImplemented("Glob selector with MockFileSystem");
```
##########
cpp/src/arrow/dataset/discovery.h:
##########
@@ -248,6 +248,18 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public
DatasetFactory {
std::shared_ptr<fs::FileSystem> filesystem, const
std::vector<fs::FileInfo>& files,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+ /// \brief Build a FileSystemDatasetFactory from an explicit list of
+ /// file selectors.
+ ///
+ /// \param[in] filesystem passed to FileSystemDataset
Review Comment:
```suggestion
/// \param[in] filesystem to use for all operations
```
##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1939,7 +1940,8 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
return false;
}
RETURN_NOT_OK(self->CheckNestingDepth(nesting_depth));
- return select.recursive && nesting_depth <= select.max_recursion;
+ return select.type == SelectorType::Directory &&
Review Comment:
Same here, I'm not sure how we reach this point unless we are already
scanning a directory so this extra check seems redundant.
##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -186,12 +198,16 @@ class ARROW_EXPORT FileSystem : public
std::enable_shared_from_this<FileSystem>
virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;
/// Same, for many targets at once.
virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>&
paths);
+ /// Get file for file selector
+ virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select);
/// Same, according to a selector.
///
/// The selector's base directory will not be part of the results, even if
/// it exists.
/// If it doesn't exist, see `FileSelector::allow_not_found`.
- virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;
+ virtual Result<FileInfoVector> GetFileInfoDir(const FileSelector& select) =
0;
Review Comment:
I think `GetFileInfo(const FileSelector& select)` is sufficient. If we want
`GetFileInfoDir` and `GetFileInfoGlob` behind the scenes then that is fine but
they should be private methods (or they could be protected methods if you're
trying to provide a default implementation). Otherwise I could call
`GetFileInfoGlob` with a file selector with the directory type and it's not
clear what that should do.
##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1897,7 +1897,8 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
auto handle_recursion = [&](int32_t nesting_depth) -> Result<bool> {
RETURN_NOT_OK(CheckNestingDepth(nesting_depth));
- return select.recursive && nesting_depth <= select.max_recursion;
+ return select.type == SelectorType::Directory &&
Review Comment:
Is it possible to reach this point and `select.type !=
SelectorType::Directory`?
##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -2276,6 +2278,10 @@ Result<FileInfoVector> S3FileSystem::GetFileInfo(const
FileSelector& select) {
return results;
}
+Result<FileInfoVector> S3FileSystem::GetFileInfoGlob(const FileSelector&
select) {
+ return Status::NotImplemented("Glob file with S3FileSystem");
Review Comment:
```suggestion
return Status::NotImplemented("Glob selector with S3FileSystem");
```
##########
cpp/src/arrow/filesystem/gcsfs.cc:
##########
@@ -735,8 +735,12 @@ Result<FileInfo> GcsFileSystem::GetFileInfo(const
std::string& path) {
return impl_->GetFileInfo(p);
}
-Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) {
- return impl_->GetFileInfo(select);
+Result<FileInfoVector> GcsFileSystem::GetFileInfoDir(const FileSelector&
select) {
+ return impl_->GetFileInfoDir(select);
+}
+
+Result<FileInfoVector> GcsFileSystem::GetFileInfoGlob(const FileSelector&
select) {
+ return Status::NotImplemented("Glob file with GcsFileSystem");
Review Comment:
```suggestion
return Status::NotImplemented("Glob selector with GcsFileSystem");
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]