westonpace commented on code in PR #12625:
URL: https://github.com/apache/arrow/pull/12625#discussion_r860259253
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>&
filesystem,
+ std::string& path) {
+ fs::FileInfoVector results, temp;
+ fs::FileSelector selector;
+ std::string cur;
+ size_t i = 0;
+
+#if _WIN32
+ auto split_path = fs::internal::SplitAbstractPath(path, '\\');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(split_path[i++] +
"\\"));
+#else
+ auto split_path = fs::internal::SplitAbstractPath(path, '/');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+#endif
Review Comment:
Filesystems always work on `/` separated paths (regardless of the separated
used behind the scenes). So this method should assume the path is always `/`
separated. Since you are working with a filesystem you shouldn't need `#if
_WIN32`.
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -88,35 +142,41 @@ Result<compute::Declaration> FromProto(const
substrait::Rel& rel,
std::shared_ptr<dataset::FileFormat> format;
auto filesystem = std::make_shared<fs::LocalFileSystem>();
- std::vector<std::shared_ptr<dataset::FileFragment>> fragments;
+ std::vector<fs::FileInfo> files;
for (const auto& item : read.local_files().items()) {
- if (item.path_type_case() !=
- substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
- return Status::NotImplemented(
- "substrait::ReadRel::LocalFiles::FileOrFiles with "
- "path_type other than uri_file");
+ std::string path;
+ if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriPath) {
+ path = item.uri_path();
+ } else if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
+ path = item.uri_file();
+ } else if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriFolder) {
+ path = item.uri_folder();
+ } else {
+ path = item.uri_path_glob();
}
if (item.format() ==
substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) {
format = std::make_shared<dataset::ParquetFileFormat>();
- } else if (util::string_view{item.uri_file()}.ends_with(".arrow")) {
+ } else if (util::string_view{path}.ends_with(".arrow")) {
format = std::make_shared<dataset::IpcFileFormat>();
- } else if (util::string_view{item.uri_file()}.ends_with(".feather")) {
+ } else if (util::string_view{path}.ends_with(".feather")) {
format = std::make_shared<dataset::IpcFileFormat>();
} else {
return Status::NotImplemented(
"substrait::ReadRel::LocalFiles::FileOrFiles::format "
"other than FILE_FORMAT_PARQUET");
}
- if (!util::string_view{item.uri_file()}.starts_with("file:///")) {
+ if (!util::string_view{path}.starts_with("file:///")) {
return Status::NotImplemented(
- "substrait::ReadRel::LocalFiles::FileOrFiles::uri_file "
+ "substrait::ReadRel::LocalFiles::FileOrFiles::uri_path "
Review Comment:
Couldn't we still get here with a `uri_path` (or a `uri_folder` or a
`uri_path_glob`)?
Maybe just:
```
return Status::NotImplemented("substrait::ReadRel::LocalFiles item with
other than local filesystem (file:///)");
```
Also, we could probably include `path` as part of the error message.
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -88,35 +142,41 @@ Result<compute::Declaration> FromProto(const
substrait::Rel& rel,
std::shared_ptr<dataset::FileFormat> format;
auto filesystem = std::make_shared<fs::LocalFileSystem>();
- std::vector<std::shared_ptr<dataset::FileFragment>> fragments;
+ std::vector<fs::FileInfo> files;
for (const auto& item : read.local_files().items()) {
- if (item.path_type_case() !=
- substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
- return Status::NotImplemented(
- "substrait::ReadRel::LocalFiles::FileOrFiles with "
- "path_type other than uri_file");
+ std::string path;
+ if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriPath) {
+ path = item.uri_path();
+ } else if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
+ path = item.uri_file();
+ } else if (item.path_type_case() ==
+ substrait::ReadRel_LocalFiles_FileOrFiles::kUriFolder) {
+ path = item.uri_folder();
+ } else {
+ path = item.uri_path_glob();
}
if (item.format() ==
substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) {
format = std::make_shared<dataset::ParquetFileFormat>();
- } else if (util::string_view{item.uri_file()}.ends_with(".arrow")) {
+ } else if (util::string_view{path}.ends_with(".arrow")) {
Review Comment:
This seems a bit incorrect (e.g. the format is parquet if my glob is
`dataset/files/*.arrow` but not `dataset/files/*`) but this particular branch
is a bit of a hack until https://github.com/substrait-io/substrait/pull/169
merges anyways.
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>&
filesystem,
+ std::string& path) {
Review Comment:
Minor nit: should this parameter be named `glob`?
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>&
filesystem,
+ std::string& path) {
+ fs::FileInfoVector results, temp;
+ fs::FileSelector selector;
+ std::string cur;
+ size_t i = 0;
+
+#if _WIN32
+ auto split_path = fs::internal::SplitAbstractPath(path, '\\');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(split_path[i++] +
"\\"));
+#else
+ auto split_path = fs::internal::SplitAbstractPath(path, '/');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
Review Comment:
Should this be `split_path[i++] + "/"`?
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>&
filesystem,
+ std::string& path) {
+ fs::FileInfoVector results, temp;
+ fs::FileSelector selector;
+ std::string cur;
+ size_t i = 0;
+
+#if _WIN32
+ auto split_path = fs::internal::SplitAbstractPath(path, '\\');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(split_path[i++] +
"\\"));
+#else
+ auto split_path = fs::internal::SplitAbstractPath(path, '/');
+ ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+#endif
+ results.push_back(std::move(file));
+
+ for (; i < split_path.size(); i++) {
+ if (split_path[i].find_first_of("*?") == std::string::npos) {
+ if (cur.empty())
+ cur = split_path[i];
+ else
+ cur = fs::internal::ConcatAbstractPath(cur, split_path[i]);
+ continue;
+ } else {
+ for (auto res : results) {
+ if (res.type() != fs::FileType::Directory) continue;
+ selector.base_dir = res.path() + cur;
+ ARROW_ASSIGN_OR_RAISE(auto entries, filesystem->GetFileInfo(selector));
+ fs::internal::Globber globber(
+ fs::internal::ConcatAbstractPath(selector.base_dir,
split_path[i]));
+ for (auto entry : entries) {
+ if (globber.Matches(entry.path())) {
+ temp.push_back(std::move(entry));
+ }
+ }
+ }
+ results = temp;
+ temp.clear();
Review Comment:
```suggestion
results = std::move(temp);
```
##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,38 @@ bool IsLikelyUri(util::string_view v) {
return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
}
+struct Globber::Impl {
+ std::regex pattern_;
+
+ explicit Impl(std::string p) : pattern_(std::regex(std::move(transform(p))))
{}
+
+ std::string transform(std::string p) {
Review Comment:
I suspect there are other regex characters to worry about. For example, in
a URI path you are allowed to use `(` and `)`.
If we are going to go this route we either should make it very clear
somewhere what characters we expect to see in a filename or we need to escape
all of `.^$*+?()[{\|`
Pity there is no equivalent to
https://docs.python.org/3/library/re.html#re.escape
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]