[GitHub] [arrow] fsaintjacques commented on a change in pull request #7156: ARROW-8074: [C++][Dataset][Python] FileFragments from buffers and NativeFiles

2020-06-17 Thread GitBox


fsaintjacques commented on a change in pull request #7156:
URL: https://github.com/apache/arrow/pull/7156#discussion_r441740202



##
File path: python/pyarrow/_dataset.pyx
##
@@ -43,6 +44,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
 raise TypeError(msg)
 
 
+cdef class FileSource:
+
+cdef:
+CFileSource wrapped
+
+def __cinit__(self, file, FileSystem filesystem=None):
+cdef:
+shared_ptr[CFileSystem] c_filesystem
+c_string c_path
+shared_ptr[CRandomAccessFile] c_file
+shared_ptr[CBuffer] c_buffer
+
+if isinstance(file, FileSource):

Review comment:
   This is odd. Should you have `_ensure_file_source` like other methods? 





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on a change in pull request #7156: ARROW-8074: [C++][Dataset][Python] FileFragments from buffers and NativeFiles

2020-06-05 Thread GitBox


fsaintjacques commented on a change in pull request #7156:
URL: https://github.com/apache/arrow/pull/7156#discussion_r426085245



##
File path: python/pyarrow/_dataset.pyx
##
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
 raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+cdef:
+# XXX why is shared_ptr necessary here? CFileSource shouldn't need it

Review comment:
   This comment is dead.

##
File path: cpp/src/arrow/dataset/test_util.h
##
@@ -251,6 +251,15 @@ class JSONRecordBatchFileFormat : public FileFormat {
   SchemaResolver resolver_;
 };
 
+inline static std::vector SourcesFromPaths(

Review comment:
   Seems like a useful static method `FileSource::FromPaths`.

##
File path: cpp/src/arrow/result.h
##
@@ -298,7 +298,7 @@ class ARROW_MUST_USE_TYPE Result : public 
util::EqualityComparable> {
   ///
   /// \return The stored non-OK status object, or an OK status if this object
   /// has a value.
-  Status status() const { return status_; }
+  const Status& status() const { return status_; }

Review comment:
   Good catch.

##
File path: cpp/src/arrow/flight/client.cc
##
@@ -108,7 +108,7 @@ class FinishableStream {
   std::shared_ptr stream() const { return stream_; }
 
   /// \brief Finish the call, adding server context to the given status.
-  virtual Status Finish(Status&& st) {
+  virtual Status Finish(Status st) {

Review comment:
   This PR shouldn't touch this code unless it's failing at building?

##
File path: cpp/src/arrow/util/checked_cast.h
##
@@ -39,11 +40,11 @@ inline OutputType checked_cast(InputType&& value) {
 }
 
 template 
-std::shared_ptr checked_pointer_cast(const std::shared_ptr& r) noexcept {
+std::shared_ptr checked_pointer_cast(std::shared_ptr r) noexcept {

Review comment:
   Shouldn't we stick with the signature from the standard?
   
   https://en.cppreference.com/w/cpp/memory/shared_ptr/pointer_cast

##
File path: cpp/src/arrow/status.h
##
@@ -306,8 +306,12 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public 
util::EqualityComparable<
   std::string message() const { return ok() ? "" : state_->msg; }
 
   /// \brief Return the status detail attached to this message.
-  std::shared_ptr detail() const {
-return state_ == NULLPTR ? NULLPTR : state_->detail;
+  const std::shared_ptr& detail() const {
+if (state_ == NULLPTR) {

Review comment:
   ```suggestion
   static std::shared_ptr no_detail = NULLPTR;
   return state_ ? state_->detail : no_detail;
   ```

##
File path: cpp/src/arrow/dataset/file_base.h
##
@@ -45,82 +46,150 @@ namespace dataset {
 /// be read like a file
 class ARROW_DS_EXPORT FileSource {
  public:
-  // NOTE(kszucs): it'd be better to separate the BufferSource from FileSource
-  enum SourceKind { PATH, BUFFER };
-
   FileSource(std::string path, std::shared_ptr filesystem,
- Compression::type compression = Compression::UNCOMPRESSED,
- bool writable = true)
+ Compression::type compression = Compression::UNCOMPRESSED)
   : impl_(PathAndFileSystem{std::move(path), std::move(filesystem)}),
-compression_(compression),
-writable_(writable) {}
+compression_(compression) {}
 
   explicit FileSource(std::shared_ptr buffer,
   Compression::type compression = 
Compression::UNCOMPRESSED)
   : impl_(std::move(buffer)), compression_(compression) {}
 
-  explicit FileSource(std::shared_ptr buffer,
-  Compression::type compression = 
Compression::UNCOMPRESSED)
-  : impl_(std::move(buffer)), compression_(compression), writable_(true) {}
-
-  bool operator==(const FileSource& other) const {
-if (id() != other.id()) {
-  return false;
-}
+  using CustomOpen = 
std::function>()>;
+  explicit FileSource(CustomOpen open) : impl_(std::move(open)) {}
 
-if (id() == PATH) {
-  return path() == other.path() && filesystem() == other.filesystem();
-}
+  using CustomOpenWithCompression =
+  
std::function>(Compression::type)>;
+  explicit FileSource(CustomOpenWithCompression open_with_compression,
+  Compression::type compression = 
Compression::UNCOMPRESSED)
+  : impl_(std::bind(std::move(open_with_compression), compression)),
+compression_(compression) {}
 
-return buffer()->Equals(*other.buffer());
-  }
+  explicit FileSource(std::shared_ptr file,
+  Compression::type compression = 
Compression::UNCOMPRESSED)
+  : impl_([=] { return ToResult(file); }), compression_(compression) {}
 
-  /// \brief The kind of file, whether stored in a filesystem, memory
-  /// resident, or other
-  SourceKind id() const { return static_cast(impl_.index()); }
+  FileSource() : impl_(CustomOpen{}) {}
 
   /// \brief Return