jorisvandenbossche commented on a change in pull request #9480:
URL: https://github.com/apache/arrow/pull/9480#discussion_r575250046



##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -2125,12 +2125,44 @@ cdef class ScanTask(_Weakrefable):
         -------
         record_batches : iterator of RecordBatch
         """
-        cdef shared_ptr[CRecordBatch] record_batch
+        # Return an explicit iterator object instead of using a
+        # generator so that this method is eagerly evaluated (a
+        # generator would mean no work gets done until the first
+        # iteration). This also works around a bug in Cython's
+        # generator.
+        cdef CRecordBatchIterator iterator
         with nogil:
-            for maybe_batch in GetResultValue(self.task.Execute()):
-                record_batch = GetResultValue(move(maybe_batch))
-                with gil:
-                    yield pyarrow_wrap_batch(record_batch)
+            iterator = move(GetResultValue(self.task.Execute()))
+            with gil:
+                return RecordBatchIterator.wrap(self, move(iterator))

Review comment:
       This can also be put after the nogil context (so you don't need a `with 
gil`)? (or does that something different in cython?)

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -2125,12 +2125,44 @@ cdef class ScanTask(_Weakrefable):
         -------
         record_batches : iterator of RecordBatch
         """
-        cdef shared_ptr[CRecordBatch] record_batch
+        # Return an explicit iterator object instead of using a
+        # generator so that this method is eagerly evaluated (a
+        # generator would mean no work gets done until the first
+        # iteration). This also works around a bug in Cython's
+        # generator.
+        cdef CRecordBatchIterator iterator
         with nogil:
-            for maybe_batch in GetResultValue(self.task.Execute()):
-                record_batch = GetResultValue(move(maybe_batch))
-                with gil:
-                    yield pyarrow_wrap_batch(record_batch)
+            iterator = move(GetResultValue(self.task.Execute()))
+            with gil:
+                return RecordBatchIterator.wrap(self, move(iterator))
+
+
+cdef class RecordBatchIterator(_Weakrefable):
+    """An iterator over a sequence of record batches."""
+    cdef:
+        ScanTask task
+        CRecordBatchIterator iterator
+
+    def __init__(self):
+        _forbid_instantiation(self.__class__, subclasses_instead=False)
+
+    @staticmethod
+    cdef wrap(ScanTask task, CRecordBatchIterator iterator):
+        cdef RecordBatchIterator self = \
+            RecordBatchIterator.__new__(RecordBatchIterator)
+        self.task = task
+        self.iterator = move(iterator)
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        cdef shared_ptr[CRecordBatch] record_batch
+        record_batch = GetResultValue(move(self.iterator.Next()))

Review comment:
       Does this need a nogil context?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to