kosiew commented on code in PR #1222: URL: https://github.com/apache/datafusion-python/pull/1222#discussion_r2346183373
##########
python/tests/test_io.py:
##########
@@ -92,3 +96,38 @@ def test_read_avro():
     path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
     avro_df = read_avro(path=path)
     assert avro_df is not None
+
+
+def test_arrow_c_stream_large_dataset(ctx):
+    """DataFrame.__arrow_c_stream__ yields batches incrementally.
+
+    This test constructs a DataFrame that would be far larger than available
+    memory if materialized. The ``__arrow_c_stream__`` method should expose a
+    stream of record batches without collecting the full dataset, so reading a
+    handful of batches should not exhaust process memory.
+    """
+    # Create a very large DataFrame using range; this would be terabytes if collected
+    df = range_table(ctx, 0, 1 << 40)
+
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
+
+    # Track RSS before consuming batches
+    psutil = pytest.importorskip("psutil")
+    process = psutil.Process()
+    start_rss = process.memory_info().rss
+
+    for _ in range(5):
+        batch = reader.read_next_batch()
+        assert batch is not None
+        assert len(batch) > 0
+        current_rss = process.memory_info().rss
+        # Ensure memory usage hasn't grown substantially (>50MB)
+        assert current_rss - start_rss < 50 * 1024 * 1024
+
+
+def test_table_from_batches_stream(ctx, fail_collect):
+    df = range_table(ctx, 0, 10)
+
+    table = pa.Table.from_batches(batch.to_pyarrow() for batch in df)

Review Comment:
   Good point. I also renamed the test to test_table_from_arrow_c_stream

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org