mapleFU commented on issue #41339:
URL: https://github.com/apache/arrow/issues/41339#issuecomment-2074922591
Aha, I used the master-branch code and ran the `ReadInBatches` function in
`cpp/examples/arrow/parquet_read_write`:
```c++
arrow::Status ReadInBatches(std::string path_to_file) {
// #include "arrow/io/api.h"
// #include "arrow/parquet/arrow/reader.h"
arrow::MemoryPool* pool = arrow::default_memory_pool();
// Configure general Parquet reader settings
auto reader_properties = parquet::ReaderProperties(pool);
reader_properties.set_buffer_size(4096 * 4);
reader_properties.enable_buffered_stream();
// Configure Arrow-specific Parquet reader settings
auto arrow_reader_props = parquet::ArrowReaderProperties();
arrow_reader_props.set_batch_size(3); // default 64 * 1024
arrow_reader_props.set_use_threads(true);
parquet::arrow::FileReaderBuilder reader_builder;
ARROW_RETURN_NOT_OK(
reader_builder.OpenFile(path_to_file, /*memory_map=*/true,
reader_properties));
reader_builder.memory_pool(pool);
reader_builder.properties(arrow_reader_props);
std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build());
std::shared_ptr<::arrow::RecordBatchReader> rb_reader;
ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader));
std::shared_ptr<::arrow::RecordBatch> batch;
while (rb_reader->ReadNext(&batch).ok() && batch != nullptr) {
std::cout << "Read:" << batch->ToString() << '\n';
}
// for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch :
*rb_reader) {
// if (!maybe_batch.ok()) {
// std::cout << "Error reading batch: " <<
maybe_batch.status().message() << std::endl;
// } else {
// std::shared_ptr<arrow::RecordBatch> batch =
maybe_batch.ValueOrDie();
// std::cout << "Read batch with " << batch->num_rows() << " rows"
<< std::endl;
// }
// }
return arrow::Status::OK();
}
arrow::Status RunExamples(std::string path_to_file) {
// ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file));
// ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file));
// ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file));
ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file));
return arrow::Status::OK();
}
```
This doesn't crash. I'm running on my M1 macOS machine with the master branch. Would you
mind providing your configuration?
By the way, the stack trace below is a little confusing 🤔 — why does
`TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)5>>` call into
`TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)7>>` ...
```
#8 0x00007fb7a284f5cd in parquet::internal::(anonymous
namespace)::TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)7>
>::bytes_for_values(long) const [clone .isra.1197] () from
/lib64/libparquet.so.1500
#9 0x00007fb7a28512bd in parquet::internal::(anonymous
namespace)::TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)5>
>::ReserveValues(long) () from /lib64/libparquet.so.1500
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]