westonpace commented on issue #10753: URL: https://github.com/apache/arrow/issues/10753#issuecomment-883308382
Things get a little tricky in C++ with all the typing, and the classes there aren't as well documented. For a robust solution you'd probably want to do something with templates, but this example should get you started: ``` arrow::fs::LocalFileSystem file_system; ARROW_ASSIGN_OR_RAISE(auto input, file_system.OpenInputFile("data.parquet")); parquet::ArrowReaderProperties arrow_reader_properties = parquet::default_arrow_reader_properties(); arrow_reader_properties.set_pre_buffer(true); arrow_reader_properties.set_use_threads(true); parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); // Open Parquet file reader std::unique_ptr<parquet::arrow::FileReader> arrow_reader; auto reader_builder = parquet::arrow::FileReaderBuilder(); reader_builder.properties(arrow_reader_properties); ARROW_RETURN_NOT_OK(reader_builder.Open(std::move(input), reader_properties)); ARROW_RETURN_NOT_OK(reader_builder.Build(&arrow_reader)); std::shared_ptr<arrow::Schema> schema; ARROW_RETURN_NOT_OK(arrow_reader->GetSchema(&schema)); auto metadata = arrow_reader->parquet_reader()->metadata(); for (int i = 0; i < metadata->num_row_groups(); i++) { auto row_group = metadata->RowGroup(i); std::cout << "Row group: " << i << " (" << row_group->num_rows() << " rows)" << std::endl; for (int j = 0; j < row_group->num_columns(); j++) { auto column = row_group->ColumnChunk(j); auto field = schema->fields()[j]; std::cout << " Column: " << field->name() << " (" << field->type()->ToString() << ")" << std::endl; if (column->statistics()->HasMinMax()) { if (field->type()->id() == arrow::float64()->id()) { auto double_field = std::dynamic_pointer_cast< parquet::TypedStatistics<parquet::DoubleType>>( column->statistics()); std::cout << " Minimum: " << double_field->min() << std::endl; std::cout << " Maximum: " << double_field->max() << std::endl; } else if (field->type()->id() == arrow::timestamp(arrow::TimeUnit::MILLI)->id()) { auto int_field = std::dynamic_pointer_cast< 
parquet::TypedStatistics<parquet::Int64Type>>( column->statistics()); std::cout << " Minimum: " << int_field->min() << std::endl; std::cout << " Maximum: " << int_field->max() << std::endl; } else { std::cout << " Minimum: " << column->statistics()->EncodeMin() << std::endl; std::cout << " Maximum: " << column->statistics()->EncodeMax() << std::endl; } } else { std::cout << " Minimum: unknown" << std::endl; std::cout << " Maximum: unknown" << std::endl; } } } ``` The base `parquet::Statistics` class only has `EncodeMin` and `EncodeMax`, which encode the min/max into a byte array (not necessarily a printable string). If you want the actual value, you need to cast the statistics object to one of the `parquet::Statistics` subclasses (for example `parquet::TypedStatistics<parquet::DoubleType>`, as in the snippet above). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org