westonpace commented on issue #10753:
URL: https://github.com/apache/arrow/issues/10753#issuecomment-883308382


   Things get a little tricky in C++ with all the typing, and the classes 
there aren't as well documented.  For a robust solution you'd probably want 
to do something with templates, but this example should get you started:
   
   ```
     // Prints the min/max statistics of every column chunk in every row group
     // of "data.parquet".  This snippet must live inside a function returning
     // arrow::Status (or arrow::Result), because the ARROW_* macros return
     // early from the enclosing function on failure.
     arrow::fs::LocalFileSystem file_system;
     ARROW_ASSIGN_OR_RAISE(auto input,
                           file_system.OpenInputFile("data.parquet"));
   
     parquet::ArrowReaderProperties arrow_reader_properties =
         parquet::default_arrow_reader_properties();
   
     arrow_reader_properties.set_pre_buffer(true);
     arrow_reader_properties.set_use_threads(true);
   
     parquet::ReaderProperties reader_properties =
         parquet::default_reader_properties();
   
     // Open Parquet file reader
     std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
     auto reader_builder = parquet::arrow::FileReaderBuilder();
     reader_builder.properties(arrow_reader_properties);
     ARROW_RETURN_NOT_OK(
         reader_builder.Open(std::move(input), reader_properties));
     ARROW_RETURN_NOT_OK(reader_builder.Build(&arrow_reader));
   
     std::shared_ptr<arrow::Schema> schema;
     ARROW_RETURN_NOT_OK(arrow_reader->GetSchema(&schema));
     auto metadata = arrow_reader->parquet_reader()->metadata();
     for (int i = 0; i < metadata->num_row_groups(); i++) {
       auto row_group = metadata->RowGroup(i);
       std::cout << "Row group: " << i << " (" << row_group->num_rows()
                 << " rows)" << std::endl;
       for (int j = 0; j < row_group->num_columns(); j++) {
         auto column = row_group->ColumnChunk(j);
         // NOTE: indexing the Arrow schema with the Parquet column index
         // assumes a flat schema (one leaf column per top-level field).
         auto field = schema->fields()[j];
         std::cout << "  Column: " << field->name() << " ("
                   << field->type()->ToString() << ")" << std::endl;
   
         // statistics() returns null when the writer stored no statistics
         // for this chunk, so it must be checked before HasMinMax().
         auto stats = column->statistics();
         if (stats == nullptr || !stats->HasMinMax()) {
           std::cout << "    Minimum: unknown" << std::endl;
           std::cout << "    Maximum: unknown" << std::endl;
           continue;
         }
   
         // Try to downcast to the typed statistics matching the Arrow type.
         // The type id does not carry the timestamp unit, so every timestamp
         // column takes the Int64 path.  The cast yields null if the Parquet
         // physical type differs (e.g. a timestamp not stored as INT64).
         std::shared_ptr<parquet::TypedStatistics<parquet::DoubleType>>
             double_stats;
         std::shared_ptr<parquet::TypedStatistics<parquet::Int64Type>>
             int_stats;
         const auto type_id = field->type()->id();
         if (type_id == arrow::Type::DOUBLE) {
           double_stats = std::dynamic_pointer_cast<
               parquet::TypedStatistics<parquet::DoubleType>>(stats);
         } else if (type_id == arrow::Type::TIMESTAMP) {
           int_stats = std::dynamic_pointer_cast<
               parquet::TypedStatistics<parquet::Int64Type>>(stats);
         }
   
         if (double_stats != nullptr) {
           std::cout << "    Minimum: " << double_stats->min() << std::endl;
           std::cout << "    Maximum: " << double_stats->max() << std::endl;
         } else if (int_stats != nullptr) {
           std::cout << "    Minimum: " << int_stats->min() << std::endl;
           std::cout << "    Maximum: " << int_stats->max() << std::endl;
         } else {
           // Fallback: raw encoded bytes (not necessarily printable).
           std::cout << "    Minimum: " << stats->EncodeMin() << std::endl;
           std::cout << "    Maximum: " << stats->EncodeMax() << std::endl;
         }
       }
     }
   
   ```
   
   The base `parquet::Statistics` class only has `EncodeMin` and `EncodeMax`, 
which encode the min/max into a byte array (not necessarily a printable 
string).  If you want the typed value you need to cast the statistics object 
to one of the `parquet::Statistics` subclasses (for example 
`parquet::TypedStatistics<parquet::DoubleType>`).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to