bkietz commented on a change in pull request #7692: URL: https://github.com/apache/arrow/pull/7692#discussion_r452486734
########## File path: cpp/src/arrow/dataset/file_parquet.cc ########## @@ -335,91 +335,39 @@ static inline bool RowGroupInfosAreComplete(const std::vector<RowGroupInfo>& inf [](const RowGroupInfo& i) { return i.HasStatistics(); }); } -static inline std::vector<RowGroupInfo> FilterRowGroups( - std::vector<RowGroupInfo> row_groups, const Expression& predicate) { - auto filter = [&predicate](const RowGroupInfo& info) { - return !info.Satisfy(predicate); - }; - auto end = std::remove_if(row_groups.begin(), row_groups.end(), filter); - row_groups.erase(end, row_groups.end()); - return row_groups; -} - -static inline Result<std::vector<RowGroupInfo>> AugmentRowGroups( - std::vector<RowGroupInfo> row_groups, parquet::arrow::FileReader* reader) { - auto metadata = reader->parquet_reader()->metadata(); - auto manifest = reader->manifest(); - auto num_row_groups = metadata->num_row_groups(); - - if (row_groups.empty()) { - row_groups = RowGroupInfo::FromCount(num_row_groups); - } - - // Augment a RowGroup with statistics if missing. - auto augment = [&](RowGroupInfo& info) { - if (!info.HasStatistics() && info.id() < num_row_groups) { - auto row_group = metadata->RowGroup(info.id()); - info.set_num_rows(row_group->num_rows()); - info.set_total_byte_size(row_group->total_byte_size()); - info.set_statistics(RowGroupStatisticsAsStructScalar(*row_group, manifest)); - } - }; - std::for_each(row_groups.begin(), row_groups.end(), augment); - - return row_groups; -} - Result<ScanTaskIterator> ParquetFileFormat::ScanFile(std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context, FileFragment* fragment) const { - const auto& source = fragment->source(); - auto row_groups = checked_cast<const ParquetFileFragment*>(fragment)->row_groups(); - - bool row_groups_are_complete = RowGroupInfosAreComplete(row_groups); - // The following block is required to avoid any IO if all RowGroups are - // excluded due to prior statistics knowledge. - if (row_groups_are_complete) { - // physical_schema should be cached at this point - ARROW_ASSIGN_OR_RAISE(auto physical_schema, fragment->ReadPhysicalSchema()); - RETURN_NOT_OK(options->filter->Validate(*physical_schema)); - - // Apply a pre-filtering if the user requested an explicit sub-set of - // row-groups. In the case where a RowGroup doesn't have statistics - // metdata, it will not be excluded. - row_groups = FilterRowGroups(std::move(row_groups), *options->filter); + auto& parquet_fragment = checked_cast<ParquetFileFragment&>(*fragment); Review comment: okay ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org