ffacs commented on code in PR #2436:
URL: https://github.com/apache/orc/pull/2436#discussion_r2425128106
##########
c++/src/sargs/SargsApplier.cc:
##########
@@ -185,4 +202,90 @@ namespace orc {
}
return fileStatsEvalResult_;
}
+
+ TruthValue SargsApplier::evaluateDictionaryForColumn(const StringDictionary&
dictionary,
+ const PredicateLeaf&
leaf) const {
+ // Only handle IN expressions for dictionary filtering
+ if (leaf.getOperator() != PredicateLeaf::Operator::IN) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ const std::vector<Literal>& literals = leaf.getLiteralList();
+ if (literals.empty()) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ // Check if any dictionary entry matches any literal in the IN list
+ const int64_t* offsets = dictionary.dictionaryOffset.data();
+ const char* blob = dictionary.dictionaryBlob.data();
+ size_t dictSize = dictionary.dictionaryOffset.size() - 1;
+
+ for (size_t i = 0; i < dictSize; ++i) {
+ int64_t start = offsets[i];
+ int64_t length = offsets[i + 1] - start;
+ std::string_view dictEntry(blob + start, static_cast<size_t>(length));
+
+ // Check if this dictionary entry matches any literal in the IN list
+ for (const auto& literal : literals) {
+ if (dictEntry == literal.getStringView()) {
+ // Found a match - stripe might contain matching rows
+ return TruthValue::YES_NO_NULL;
Review Comment:
We can return TruthValue::YES when the dictionary is a subset of the literals.
##########
c++/src/Reader.cc:
##########
@@ -1119,6 +1123,73 @@ namespace orc {
return getStripeSize(stripeInfo) <= threshold;
}
+ /**
+ * Load stripe dictionaries for dictionary-based predicate pushdown.
+ * Only loads dictionaries for STRING/VARCHAR/CHAR columns with IN
expressions.
+ */
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
loadStripeDictionaries(
+ const proto::Footer& footer, const std::vector<bool>& selectedColumns,
+ const std::vector<uint64_t>& columnsWithInExpr, StripeStreams& stripe,
+ size_t dictSizeThreshold) {
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
dictionaries;
+
+ // Only load dictionaries for selected columns with IN expressions
+ for (uint64_t colId : columnsWithInExpr) {
+ if (!selectedColumns[colId] || colId >=
static_cast<uint64_t>(footer.types_size())) {
+ continue;
+ }
+
+ auto encoding = stripe.getEncoding(colId);
+ if (encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY &&
+ encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY_V2) {
+ continue;
+ }
+
+ auto typeKind = footer.types(static_cast<int>(colId)).kind();
+ if (typeKind != proto::Type_Kind_STRING && typeKind !=
proto::Type_Kind_VARCHAR &&
+ typeKind != proto::Type_Kind_CHAR) {
+ continue;
+ }
+
+ if (encoding.dictionary_size() > dictSizeThreshold) {
+ continue;
+ }
+
+ dictionaries[colId] = loadStringDictionary(colId, stripe,
stripe.getMemoryPool());
+ }
+
+ return dictionaries;
+ }
+
+ // Evaluate dictionaries for the current stripe to determine if it can be
skipped.
+ bool evaluateStripeDictionaries(RowReaderImpl& reader, const proto::Footer&
footer,
+ const std::vector<bool>& selectedColumns,
+ const proto::StripeFooter& stripeFooter,
+ const proto::StripeInformation& stripeInfo,
+ uint64_t currentStripe, SargsApplier*
sargsApplier,
+ const Timezone& localTimezone, const
Timezone& readerTimezone) {
+ const std::vector<uint64_t>& columnsWithInExpr =
sargsApplier->getColumnsWithInExpressions();
+ if (columnsWithInExpr.empty()) {
+ return true;
+ }
+
+ const Timezone& writerTimezone = stripeFooter.has_writer_timezone()
+ ?
getTimezoneByName(stripeFooter.writer_timezone())
+ : localTimezone;
+ StripeStreamsImpl stripeStreams(reader, currentStripe, stripeInfo,
stripeFooter,
+ stripeInfo.offset(),
*reader.getFileContents().stream,
+ writerTimezone, readerTimezone);
+
+ auto dictionaries =
Review Comment:
Dictionaries would be read twice if the stripe is needed. We could add a
stripe-wise dictionary cache for the column reader.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]