hhr293 commented on code in PR #12299:
URL: https://github.com/apache/gluten/pull/12299#discussion_r3412383227
##########
cpp/core/utils/tac/TypeAwareCompressCodec.cc:
##########
@@ -65,18 +80,38 @@ TypeAwareCompressCodec::decompress(const uint8_t* input,
int64_t inputLen, uint8
auto* in = input;
auto codecId = static_cast<CodecId>(*in++);
- [[maybe_unused]] auto tacType = *in++;
+ auto tacType = static_cast<int8_t>(*in++);
auto dataLen = inputLen - kPayloadHeaderSize;
- switch (codecId) {
- case CodecId::kFFor: {
- ARROW_ASSIGN_OR_RAISE(auto nDecoded, FForCodec::decompress(in, dataLen,
output, outputLen));
- (void)nDecoded;
- return inputLen;
+ if (codecId != CodecId::kFFor) {
+ return arrow::Status::Invalid("Unknown type-aware codec ID: ",
static_cast<int>(codecId));
+ }
+
+ int64_t nDecoded = 0;
+ int64_t valueSize = 0;
+ const char* typeName = nullptr;
+ switch (tacType) {
+ case tac::kUInt64: {
+ ARROW_ASSIGN_OR_RAISE(nDecoded, FForCodec::decompress(in, dataLen,
output, outputLen));
+ valueSize = sizeof(uint64_t);
+ typeName = "uint64";
+ break;
+ }
+ case tac::kUInt128: {
+ ARROW_ASSIGN_OR_RAISE(nDecoded, FForCodec::decompress128(in, dataLen,
output, outputLen));
+ valueSize = 2 * sizeof(uint64_t);
+ typeName = "uint128";
+ break;
}
default:
- return arrow::Status::Invalid("Unknown type-aware codec ID: ",
static_cast<int>(codecId));
+ return arrow::Status::Invalid("Unknown tac type in decompress: ",
static_cast<int>(tacType));
+ }
+ const int64_t expected = outputLen / valueSize;
+ if (nDecoded != expected) {
+ return arrow::Status::Invalid(
+ "TAC decompress ", typeName, " value count mismatch: expected ",
expected, " got ", nDecoded);
}
+ return inputLen;
Review Comment:
outputLen is always the exact uncompressed size — it comes from shuffle
metadata
which records the precise original byte length. Callers never pass an
oversized
buffer; the buffer is allocated to exactly outputLen bytes. This is the
same
contract as the 64-bit path and is consistent with how Arrow's
Codec::Decompress
works (caller provides exact expected output size).
Relaxing to <= would silently accept partial decompression on corrupt
input,
which is worse. The strict equality check is intentional — any mismatch
means
data corruption and should fail loudly.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]