hhr293 commented on code in PR #12299:
URL: https://github.com/apache/gluten/pull/12299#discussion_r3412383227


##########
cpp/core/utils/tac/TypeAwareCompressCodec.cc:
##########
@@ -65,18 +80,38 @@ TypeAwareCompressCodec::decompress(const uint8_t* input, 
int64_t inputLen, uint8
 
   auto* in = input;
   auto codecId = static_cast<CodecId>(*in++);
-  [[maybe_unused]] auto tacType = *in++;
+  auto tacType = static_cast<int8_t>(*in++);
   auto dataLen = inputLen - kPayloadHeaderSize;
 
-  switch (codecId) {
-    case CodecId::kFFor: {
-      ARROW_ASSIGN_OR_RAISE(auto nDecoded, FForCodec::decompress(in, dataLen, 
output, outputLen));
-      (void)nDecoded;
-      return inputLen;
+  if (codecId != CodecId::kFFor) {
+    return arrow::Status::Invalid("Unknown type-aware codec ID: ", 
static_cast<int>(codecId));
+  }
+
+  int64_t nDecoded = 0;
+  int64_t valueSize = 0;
+  const char* typeName = nullptr;
+  switch (tacType) {
+    case tac::kUInt64: {
+      ARROW_ASSIGN_OR_RAISE(nDecoded, FForCodec::decompress(in, dataLen, 
output, outputLen));
+      valueSize = sizeof(uint64_t);
+      typeName = "uint64";
+      break;
+    }
+    case tac::kUInt128: {
+      ARROW_ASSIGN_OR_RAISE(nDecoded, FForCodec::decompress128(in, dataLen, 
output, outputLen));
+      valueSize = 2 * sizeof(uint64_t);
+      typeName = "uint128";
+      break;
     }
     default:
-      return arrow::Status::Invalid("Unknown type-aware codec ID: ", 
static_cast<int>(codecId));
+      return arrow::Status::Invalid("Unknown tac type in decompress: ", 
static_cast<int>(tacType));
+  }
+  const int64_t expected = outputLen / valueSize;
+  if (nDecoded != expected) {
+    return arrow::Status::Invalid(
+        "TAC decompress ", typeName, " value count mismatch: expected ", 
expected, " got ", nDecoded);
   }
+  return inputLen;

Review Comment:
   outputLen is always the exact uncompressed size — it comes from shuffle 
metadata
      which records the precise original byte length. Callers never pass an 
oversized
      buffer; the buffer is allocated to exactly outputLen bytes. This is the 
same
     contract as the 64-bit path and is consistent with how Arrow's 
Codec::Decompress
      works (caller provides exact expected output size).
   
     Relaxing to <= would silently accept partial decompression on corrupt 
input,
     which is worse. The strict equality check is intentional — any mismatch 
means
     data corruption and should fail loudly.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to