This is an automated email from the ASF dual-hosted git repository.

vhs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new f063aa580771 fix(hfile): use Hadoop WritableUtils VarInt encoding in HFile block index writer (#18465)
f063aa580771 is described below

commit f063aa58077122d78ff82e9284af7c41232669db
Author: Asish Kumar <[email protected]>
AuthorDate: Wed Apr 8 20:33:14 2026 +0530

    fix(hfile): use Hadoop WritableUtils VarInt encoding in HFile block index writer (#18465)
    
    * fix(hfile): use Hadoop WritableUtils VarInt encoding in HFile block index writer
    
    The native HFile writer used protobuf varint encoding (CodedOutputStream.writeUInt32NoTag)
    for key lengths in root and meta index blocks, while the reader used Hadoop WritableUtils
    VarLong decoding. These two encodings are incompatible for values >= 128: protobuf uses
    base-128 little-endian with MSB continuation bits, while Hadoop uses a header byte
    indicating the number of following big-endian value bytes.
    
    For keys with content length >= 126 characters (varint value >= 128 after adding the
    2-byte row key length prefix), the reader misinterpreted the protobuf-encoded bytes as
    a Hadoop VarLong, producing a negative key length and causing NegativeArraySizeException.
    
    This change switches HFileRootIndexBlock and HFileMetaIndexBlock writers to use Hadoop
    WritableUtils VarInt encoding via a new IOUtils.writeVarInt method, matching the HBase
    HFile format that the reader already expects. Existing files with keys shorter than 126
    characters are unaffected since both encodings produce identical single-byte output for
    values 0-127. Files with longer keys were already unreadable due to this bug.
    
    Signed-off-by: Asish Kumar <[email protected]>
    
    * fix(hfile): update test comment to reference Hadoop VarInt encoding
    
    Updated the comment in TestHFileWriter from "protobuf varint" to "Hadoop
    VarInt" to accurately reflect the encoding used after the fix.
    
    Signed-off-by: Asish Kumar <[email protected]>
    
    ---------
    
    Signed-off-by: Asish Kumar <[email protected]>
---
 .../apache/hudi/io/hfile/HFileMetaIndexBlock.java  | 12 +++---
 .../apache/hudi/io/hfile/HFileRootIndexBlock.java  | 15 +++----
 .../main/java/org/apache/hudi/io/util/IOUtils.java | 36 +++++++++++++++++
 .../org/apache/hudi/io/hfile/TestHFileWriter.java  | 47 ++++++++++++++++++++++
 .../java/org/apache/hudi/io/util/TestIOUtils.java  | 33 +++++++++++++++
 5 files changed, 127 insertions(+), 16 deletions(-)
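
The encoding mismatch described in the commit message can be demonstrated with a small
standalone program. The sketch below is illustrative only and is not part of the patch:
the class name is made up, and it assumes hadoop-common (org.apache.hadoop.io.WritableUtils)
and protobuf-java (com.google.protobuf.CodedOutputStream) are available on the classpath.

import com.google.protobuf.CodedOutputStream;
import org.apache.hadoop.io.WritableUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class VarIntMismatchSketch {

  public static void main(String[] args) throws IOException {
    // 100 encodes to the same single byte (0x64) in both schemes.
    System.out.println("100 -> protobuf: " + hex(protobufVarint(100))
        + " | hadoop: " + hex(hadoopVarInt(100)));

    // 146 (>= 128) diverges: protobuf emits 92 01, Hadoop emits 8F 92.
    byte[] protobufBytes = protobufVarint(146);
    System.out.println("146 -> protobuf: " + hex(protobufBytes)
        + " | hadoop: " + hex(hadoopVarInt(146)));

    // A Hadoop-style reader treats the protobuf first byte 0x92 as the signed
    // single-byte value -110, i.e. a negative key length.
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(protobufBytes))) {
      System.out.println("protobuf bytes decoded as Hadoop VarLong: " + WritableUtils.readVLong(in));
    }
  }

  // Protobuf base-128 varint: 7 value bits per byte, low bits first, MSB as continuation flag.
  private static byte[] protobufVarint(int value) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    CodedOutputStream out = CodedOutputStream.newInstance(buffer);
    out.writeUInt32NoTag(value);
    out.flush();
    return buffer.toByteArray();
  }

  // Hadoop WritableUtils VarInt: a header byte carrying sign and byte count,
  // followed by the value bytes in big-endian order.
  private static byte[] hadoopVarInt(int value) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(buffer)) {
      WritableUtils.writeVInt(out, value);
    }
    return buffer.toByteArray();
  }

  private static String hex(byte[] bytes) {
    StringBuilder sb = new StringBuilder();
    for (byte b : bytes) {
      sb.append(String.format("%02X ", b & 0xFF));
    }
    return sb.toString().trim();
  }
}

For values 0 through 127 both helpers print the same single byte, which matches the
compatibility note in the commit message; the divergence only appears once the encoded
value reaches 128.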

diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileMetaIndexBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileMetaIndexBlock.java
index ef292e5370ab..72f720b9c494 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileMetaIndexBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileMetaIndexBlock.java
@@ -24,6 +24,8 @@ import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
+import static org.apache.hudi.io.util.IOUtils.writeVarInt;
+
 public class HFileMetaIndexBlock extends HFileIndexBlock {
 
   private HFileMetaIndexBlock(HFileContext context) {
@@ -42,13 +44,9 @@ public class HFileMetaIndexBlock extends HFileIndexBlock {
         outputStream.writeLong(entry.getOffset());
         outputStream.writeInt(entry.getSize());
         // Key length.
-        try {
-          byte[] keyLength = getVariableLengthEncodedBytes(entry.getFirstKey().getLength());
-          outputStream.write(keyLength);
-        } catch (IOException e) {
-          throw new RuntimeException(
-              "Failed to serialize number: " + 
entry.getFirstKey().getLength());
-        }
+        // Use Hadoop WritableUtils VarInt encoding to match HBase's HFile format.
+        byte[] keyLength = writeVarInt(entry.getFirstKey().getLength());
+        outputStream.write(keyLength);
         // Note that: NO two-bytes for encoding key length.
         // Key.
         outputStream.write(entry.getFirstKey().getBytes());
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
index 07d7b190c9c7..5b32a5df8925 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
@@ -35,6 +35,7 @@ import static org.apache.hudi.io.util.IOUtils.decodeVarLongSizeOnDisk;
 import static org.apache.hudi.io.util.IOUtils.readInt;
 import static org.apache.hudi.io.util.IOUtils.readLong;
 import static org.apache.hudi.io.util.IOUtils.readVarLong;
+import static org.apache.hudi.io.util.IOUtils.writeVarInt;
 
 /**
  * Represents a {@link HFileBlockType#ROOT_INDEX} block.
@@ -108,15 +109,11 @@ public class HFileRootIndexBlock extends HFileIndexBlock {
         outputStream.writeLong(entry.getOffset());
         outputStream.writeInt(entry.getSize());
 
-        // Key length + 2.
-        try {
-          byte[] keyLength = getVariableLengthEncodedBytes(
-              entry.getFirstKey().getLength() + SIZEOF_INT16);
-          outputStream.write(keyLength);
-        } catch (IOException e) {
-          throw new RuntimeException(
-              "Failed to serialize number: " + entry.getFirstKey().getLength() 
+ SIZEOF_INT16);
-        }
+        // Key length + 2 (SIZEOF_INT16 for the 2-byte row key length prefix).
+        // Use Hadoop WritableUtils VarInt encoding to match HBase's HFile format.
+        byte[] keyLength = writeVarInt(
+            entry.getFirstKey().getLength() + SIZEOF_INT16);
+        outputStream.write(keyLength);
         // Key length.
         outputStream.writeShort((short) entry.getFirstKey().getLength());
         // Key.
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/util/IOUtils.java b/hudi-io/src/main/java/org/apache/hudi/io/util/IOUtils.java
index 3fd5930add46..274dfe6d8f90 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/util/IOUtils.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/util/IOUtils.java
@@ -150,6 +150,42 @@ public class IOUtils {
     return value < -120 || (value >= -112 && value < 0);
   }
 
+  /**
+   * Encodes an integer using Hadoop-compatible variable-length encoding
+   * (WritableUtils VarInt format) and returns the encoded bytes.
+   *
+   * <p>For values between -112 and 127 (inclusive), a single byte is used.
+   * For other values, the first byte indicates the number of following bytes
+   * and the sign, followed by the value in big-endian order.
+   *
+   * @param value the integer value to encode.
+   * @return the encoded byte array.
+   */
+  public static byte[] writeVarInt(int value) {
+    if (value >= -112 && value <= 127) {
+      return new byte[] {(byte) value};
+    }
+    long longValue = value;
+    int len = -112;
+    if (longValue < 0) {
+      longValue ^= -1L;
+      len = -120;
+    }
+    long tmp = longValue;
+    while (tmp != 0) {
+      tmp >>= 8;
+      len--;
+    }
+    int numBytes = (len < -120) ? -(len + 120) : -(len + 112);
+    byte[] result = new byte[1 + numBytes];
+    result[0] = (byte) len;
+    for (int idx = 0; idx < numBytes; idx++) {
+      int shiftBits = (numBytes - idx - 1) * 8;
+      result[1 + idx] = (byte) ((longValue >> shiftBits) & 0xFF);
+    }
+    return result;
+  }
+
   /**
    * @param bytes  input byte array.
    * @param offset offset to start reading.
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
index d7048738afb4..a4488351c7e2 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
@@ -185,6 +185,53 @@ class TestHFileWriter {
     }
   }
 
+  @Test
+  void testLongKeys() throws IOException {
+    // Test that HFile blocks with long keys (>= 126 chars) can be written and read correctly.
+    // This verifies the fix for the varint encoding mismatch in the root index block.
+    HFileContext context = new HFileContext.Builder().blockSize(100).build();
+    String testFile = TEST_FILE;
+    int numRecords = 10;
+    // Generate keys longer than 126 characters to trigger multi-byte Hadoop VarInt encoding
+    // in the root index block. The varint encodes (key_content_length + 2), so content >= 126
+    // produces a value >= 128 which requires 2+ bytes in Hadoop VarInt format.
+    char[] chars = new char[200];
+    Arrays.fill(chars, 'a');
+    String longPrefix = new String(chars);
+    try (DataOutputStream outputStream =
+             new DataOutputStream(Files.newOutputStream(Paths.get(testFile)));
+         HFileWriter writer = new HFileWriterImpl(context, outputStream)) {
+      for (int i = 0; i < numRecords; i++) {
+        String key = longPrefix + String.format("%04d", i);
+        writer.append(key, String.format("value%04d", i).getBytes());
+      }
+    }
+
+    // Validate that all records can be read back correctly.
+    try (FileChannel channel = FileChannel.open(Paths.get(testFile), StandardOpenOption.READ)) {
+      ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
+      SeekableDataInputStream inputStream =
+          new ByteArraySeekableDataInputStream(new ByteBufferBackedInputStream(buf));
+      HFileReaderImpl reader = new HFileReaderImpl(inputStream, channel.size());
+      reader.initializeMetadata();
+      assertEquals(numRecords, reader.getNumKeyValueEntries());
+      reader.seekTo();
+      for (int i = 0; i < numRecords; i++) {
+        KeyValue kv = reader.getKeyValue().get();
+        String expectedKey = longPrefix + String.format("%04d", i);
+        assertEquals(expectedKey, kv.getKey().getContentInString());
+        assertArrayEquals(
+            String.format("value%04d", i).getBytes(),
+            Arrays.copyOfRange(
+                kv.getBytes(),
+                kv.getValueOffset(),
+                kv.getValueOffset() + kv.getValueLength())
+        );
+        reader.next();
+      }
+    }
+  }
+
   private static void writeTestFile() throws Exception {
     try (
         DataOutputStream outputStream =
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/util/TestIOUtils.java b/hudi-io/src/test/java/org/apache/hudi/io/util/TestIOUtils.java
index bc20d47a860b..a100446428e1 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/util/TestIOUtils.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/util/TestIOUtils.java
@@ -92,6 +92,39 @@ public class TestIOUtils {
     assertEquals(expectedNumber < 0, IOUtils.isNegativeVarLong(bytes[0]));
   }
 
+  @Test
+  public void testWriteVarIntRoundTrip() {
+    // Verify that writeVarInt produces bytes that readVarLong can decode correctly
+    int[] testValues = {0, 1, 127, 128, 146, 200, 255, 256, 300, 1000, 32080, 65535, 100000,
+        2034958, 632492350, Integer.MAX_VALUE};
+    for (int value : testValues) {
+      byte[] encoded = IOUtils.writeVarInt(value);
+      int size = IOUtils.decodeVarLongSizeOnDisk(encoded, 0);
+      assertEquals(encoded.length, size, "Size mismatch for value " + value);
+      long decoded = IOUtils.readVarLong(encoded, 0, size);
+      assertEquals(value, decoded, "Round-trip mismatch for value " + value);
+    }
+  }
+
+  @Test
+  public void testWriteVarIntMatchesExistingTestVectors() {
+    // Cross-check writeVarInt against known Hadoop VarLong encoding from the existing test data
+    assertEquals(1, IOUtils.writeVarInt(0).length);
+    assertEquals(0, IOUtils.writeVarInt(0)[0]);
+    assertEquals(1, IOUtils.writeVarInt(98).length);
+    assertEquals(98, IOUtils.writeVarInt(98)[0]);
+
+    // Value 208 requires 2 bytes
+    byte[] enc208 = IOUtils.writeVarInt(208);
+    assertEquals(2, enc208.length);
+    assertEquals(208, IOUtils.readVarLong(enc208, 0));
+
+    // Value 32080 requires 3 bytes
+    byte[] enc32080 = IOUtils.writeVarInt(32080);
+    assertEquals(3, enc32080.length);
+    assertEquals(32080, IOUtils.readVarLong(enc32080, 0));
+  }
+
   @Test
   public void testByteArrayCompareTo() {
     byte[] bytes1 = new byte[] {(byte) 0x9b, 0, 0x18, 0x65, 0x2e, (byte) 0xf3};

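
For context on the header-byte scheme described in the writeVarInt javadoc above, the
format can be decoded by a few lines of standalone Java. This is an editorial sketch of
the on-disk layout only, with a made-up class name; it is not Hudi code, and the reader
in this repository continues to use the existing IOUtils.readVarLong and
decodeVarLongSizeOnDisk methods shown in the diff.

// Standalone sketch of the Hadoop WritableUtils VarInt/VarLong layout emitted by writeVarInt.
public final class HadoopVarIntFormatSketch {

  // Header byte -112..127: the value itself, stored in a single byte.
  // Header byte -113..-120: 1..8 following big-endian bytes of a positive value.
  // Header byte -121..-128: 1..8 following big-endian bytes of a bit-flipped negative value.
  public static long decode(byte[] bytes, int offset) {
    byte header = bytes[offset];
    if (header >= -112) {
      return header;
    }
    boolean negative = header < -120;
    int numBytes = negative ? (-header - 120) : (-header - 112);
    long value = 0;
    for (int i = 0; i < numBytes; i++) {
      value = (value << 8) | (bytes[offset + 1 + i] & 0xFF);
    }
    return negative ? ~value : value;
  }

  public static void main(String[] args) {
    // 0x8F 0x92 is the Hadoop encoding of 146: header -113 (one value byte), then 0x92.
    System.out.println(decode(new byte[] {(byte) 0x8F, (byte) 0x92}, 0)); // prints 146
  }
}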