This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 7c9f3ae4f3 Support reading var-length type (STRING/BYTES/BIG_DECIMAL) 
as murmur3 32/64/128 bits hash (#15213)
7c9f3ae4f3 is described below

commit 7c9f3ae4f3bfd06b005f50f600935ebad9f0a44d
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Sat Mar 29 18:21:34 2025 -0600

    Support reading var-length type (STRING/BYTES/BIG_DECIMAL) as murmur3 
32/64/128 bits hash (#15213)
---
 .../org/apache/pinot/core/common/BlockValSet.java  |  12 ++
 .../apache/pinot/core/common/DataBlockCache.java   |  21 +++
 .../org/apache/pinot/core/common/DataFetcher.java  |  54 ++++++++
 .../operator/docvalsets/ProjectionBlockValSet.java |  24 ++++
 .../operator/docvalsets/TransformBlockValSet.java  |  34 +++++
 .../local/io/util/FixedByteValueReaderWriter.java  |  20 +--
 .../pinot/segment/local/io/util/ValueReader.java   |  43 +++++-
 .../local/io/util/VarLengthValueReader.java        |   6 +-
 .../index/readers/BaseImmutableDictionary.java     |  12 ++
 .../index/readers/BigDecimalDictionary.java        |  24 ++++
 .../segment/index/readers/BytesDictionary.java     |  24 ++++
 .../segment/index/readers/StringDictionary.java    |  24 ++++
 .../pinot/segment/spi/index/reader/Dictionary.java |  31 +++++
 .../spi/index/reader/ForwardIndexReader.java       |  21 ++-
 .../pinot/spi/utils/hash/MurmurHashFunctions.java  | 148 ++++++++-------------
 15 files changed, 374 insertions(+), 124 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java 
b/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
index c8aac0bdfb..495f517dfa 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
@@ -113,6 +113,18 @@ public interface BlockValSet {
    */
   byte[][] getBytesValuesSV();
 
+  default int[] get32BitsMurmur3HashValuesSV() {
+    throw new UnsupportedOperationException();
+  }
+
+  default long[] get64BitsMurmur3HashValuesSV() {
+    throw new UnsupportedOperationException();
+  }
+
+  default long[][] get128BitsMurmur3HashValuesSV() {
+    throw new UnsupportedOperationException();
+  }
+
   /**
    * MULTI-VALUED COLUMN APIs
    */
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java 
b/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
index d5ba2e1202..18ddda44b0 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
@@ -242,6 +242,27 @@ public class DataBlockCache implements AutoCloseable {
     return bytesValues;
   }
 
+  public int[] get32BitsMurmur3HashValuesForSVColumn(String column) {
+    // TODO: This is not cached
+    int[] hashValues = new int[_length];
+    _dataFetcher.fetch32BitsMurmur3HashValues(column, _docIds, _length, 
hashValues);
+    return hashValues;
+  }
+
+  public long[] get64BitsMurmur3HashValuesForSVColumn(String column) {
+    // TODO: This is not cached
+    long[] hashValues = new long[_length];
+    _dataFetcher.fetch64BitsMurmur3HashValues(column, _docIds, _length, 
hashValues);
+    return hashValues;
+  }
+
+  public long[][] get128BitsMurmur3HashValuesForSVColumn(String column) {
+    // TODO: This is not cached
+    long[][] hashValues = new long[_length][];
+    _dataFetcher.fetch128BitsMurmur3HashValues(column, _docIds, _length, 
hashValues);
+    return hashValues;
+  }
+
   /**
    * MULTI-VALUED COLUMN API
    */
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java 
b/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
index 681b1f58db..384014c808 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
@@ -185,6 +185,18 @@ public class DataFetcher implements AutoCloseable {
     _columnValueReaderMap.get(column).readMapValues(inDocIds, length, 
outValues);
   }
 
+  public void fetch32BitsMurmur3HashValues(String column, int[] inDocIds, int 
length, int[] outValues) {
+    _columnValueReaderMap.get(column).read32BitsMurmur3HashValues(inDocIds, 
length, outValues);
+  }
+
+  public void fetch64BitsMurmur3HashValues(String column, int[] inDocIds, int 
length, long[] outValues) {
+    _columnValueReaderMap.get(column).read64BitsMurmur3HashValues(inDocIds, 
length, outValues);
+  }
+
+  public void fetch128BitsMurmur3HashValues(String column, int[] inDocIds, int 
length, long[][] outValues) {
+    _columnValueReaderMap.get(column).read128BitsMurmur3HashValues(inDocIds, 
length, outValues);
+  }
+
   /**
    * MULTI-VALUED COLUMN API
    */
@@ -422,6 +434,48 @@ public class DataFetcher implements AutoCloseable {
       }
     }
 
+    void read32BitsMurmur3HashValues(int[] docIds, int length, int[] 
valueBuffer) {
+      Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+      ForwardIndexReaderContext readerContext = getReaderContext();
+      if (_dictionary != null) {
+        int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+        _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+        _dictionary.read32BitsMurmur3HashValues(dictIdBuffer, length, 
valueBuffer);
+      } else {
+        for (int i = 0; i < length; i++) {
+          valueBuffer[i] = _reader.get32BitsMurmur3Hash(docIds[i], 
readerContext);
+        }
+      }
+    }
+
+    void read64BitsMurmur3HashValues(int[] docIds, int length, long[] 
valueBuffer) {
+      Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+      ForwardIndexReaderContext readerContext = getReaderContext();
+      if (_dictionary != null) {
+        int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+        _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+        _dictionary.read64BitsMurmur3HashValues(dictIdBuffer, length, 
valueBuffer);
+      } else {
+        for (int i = 0; i < length; i++) {
+          valueBuffer[i] = _reader.get64BitsMurmur3Hash(docIds[i], 
readerContext);
+        }
+      }
+    }
+
+    void read128BitsMurmur3HashValues(int[] docIds, int length, long[][] 
valueBuffer) {
+      Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+      ForwardIndexReaderContext readerContext = getReaderContext();
+      if (_dictionary != null) {
+        int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+        _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+        _dictionary.read128BitsMurmur3HashValues(dictIdBuffer, length, 
valueBuffer);
+      } else {
+        for (int i = 0; i < length; i++) {
+          valueBuffer[i] = _reader.get128BitsMurmur3Hash(docIds[i], 
readerContext);
+        }
+      }
+    }
+
     void readDictIdsMV(int[] docIds, int length, int[][] dictIdsBuffer) {
       Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
       ForwardIndexReaderContext readerContext = getReaderContext();
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
index 76cc124e1f..0dfd8b52d6 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
@@ -162,6 +162,30 @@ public class ProjectionBlockValSet implements BlockValSet {
     }
   }
 
+  @Override
+  public int[] get32BitsMurmur3HashValuesSV() {
+    try (InvocationScope scope = 
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+      recordReadValues(scope, DataType.BYTES, true);
+      return _dataBlockCache.get32BitsMurmur3HashValuesForSVColumn(_column);
+    }
+  }
+
+  @Override
+  public long[] get64BitsMurmur3HashValuesSV() {
+    try (InvocationScope scope = 
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+      recordReadValues(scope, DataType.BYTES, true);
+      return _dataBlockCache.get64BitsMurmur3HashValuesForSVColumn(_column);
+    }
+  }
+
+  @Override
+  public long[][] get128BitsMurmur3HashValuesSV() {
+    try (InvocationScope scope = 
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+      recordReadValues(scope, DataType.BYTES, true);
+      return _dataBlockCache.get128BitsMurmur3HashValuesForSVColumn(_column);
+    }
+  }
+
   @Override
   public int[][] getDictionaryIdsMV() {
     try (InvocationScope scope = 
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
index 6ae584a486..823dd05cbe 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
@@ -30,6 +30,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.trace.InvocationRecording;
 import org.apache.pinot.spi.trace.InvocationScope;
 import org.apache.pinot.spi.trace.Tracing;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
 import org.roaringbitmap.RoaringBitmap;
 
 
@@ -142,6 +143,39 @@ public class TransformBlockValSet implements BlockValSet {
     }
   }
 
+  @Override
+  public int[] get32BitsMurmur3HashValuesSV() {
+    byte[][] bytes = getBytesValuesSV();
+    int length = bytes.length;
+    int[] hashValues = new int[length];
+    for (int i = 0; i < length; i++) {
+      hashValues[i] = MurmurHashFunctions.murmurHash3X64Bit32(bytes[i], 0);
+    }
+    return hashValues;
+  }
+
+  @Override
+  public long[] get64BitsMurmur3HashValuesSV() {
+    byte[][] bytes = getBytesValuesSV();
+    int length = bytes.length;
+    long[] hashValues = new long[length];
+    for (int i = 0; i < length; i++) {
+      hashValues[i] = MurmurHashFunctions.murmurHash3X64Bit64(bytes[i], 0);
+    }
+    return hashValues;
+  }
+
+  @Override
+  public long[][] get128BitsMurmur3HashValuesSV() {
+    byte[][] bytes = getBytesValuesSV();
+    int length = bytes.length;
+    long[][] hashValues = new long[length][];
+    for (int i = 0; i < length; i++) {
+      hashValues[i] = 
MurmurHashFunctions.murmurHash3X64Bit128AsLongs(bytes[i], 0);
+    }
+    return hashValues;
+  }
+
   @Override
   public int[][] getDictionaryIdsMV() {
     try (InvocationScope scope = 
Tracing.getTracer().createScope(TransformBlockValSet.class)) {
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
index 73ca8c09f4..2e85a81c1a 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
@@ -53,10 +53,8 @@ public final class FixedByteValueReaderWriter implements 
ValueReader {
     return _dataBuffer.getDouble((long) index * Double.BYTES);
   }
 
-  /**
-   * Reads the unpadded bytes into the given buffer and returns the length.
-   */
-  private int readUnpaddedBytes(int index, int numBytesPerValue, byte[] 
buffer) {
+  @Override
+  public int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer) 
{
     // Based on the ZeroInWord algorithm: 
http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
     assert buffer.length >= numBytesPerValue;
     long startOffset = (long) index * numBytesPerValue;
@@ -85,20 +83,6 @@ public final class FixedByteValueReaderWriter implements 
ValueReader {
     return i;
   }
 
-  @Override
-  public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] 
buffer) {
-    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
-    byte[] bytes = new byte[length];
-    System.arraycopy(buffer, 0, bytes, 0, length);
-    return bytes;
-  }
-
-  @Override
-  public String getUnpaddedString(int index, int numBytesPerValue, byte[] 
buffer) {
-    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
-    return new String(buffer, 0, length, UTF_8);
-  }
-
   @Override
   public String getPaddedString(int index, int numBytesPerValue, byte[] 
buffer) {
     assert buffer.length >= numBytesPerValue;
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
index 4eaf17e37e..9aa9382e31 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
@@ -21,6 +21,7 @@ package org.apache.pinot.segment.local.io.util;
 import java.io.Closeable;
 import java.math.BigDecimal;
 import org.apache.pinot.spi.utils.BigDecimalUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
 
 
 /**
@@ -40,16 +41,30 @@ public interface ValueReader extends Closeable {
     return BigDecimalUtils.deserialize(getBytes(index, numBytesPerValue));
   }
 
+  /**
+   * Reads the unpadded bytes into the given buffer and returns the length.
+   * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
+   */
+  int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer);
+
   /**
    * Returns un-padded bytes for string.
    * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
    */
-  byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer);
+  default byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] 
buffer) {
+    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+    byte[] value = new byte[length];
+    System.arraycopy(buffer, 0, value, 0, length);
+    return value;
+  }
 
   /**
    * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
    */
-  String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer);
+  default String getUnpaddedString(int index, int numBytesPerValue, byte[] 
buffer) {
+    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+    return new String(buffer, 0, length, java.nio.charset.StandardCharsets.UTF_8); // always UTF-8, never the platform default
+  }
 
   /**
    * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
@@ -61,6 +76,30 @@ public interface ValueReader extends Closeable {
    */
   byte[] getBytes(int index, int numBytesPerValue);
 
+  /**
+   * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
+   */
+  default int get32BitsMurmur3Hash(int index, int numBytesPerValue, byte[] 
buffer) {
+    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+    return MurmurHashFunctions.murmurHash3X64Bit32(buffer, length, 0);
+  }
+
+  /**
+   * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
+   */
+  default long get64BitsMurmur3Hash(int index, int numBytesPerValue, byte[] 
buffer) {
+    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+    return MurmurHashFunctions.murmurHash3X64Bit64(buffer, length, 0);
+  }
+
+  /**
+   * NOTE: The passed in reusable buffer should have capacity of at least 
{@code numBytesPerValue}.
+   */
+  default long[] get128BitsMurmur3Hash(int index, int numBytesPerValue, byte[] 
buffer) {
+    int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+    return MurmurHashFunctions.murmurHash3X64Bit128AsLongs(buffer, length, 0);
+  }
+
   /**
    * Returns the comparison result of the UTF-8 decoded values.
    */
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
index d9384bf16f..502d38cfaa 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
@@ -22,8 +22,6 @@ import java.util.List;
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
 import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 
 /**
  * The value reader for var-length values (STRING and BYTES). See {@link 
VarLengthValueWriter} for the file layout.
@@ -89,7 +87,7 @@ public class VarLengthValueReader implements ValueReader {
   }
 
   @Override
-  public String getUnpaddedString(int index, int numBytesPerValue, byte[] 
buffer) {
+  public int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer) 
{
     assert buffer.length >= numBytesPerValue;
 
     // Read the offset of the byte array first and then read the actual byte 
array.
@@ -100,7 +98,7 @@ public class VarLengthValueReader implements ValueReader {
 
     assert numBytesPerValue >= length;
     _dataBuffer.copyTo(startOffset, buffer, 0, length);
-    return new String(buffer, 0, length, UTF_8);
+    return length;
   }
 
   public void recordOffsetRanges(int index, long baseOffset, 
List<ForwardIndexReader.ByteRange> rangeList) {
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
index d075306000..4ba8895bea 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
@@ -278,6 +278,18 @@ public abstract class BaseImmutableDictionary implements 
Dictionary {
     return _valueReader.getBytes(dictId, _numBytesPerValue);
   }
 
+  public int get32BitsMurmur3Hash(int dictId, byte[] buffer) {
+    return _valueReader.get32BitsMurmur3Hash(dictId, _numBytesPerValue, 
buffer);
+  }
+
+  public long get64BitsMurmur3Hash(int dictId, byte[] buffer) {
+    return _valueReader.get64BitsMurmur3Hash(dictId, _numBytesPerValue, 
buffer);
+  }
+
+  public long[] get128BitsMurmur3HashValue(int dictId, byte[] buffer) {
+    return _valueReader.get128BitsMurmur3Hash(dictId, _numBytesPerValue, 
buffer);
+  }
+
   protected byte[] getBuffer() {
     return new byte[_numBytesPerValue];
   }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
index a6cfa817c6..00efcc3b61 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
@@ -97,4 +97,28 @@ public class BigDecimalDictionary extends 
BaseImmutableDictionary {
   public byte[] getBytesValue(int dictId) {
     return getBytes(dictId);
   }
+
+  @Override
+  public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+    }
+  }
 }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
index 3528f1e74d..db3401d3af 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
@@ -104,4 +104,28 @@ public class BytesDictionary extends 
BaseImmutableDictionary {
   public byte[] getBytesValue(int dictId) {
     return getBytes(dictId);
   }
+
+  @Override
+  public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+    }
+  }
 }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
index 28f510eed8..60f604660f 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
@@ -169,4 +169,28 @@ public class StringDictionary extends 
BaseImmutableDictionary {
       outValues[i] = getUnpaddedBytes(dictIds[i], buffer);
     }
   }
+
+  @Override
+  public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+    }
+  }
+
+  @Override
+  public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][] 
outValues) {
+    byte[] buffer = getBuffer();
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+    }
+  }
 }
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
index 2edcf51720..0f61d3947e 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
@@ -27,6 +27,7 @@ import org.apache.pinot.segment.spi.index.IndexReader;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.utils.ByteArray;
 import org.apache.pinot.spi.utils.MapUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
 
 
 /**
@@ -202,6 +203,18 @@ public interface Dictionary extends IndexReader {
     return new ByteArray(getBytesValue(dictId));
   }
 
+  default int get32BitsMurmur3HashValue(int dictId) {
+    return MurmurHashFunctions.murmurHash3X64Bit32(getBytesValue(dictId), 0);
+  }
+
+  default long get64BitsMurmur3HashValue(int dictId) {
+    return MurmurHashFunctions.murmurHash3X64Bit64(getBytesValue(dictId), 0);
+  }
+
+  default long[] get128BitsMurmur3HashValue(int dictId) {
+    return 
MurmurHashFunctions.murmurHash3X64Bit128AsLongs(getBytesValue(dictId), 0);
+  }
+
   // Batch read APIs
 
   default void readIntValues(int[] dictIds, int length, int[] outValues) {
@@ -276,6 +289,24 @@ public interface Dictionary extends IndexReader {
     }
   }
 
+  default void read32BitsMurmur3HashValues(int[] dictIds, int length, int[] 
outValues) {
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get32BitsMurmur3HashValue(dictIds[i]);
+    }
+  }
+
+  default void read64BitsMurmur3HashValues(int[] dictIds, int length, long[] 
outValues) {
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get64BitsMurmur3HashValue(dictIds[i]);
+    }
+  }
+
+  default void read128BitsMurmur3HashValues(int[] dictIds, int length, 
long[][] outValues) {
+    for (int i = 0; i < length; i++) {
+      outValues[i] = get128BitsMurmur3HashValue(dictIds[i]);
+    }
+  }
+
   default void getDictIds(List<String> values, IntSet dictIds) {
     for (String value : values) {
       int dictId = indexOf(value);
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
index b3d9f842d7..dc0d37b986 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
@@ -30,6 +30,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.utils.BigDecimalUtils;
 import org.apache.pinot.spi.utils.BytesUtils;
 import org.apache.pinot.spi.utils.MapUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
 
 
 /**
@@ -512,6 +513,18 @@ public interface ForwardIndexReader<T extends 
ForwardIndexReaderContext> extends
         + "ForwardIndexReader is being created to read this column.");
   }
 
+  default int get32BitsMurmur3Hash(int docId, T context) {
+    return MurmurHashFunctions.murmurHash3X64Bit32(getBytes(docId, context), 
0);
+  }
+
+  default long get64BitsMurmur3Hash(int docId, T context) {
+    return MurmurHashFunctions.murmurHash3X64Bit64(getBytes(docId, context), 
0);
+  }
+
+  default long[] get128BitsMurmur3Hash(int docId, T context) {
+    return MurmurHashFunctions.murmurHash3X64Bit128AsLongs(getBytes(docId, 
context), 0);
+  }
+
   /**
    * MULTI-VALUE COLUMN RAW INDEX APIs
    */
@@ -995,10 +1008,10 @@ public interface ForwardIndexReader<T extends 
ForwardIndexReaderContext> extends
 
   /**
    * Returns whether the forward index supports recording the byte ranges 
accessed while reading a given docId.
-   * For readers that do support this info, caller should check if the buffer 
is a {@link isFixedOffsetMappingType()}.
-   * If yes, the byte range mapping for a docId can be calculated using the 
{@link getRawDataStartOffset()} and the
-   * {@link getDocLength()} functions.
-   * if not, caller should use the {@link recordDocIdByteRanges()} function to 
get the list of byte ranges accessed
+   * For readers that do support this info, caller should check if the buffer 
is a {@link #isFixedOffsetMappingType}.
+   * If yes, the byte range mapping for a docId can be calculated using the 
{@link #getRawDataStartOffset} and the
+   * {@link #getDocLength} functions.
+   * If not, caller should use the {@link #recordDocIdByteRanges} function to 
get the list of byte ranges accessed
    * for a docId.
    */
   default boolean isBufferByteRangeInfoSupported() {
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
index 276eb0f46e..0c43ed4b0d 100644
--- 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
+++ 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
@@ -279,94 +279,60 @@ public class MurmurHashFunctions {
   }
 
   /**
-   * Hash a value using the x64 128 bit variant of MurmurHash3
-   *
-   * @param key value to hash
-   * @param seed random value
-   * @return 128 bit hashed key, in an array containing two longs
+   * Hash a value using the x64 128 bit variant of MurmurHash3.
    */
-  public static byte[] murmurHash3X64Bit128(final byte[] key, final int seed) {
-    State state = new State();
-
-    state._h1 = 0x9368e53c2f6af274L ^ seed;
-    state._h2 = 0x586dcd208f7cd3fdL ^ seed;
-
-    state._c1 = 0x87c37b91114253d5L;
-    state._c2 = 0x4cf5ad432745937fL;
-
-    for (int i = 0; i < key.length / 16; i++) {
-      state._k1 = getblock(key, i * 2 * 8);
-      state._k2 = getblock(key, (i * 2 + 1) * 8);
-
-      bmix(state);
-    }
+  public static byte[] murmurHash3X64Bit128(byte[] key, int length, int seed) {
+    State state = murmurHash3X64(key, length, seed);
+    ByteBuffer buffer = ByteBuffer.allocate(16);
+    buffer.putLong(state._h1);
+    buffer.putLong(state._h2);
+    return buffer.array();
+  }
 
-    state._k1 = 0;
-    state._k2 = 0;
+  public static byte[] murmurHash3X64Bit128(byte[] key, int seed) {
+    return murmurHash3X64Bit128(key, key.length, seed);
+  }
 
-    int tail = (key.length >>> 4) << 4;
-    // CHECKSTYLE:OFF: checkstyle:coding
-    switch (key.length & 15) {
-      case 15:
-        state._k2 ^= (long) key[tail + 14] << 48;
-      case 14:
-        state._k2 ^= (long) key[tail + 13] << 40;
-      case 13:
-        state._k2 ^= (long) key[tail + 12] << 32;
-      case 12:
-        state._k2 ^= (long) key[tail + 11] << 24;
-      case 11:
-        state._k2 ^= (long) key[tail + 10] << 16;
-      case 10:
-        state._k2 ^= (long) key[tail + 9] << 8;
-      case 9:
-        state._k2 ^= key[tail + 8];
-      case 8:
-        state._k1 ^= (long) key[tail + 7] << 56;
-      case 7:
-        state._k1 ^= (long) key[tail + 6] << 48;
-      case 6:
-        state._k1 ^= (long) key[tail + 5] << 40;
-      case 5:
-        state._k1 ^= (long) key[tail + 4] << 32;
-      case 4:
-        state._k1 ^= (long) key[tail + 3] << 24;
-      case 3:
-        state._k1 ^= (long) key[tail + 2] << 16;
-      case 2:
-        state._k1 ^= (long) key[tail + 1] << 8;
-      case 1:
-        state._k1 ^= key[tail + 0];
-        bmix(state);
-    }
-    // CHECKSTYLE:ON: checkstyle:coding
+  /**
+   * Hash a value using the x64 128 bit variant of MurmurHash3.
+   */
+  public static long[] murmurHash3X64Bit128AsLongs(byte[] key, int length, int 
seed) {
+    State state = murmurHash3X64(key, length, seed);
+    return new long[]{state._h1, state._h2};
+  }
 
-    state._h2 ^= key.length;
+  public static long[] murmurHash3X64Bit128AsLongs(byte[] key, int seed) {
+    return murmurHash3X64Bit128AsLongs(key, key.length, seed);
+  }
 
-    state._h1 += state._h2;
-    state._h2 += state._h1;
+  /**
+   * Hash a value using the x64 64 bit variant of murmurHash3.
+   */
+  public static long murmurHash3X64Bit64(byte[] key, int length, int seed) {
+    return murmurHash3X64(key, length, seed)._h1;
+  }
 
-    state._h1 = fmix(state._h1);
-    state._h2 = fmix(state._h2);
+  public static long murmurHash3X64Bit64(byte[] key, int seed) {
+    return murmurHash3X64Bit64(key, key.length, seed);
+  }
 
-    state._h1 += state._h2;
-    state._h2 += state._h1;
+  /**
+   * Hash a value using the x64 32 bit variant of murmurHash3.
+   */
+  public static int murmurHash3X64Bit32(byte[] key, int length, int seed) {
+    return (int) (murmurHash3X64(key, length, seed)._h1 >>> 32);
+  }
 
-    ByteBuffer buffer = ByteBuffer.allocate(16);
-    buffer.putLong(state._h1);
-    buffer.putLong(state._h2);
-    return buffer.array();
+  public static int murmurHash3X64Bit32(byte[] key, int seed) {
+    return murmurHash3X64Bit32(key, key.length, seed);
   }
 
   /**
-   * Hash a value using the x64 64 bit variant of murmurHash3
-   *
-   * @param key value to hash
-   * @param seed random value
-   * @return 64 bit hashed key
+   * Taken and modified from
+   * <a 
href="https://github.com/infinispan/infinispan/blob/main/commons/all/src/main/java/org/infinispan/commons/hash/
+   * MurmurHash3.java">Infinispan code base</a>.
    */
-  public static long murmurHash3X64Bit64(final byte[] key, final int seed) {
-    // Exactly the same as murmurHash3X64Bit128, except it only returns 
state.h1
+  private static State murmurHash3X64(byte[] key, int length, int seed) {
     State state = new State();
 
     state._h1 = 0x9368e53c2f6af274L ^ seed;
@@ -375,9 +341,10 @@ public class MurmurHashFunctions {
     state._c1 = 0x87c37b91114253d5L;
     state._c2 = 0x4cf5ad432745937fL;
 
-    for (int i = 0; i < key.length / 16; i++) {
-      state._k1 = getblock(key, i * 2 * 8);
-      state._k2 = getblock(key, (i * 2 + 1) * 8);
+    int end = length - 15;
+    for (int i = 0; i < end; i += 16) {
+      state._k1 = getblock(key, i);
+      state._k2 = getblock(key, i + 8);
 
       bmix(state);
     }
@@ -385,9 +352,9 @@ public class MurmurHashFunctions {
     state._k1 = 0;
     state._k2 = 0;
 
-    int tail = (key.length >>> 4) << 4;
+    int tail = length & 0xFFFFFFF0;
     // CHECKSTYLE:OFF: checkstyle:coding
-    switch (key.length & 15) {
+    switch (length & 15) {
       case 15:
         state._k2 ^= (long) key[tail + 14] << 48;
       case 14:
@@ -422,7 +389,7 @@ public class MurmurHashFunctions {
     }
     // CHECKSTYLE:ON: checkstyle:coding
 
-    state._h2 ^= key.length;
+    state._h2 ^= length;
 
     state._h1 += state._h2;
     state._h2 += state._h1;
@@ -433,18 +400,7 @@ public class MurmurHashFunctions {
     state._h1 += state._h2;
     state._h2 += state._h1;
 
-    return state._h1;
-  }
-
-  /**
-   * Hash a value using the x64 32 bit variant of murmurHash3
-   *
-   * @param key value to hash
-   * @param seed random value
-   * @return 32 bit hashed key
-   */
-  public static int murmurHash3X64Bit32(final byte[] key, final int seed) {
-    return (int) (murmurHash3X64Bit64(key, seed) >>> 32);
+    return state;
   }
 
   private static void addByte(State state, byte b, int len) {
@@ -462,8 +418,8 @@ public class MurmurHashFunctions {
     }
   }
 
-  static long getblock(byte[] key, int i) {
-    return ((key[i + 0] & 0x00000000000000FFL)) | ((key[i + 1] & 
0x00000000000000FFL) << 8) | (
+  private static long getblock(byte[] key, int i) {
+    return (key[i] & 0x00000000000000FFL) | ((key[i + 1] & 
0x00000000000000FFL) << 8) | (
         (key[i + 2] & 0x00000000000000FFL) << 16) | ((key[i + 3] & 
0x00000000000000FFL) << 24) | (
         (key[i + 4] & 0x00000000000000FFL) << 32) | ((key[i + 5] & 
0x00000000000000FFL) << 40) | (
         (key[i + 6] & 0x00000000000000FFL) << 48) | ((key[i + 7] & 
0x00000000000000FFL) << 56);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to