This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch clucene-2.0
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push:
new d3a628663ad [improve](pfor) add non-simd implementation for PFOR 256
(#297)
d3a628663ad is described below
commit d3a628663ad07a38568f07deb04f5bc07fcc5869
Author: airborne12 <[email protected]>
AuthorDate: Sun Mar 23 22:17:26 2025 +0800
[improve](pfor) add non-simd implementation for PFOR 256 (#297)
* [improve](pfor) add non-simd implementation for PFOR 256
* [improve](pfor) add non-simd implementation for PFOR 256
* [improve](pfor) add non-simd implementation for PFOR 256
* add zigzag scalar function
* add unitest for pfor encode/decode
* add unitest for pfor encode/decode
* add unitest for pfor encode/decode
* add unitest for pfor encode/decode
* add unitest for pfor encode/decode
---
CMakeLists.txt | 27 +-
cmake/Toolchain-aarch64.cmake | 5 +
src/core/CLucene/index/CodeMode.h | 4 +-
src/core/CLucene/index/FieldInfos.cpp | 56 +-
src/core/CLucene/index/SDocumentWriter.cpp | 20 +-
src/core/CLucene/index/SegmentTermDocs.cpp | 43 +-
src/core/CLucene/index/_FieldInfos.h | 9 +-
src/core/CLucene/index/_SegmentHeader.h | 6 +-
src/core/CLucene/util/PFORUtil.cpp | 190 +++-
src/core/CLucene/util/PFORUtil.h | 10 +-
src/ext/for/CMakeLists.txt | 16 +
src/ext/for/bitpack.h | 17 +
src/ext/for/bitunpack.c | 1202 ++++++++++++++++++++
src/ext/for/test_bitd1unpack.cpp | 399 +++++++
src/ext/for/vp4.h | 2 +
src/ext/for/vp4d.c | 30 +
src/test/CMakeLists.txt | 3 +-
.../pfor_p4ndx_compat_gen_by_old_version_arm.dat | Bin 0 -> 1168 bytes
...pfor_p4ndx_compat_gen_by_old_version_x86_64.dat | Bin 0 -> 1164 bytes
src/test/store/testPFOR.cpp | 546 +++++++++
src/test/test.h | 2 +-
src/test/tests.cpp | 3 +-
22 files changed, 2494 insertions(+), 96 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3b77861857..44a673a9e4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,8 +19,6 @@ SET(CLUCENE_VERSION
"${CLUCENE_VERSION_MAJOR}.${CLUCENE_VERSION_MINOR}.${CLUCENE
#CMake 2.6+ is recommended to an improved Boost module
CMAKE_MINIMUM_REQUIRED(VERSION 2.4.0 FATAL_ERROR)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
-
if(COMMAND cmake_policy)
cmake_policy(SET CMP0003 NEW)
cmake_policy(SET CMP0043 NEW)
@@ -135,9 +133,30 @@ elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_LSAN}")
endif()
-if (USE_AVX2)
- SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX2")
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64")
+ set (ARCH_AMD64 1)
+endif ()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*)")
+ set (ARCH_AARCH64 1)
+endif ()
+if (ARCH_AARCH64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+ set (ARCH_ARM 1)
+endif ()
+if (ARCH_AMD64)
+ if (USE_SSE4_2)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+ endif()
+ message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+ if (USE_AVX2)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DUSE_AVX2")
+ endif()
endif()
+
+if (ARCH_ARM)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+
if (__COMPILER_CLANG)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-narrowing -g
-fno-omit-frame-pointer")
else ()
diff --git a/cmake/Toolchain-aarch64.cmake b/cmake/Toolchain-aarch64.cmake
new file mode 100644
index 00000000000..948164d513f
--- /dev/null
+++ b/cmake/Toolchain-aarch64.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
diff --git a/src/core/CLucene/index/CodeMode.h
b/src/core/CLucene/index/CodeMode.h
index 3c39e94ecb6..05dd8b82649 100644
--- a/src/core/CLucene/index/CodeMode.h
+++ b/src/core/CLucene/index/CodeMode.h
@@ -5,7 +5,9 @@ CL_NS_DEF(index)
enum class CodeMode {
kDefault = 0,
kPfor = 1,
- kRange = 2
+ kRange = 2,
+ kPfor256 = 3,
+ kPfor128 = 4
};
CL_NS_END
\ No newline at end of file
diff --git a/src/core/CLucene/index/FieldInfos.cpp
b/src/core/CLucene/index/FieldInfos.cpp
index 00e0c4275a5..155c14e945f 100644
--- a/src/core/CLucene/index/FieldInfos.cpp
+++ b/src/core/CLucene/index/FieldInfos.cpp
@@ -21,24 +21,20 @@ CL_NS_USE(document)
CL_NS_USE(util)
CL_NS_DEF(index)
-
-FieldInfo::FieldInfo(const TCHAR *_fieldName,
- const bool _isIndexed,
- const int32_t _fieldNumber,
- const bool _storeTermVector,
- const bool _storeOffsetWithTermVector,
- const bool _storePositionWithTermVector,
- const bool _omitNorms,
-
const bool _hasProx,
- const bool _storePayloads) :
name(CLStringIntern::intern(_fieldName )),
- isIndexed(_isIndexed),
- number(_fieldNumber),
-
storeTermVector(_storeTermVector),
-
storeOffsetWithTermVector(_storeOffsetWithTermVector),
-
storePositionWithTermVector(_storePositionWithTermVector),
- omitNorms(_omitNorms),
hasProx(_hasProx),
-
storePayloads(_storePayloads) {
-}
+FieldInfo::FieldInfo(const TCHAR* _fieldName, const bool _isIndexed, const
int32_t _fieldNumber,
+ const bool _storeTermVector, const bool
_storeOffsetWithTermVector,
+ const bool _storePositionWithTermVector, const bool
_omitNorms,
+ const bool _hasProx, const bool _storePayloads, const
bool _compatibleRead)
+ : name(CLStringIntern::intern(_fieldName)),
+ isIndexed(_isIndexed),
+ number(_fieldNumber),
+ storeTermVector(_storeTermVector),
+ storeOffsetWithTermVector(_storeOffsetWithTermVector),
+ storePositionWithTermVector(_storePositionWithTermVector),
+ omitNorms(_omitNorms),
+ hasProx(_hasProx),
+ storePayloads(_storePayloads),
+ compatibleRead(_compatibleRead) {}
FieldInfo::~FieldInfo(){
CL_NS(util)::CLStringIntern::unintern(name);
@@ -46,7 +42,7 @@ FieldInfo::~FieldInfo(){
FieldInfo* FieldInfo::clone() {
return _CLNEW FieldInfo(name, isIndexed, number, storeTermVector,
storePositionWithTermVector,
- storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
+ storeOffsetWithTermVector, omitNorms, hasProx, storePayloads,
compatibleRead);
}
FieldInfos::FieldInfos():
@@ -103,6 +99,17 @@ bool FieldInfos::hasProx() {
return false;
}
+bool FieldInfos::compatibleRead() {
+ int numFields = byNumber.size();
+ for (int i = 0; i < numFields; i++) {
+ FieldInfo* fi = fieldInfo(i);
+ if (fi->compatibleRead) {
+ return true;
+ }
+ }
+ return false;
+}
+
IndexVersion FieldInfos::getIndexVersion() {
int numFields = byNumber.size();
for (int i = 0; i < numFields; i++) {
@@ -137,11 +144,11 @@ void FieldInfos::add(const TCHAR** names, const bool
isIndexed, const bool store
FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool
storeTermVector,
const bool storePositionWithTermVector,
const bool storeOffsetWithTermVector, const bool
omitNorms,
- const bool hasProx, const bool storePayloads) {
+ const bool hasProx, const bool storePayloads, const
bool compatibleRead) {
FieldInfo* fi = fieldInfo(name);
if (fi == NULL) {
return addInternal(name, isIndexed, storeTermVector,
storePositionWithTermVector,
-
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
+
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads,
compatibleRead);
} else {
if (fi->isIndexed != isIndexed) {
fi->isIndexed = true; // once
indexed, always index
@@ -164,6 +171,9 @@ FieldInfo* FieldInfos::add(const TCHAR* name, const bool
isIndexed, const bool s
if (fi->storePayloads != storePayloads) {
fi->storePayloads = true;
}
+ if (fi->compatibleRead != compatibleRead) {
+ fi->compatibleRead = compatibleRead;
+ }
}
return fi;
}
@@ -172,10 +182,10 @@ FieldInfo* FieldInfos::addInternal(const TCHAR* name,
const bool isIndexed,
const bool storeTermVector,
const bool storePositionWithTermVector,
const bool storeOffsetWithTermVector, const
bool omitNorms,
- const bool hasProx, const bool
storePayloads) {
+ const bool hasProx, const bool
storePayloads, const bool compatibleRead) {
FieldInfo* fi = _CLNEW FieldInfo(name, isIndexed, byNumber.size(),
storeTermVector,
storePositionWithTermVector, storeOffsetWithTermVector,
-
omitNorms,
hasProx, storePayloads);
+
omitNorms,
hasProx, storePayloads, compatibleRead);
byNumber.push_back(fi);
byName.put( fi->name, fi);
return fi;
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp
b/src/core/CLucene/index/SDocumentWriter.cpp
index 2b85fe5bbca..8d5df79a447 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -1198,31 +1198,13 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
skipListWriter->resetSkip();
- auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool
isDoc) {
- std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE);
- size_t size = 0;
- if (isDoc) {
- size = P4ENC(buffer.data(), buffer.size(), compress.data());
- } else {
- size = P4NZENC(buffer.data(), buffer.size(), compress.data());
- }
- out->writeVInt(size);
- out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()),
size);
- buffer.resize(0);
- };
-
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
while (numToMerge > 0) {
if ((++df % skipInterval) == 0) {
- freqOut->writeByte((char)CodeMode::kPfor);
- freqOut->writeVInt(docDeltaBuffer.size());
- encode(freqOut, docDeltaBuffer, true);
- if (hasProx_) {
- encode(freqOut, freqBuffer, false);
- }
+ pfor_encode(freqOut, docDeltaBuffer, freqBuffer, hasProx_);
skipListWriter->setSkipData(lastDoc,
currentFieldStorePayloads, lastPayloadLength);
skipListWriter->bufferSkip(df);
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index e346dc0ca24..ae9e3a4508f 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -22,7 +22,7 @@ SegmentTermDocs::SegmentTermDocs(const SegmentReader
*_parent) : parent(_parent)
count(0),
df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0),
skipInterval(_parent->tis->getSkipInterval()),
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL),
freqBasePointer(0), proxBasePointer(0),
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0),
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
-
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx,
indexVersion_) {
+
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx,
indexVersion_, _parent->_fieldInfos->compatibleRead()) {
CND_CONDITION(_parent != NULL, "Parent is NULL");
memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -222,13 +222,13 @@ int32_t TermDocsBuffer::refillV0() {
uint32_t SerializedSize = freqStream_->readVInt();
std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
freqStream_->readBytes(buf.data(), SerializedSize);
- P4DEC(buf.data(), arraySize, docs_.data());
+ util::P4DEC(buf.data(), arraySize, docs_.data());
}
{
uint32_t SerializedSize = freqStream_->readVInt();
std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
freqStream_->readBytes(buf.data(), SerializedSize);
- P4NZDEC(buf.data(), arraySize, freqs_.data());
+ util::P4NZDEC(buf.data(), arraySize, freqs_.data());
}
} else if (mode == (char)CodeMode::kDefault) {
uint32_t docDelta = 0;
@@ -258,7 +258,7 @@ int32_t TermDocsBuffer::refillV0() {
uint32_t serializedSize = freqStream_->readVInt();
std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE);
freqStream_->readBytes(buf.data(), serializedSize);
- P4DEC(buf.data(), arraySize, docs_.data());
+ util::P4DEC(buf.data(), arraySize, docs_.data());
}
}
return arraySize;
@@ -266,40 +266,7 @@ int32_t TermDocsBuffer::refillV0() {
}
int32_t TermDocsBuffer::refillV1() {
- char mode = freqStream_->readByte();
- uint32_t arraySize = freqStream_->readVInt();
- if (mode == (char)CodeMode::kPfor) {
- {
- uint32_t SerializedSize = freqStream_->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream_->readBytes(buf.data(), SerializedSize);
- P4DEC(buf.data(), arraySize, docs_.data());
- }
- if (hasProx_) {
- uint32_t SerializedSize = freqStream_->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream_->readBytes(buf.data(), SerializedSize);
- P4NZDEC(buf.data(), arraySize, freqs_.data());
- }
- } else if (mode == (char)CodeMode::kDefault) {
- uint32_t docDelta = 0;
- for (uint32_t i = 0; i < arraySize; i++) {
- uint32_t docCode = freqStream_->readVInt();
- if (hasProx_) {
- docDelta += (docCode >> 1);
- docs_[i] = docDelta;
- if ((docCode & 1) != 0) {
- freqs_[i] = 1;
- } else {
- freqs_[i] = freqStream_->readVInt();
- }
- } else {
- docDelta += docCode;
- docs_[i] = docDelta;
- }
- }
- }
- return arraySize;
+ return pfor_decode(freqStream_, docs_, freqs_, hasProx_, compatibleRead_);
}
CL_NS_END
diff --git a/src/core/CLucene/index/_FieldInfos.h
b/src/core/CLucene/index/_FieldInfos.h
index ed142c4435c..f80388bb73d 100644
--- a/src/core/CLucene/index/_FieldInfos.h
+++ b/src/core/CLucene/index/_FieldInfos.h
@@ -38,6 +38,7 @@ class FieldInfo :LUCENE_BASE{
IndexVersion indexVersion_ = IndexVersion::kV1;
bool storePayloads; // whether this field stores payloads together with
term positions
+ bool compatibleRead; // whether the index docid list is read in cross-platform
mode (e.g. x86 and arm64)
//Func - Constructor
// Initialises FieldInfo.
@@ -59,7 +60,8 @@ class FieldInfo :LUCENE_BASE{
const bool storePositionWithTermVector,
const bool omitNorms,
const bool hasProx,
- const bool storePayloads);
+ const bool storePayloads,
+ const bool compatibleRead);
//Func - Destructor
//Pre - true
@@ -133,6 +135,7 @@ public:
void addIndexed(const TCHAR** names, const bool storeTermVectors, const
bool storePositionWithTermVector, const bool storeOffsetWithTermVector);
bool hasProx();
+ bool compatibleRead();
IndexVersion getIndexVersion();
/**
@@ -167,13 +170,13 @@ public:
FieldInfo* add(const TCHAR* name, const bool isIndexed, const bool
storeTermVector = false,
const
bool storePositionWithTermVector = false,
const
bool storeOffsetWithTermVector = false, const bool omitNorms = false,
- const
bool hasProx = false, const bool storePayloads = false);
+ const
bool hasProx = false, const bool storePayloads = false, const bool
compatibleRead = false);
// was void
FieldInfo* addInternal(const TCHAR* name, const bool isIndexed, const
bool storeTermVector,
const bool storePositionWithTermVector,
const bool storeOffsetWithTermVector, const bool
omitNorms,
-
const bool hasProx, const bool storePayloads);
+
const bool hasProx, const bool storePayloads, const
bool compatibleRead = false);
int32_t fieldNumber(const TCHAR* fieldName)const;
diff --git a/src/core/CLucene/index/_SegmentHeader.h
b/src/core/CLucene/index/_SegmentHeader.h
index c1f01e7cecb..8087423953b 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -32,12 +32,13 @@ class SegmentReader;
class TermDocsBuffer {
public:
- TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx,
IndexVersion indexVersion)
+ TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx,
IndexVersion indexVersion, bool compatibleRead)
: docs_(PFOR_BLOCK_SIZE + 3),
freqs_(PFOR_BLOCK_SIZE + 3),
freqStream_(freqStream),
hasProx_(hasProx),
- indexVersion_(indexVersion) {
+ indexVersion_(indexVersion),
+ compatibleRead_(compatibleRead) {
}
~TermDocsBuffer() {
@@ -83,6 +84,7 @@ private:
CL_NS(store)::IndexInput* freqStream_ = nullptr;
bool hasProx_ = false;
+ bool compatibleRead_ = false;
IndexVersion indexVersion_ = IndexVersion::kV0;
};
diff --git a/src/core/CLucene/util/PFORUtil.cpp
b/src/core/CLucene/util/PFORUtil.cpp
index ae27f521553..d241a305acf 100644
--- a/src/core/CLucene/util/PFORUtil.cpp
+++ b/src/core/CLucene/util/PFORUtil.cpp
@@ -15,19 +15,20 @@
// specific language governing permissions and limitations
// under the License.
#include "PFORUtil.h"
+#include "CLucene/debug/error.h"
+#include "CLucene/index/CodeMode.h"
#include "vp4.h"
#if (defined(__i386) || defined(__x86_64__))
#include <cpuid.h>
#endif
-namespace {
+CL_NS_DEF(util)
using DEC_FUNC = size_t (*)(unsigned char *__restrict, size_t, uint32_t
*__restrict);
using ENC_FUNC = size_t (*)(uint32_t *__restrict in, size_t n, unsigned char
*__restrict out);
DEC_FUNC g_p4nd1dec;
DEC_FUNC g_p4nzdec;
ENC_FUNC g_p4nd1enc;
ENC_FUNC g_p4nzenc;
-} // anonymous namespace
size_t DefaultDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict
out) {
size_t bufferSize = 0;
@@ -129,3 +130,188 @@ size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned
char *__restrict out) {
size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict
out) {
return g_p4nzenc(in, n, out);
}
+void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>&
docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox) {
+#ifdef __AVX2__
+ out->writeByte((char)index::CodeMode::kPfor256);
+ out->writeVInt(docDeltaBuffer.size());
+ std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE);
+ size_t size = 0;
+ size = p4nd1enc256v32(docDeltaBuffer.data(), docDeltaBuffer.size(),
compress.data());
+ out->writeVInt(size);
+ out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+ if (has_prox) {
+ size = p4nzenc256v32(freqBuffer.data(), freqBuffer.size(),
compress.data());
+ out->writeVInt(size);
+ out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()),
size);
+ }
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+ out->writeByte((char)index::CodeMode::kPfor128);
+ out->writeVInt(docDeltaBuffer.size());
+ std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE);
+ size_t size = 0;
+ size = p4nd1enc32(docDeltaBuffer.data(), docDeltaBuffer.size(),
compress.data());
+ out->writeVInt(size);
+ out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+ if (has_prox) {
+ size = p4nzenc32(freqBuffer.data(), freqBuffer.size(),
compress.data());
+ out->writeVInt(size);
+ out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()),
size);
+ }
+#else
+ out->writeByte((char)index::CodeMode::kDefault);
+ out->writeVInt(docDeltaBuffer.size());
+ uint32_t lastDoc = 0;
+ for (int32_t i = 0; i < docDeltaBuffer.size(); i++) {
+ uint32_t curDoc = docDeltaBuffer[i];
+ if (has_prox) {
+ uint32_t newDocCode = (curDoc - lastDoc) << 1;
+ lastDoc = curDoc;
+ uint32_t freq = freqBuffer[i];
+ if (1 == freq) {
+ out->writeVInt(newDocCode | 1);
+ } else {
+ out->writeVInt(newDocCode);
+ out->writeVInt(freq);
+ }
+ } else {
+ out->writeVInt(curDoc - lastDoc);
+ lastDoc = curDoc;
+ }
+ }
+#endif
+ docDeltaBuffer.resize(0);
+ freqBuffer.resize(0);
+}
+
+uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs,
std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead) {
+ char mode = in->readByte();
+ uint32_t arraySize = in->readVInt();
+ // old version (kPfor): the decoder has to be selected based on compatibleRead
+ if (mode == (char)index::CodeMode::kPfor) {
+ {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ // If compatibleRead is true, we are reading an old-version
arm64 index on an x86_64 platform.
+ if (compatibleRead) {
+ p4nd1dec32(buf.data(), arraySize, docs.data());
+ } else {
+ p4nd1dec256v32(buf.data(), arraySize, docs.data());
+ }
+#elif (defined(__ARM_NEON))
+ // If compatibleRead is true, we are reading an old-version
x86_64 index on an arm64 platform.
+ if (compatibleRead) {
+ p4nd1dec256scalarv32(buf.data(), arraySize, docs.data());
+ } else {
+ p4nd1dec32(buf.data(), arraySize, docs.data());
+ }
+#elif (defined(__SSSE3__))
+ // If compatibleRead is true, we are reading an old-version
x86_64 index on an x86_64 platform that does not support AVX2.
+ if (compatibleRead) {
+ p4nd1dec256scalarv32(buf.data(), arraySize, docs.data());
+ } else {
+ DefaultDDEC(buf.data(), arraySize, docs.data());
+ }
+#else
+ DefaultDDEC(buf.data(), arraySize, docs.data());
+#endif
+ }
+ if (has_prox) {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ // If compatibleRead is true, we are reading an old-version
arm64 index on an x86_64 platform.
+ if (compatibleRead) {
+ p4nzdec32(buf.data(), arraySize, freqs.data());
+ } else {
+ p4nzdec256v32(buf.data(), arraySize, freqs.data());
+ }
+#elif (defined(__ARM_NEON))
+ // If compatibleRead is true, we are reading an old-version
x86_64 index on an arm64 platform.
+ if (compatibleRead) {
+ p4nzdec256scalarv32(buf.data(), arraySize, freqs.data());
+ } else {
+ p4nzdec32(buf.data(), arraySize, freqs.data());
+ }
+#elif (defined(__SSSE3__))
+ // If compatibleRead is true, we are reading an old-version
x86_64 index on an x86_64 platform that does not support AVX2.
+ if (compatibleRead) {
+ p4nzdec256scalarv32(buf.data(), arraySize, freqs.data());
+ } else {
+ DefaultDEC(buf.data(), arraySize, freqs.data());
+ }
+#else
+ DefaultDEC(buf.data(), arraySize, freqs.data());
+#endif
+ }
+ } else if (mode == (char)index::CodeMode::kDefault) {
+ uint32_t docDelta = 0;
+ for (uint32_t i = 0; i < arraySize; i++) {
+ uint32_t docCode = in->readVInt();
+ if (has_prox) {
+ docDelta += (docCode >> 1);
+ docs[i] = docDelta;
+ if ((docCode & 1) != 0) {
+ freqs[i] = 1;
+ } else {
+ freqs[i] = in->readVInt();
+ }
+ } else {
+ docDelta += docCode;
+ docs[i] = docDelta;
+ }
+ }
+ } else if (mode == (char)index::CodeMode::kPfor256) {
+ // new version (kPfor256): decoded without consulting compatibleRead
+ {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ p4nd1dec256v32(buf.data(), arraySize, docs.data());
+#else
+ _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this
platform");
+#endif
+ }
+ if (has_prox) {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ p4nzdec256v32(buf.data(), arraySize, freqs.data());
+#else
+ _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this
platform");
+#endif
+ }
+ } else if (mode == (char)index::CodeMode::kPfor128) {
+ // new version (kPfor128): decoded without consulting compatibleRead
+ {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ p4nd1dec32(buf.data(), arraySize, docs.data());
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+ p4nd1dec32(buf.data(), arraySize, docs.data());
+#else
+ _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this
platform");
+#endif
+ }
+ if (has_prox) {
+ uint32_t SerializedSize = in->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+ p4nzdec32(buf.data(), arraySize, freqs.data());
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+ p4nzdec32(buf.data(), arraySize, freqs.data());
+#else
+ _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this
platform");
+#endif
+ }
+ }
+ return arraySize;
+}
+CL_NS_END
diff --git a/src/core/CLucene/util/PFORUtil.h b/src/core/CLucene/util/PFORUtil.h
index 29acb7fe7a6..bf44cb1bc23 100644
--- a/src/core/CLucene/util/PFORUtil.h
+++ b/src/core/CLucene/util/PFORUtil.h
@@ -18,9 +18,17 @@
#include <cstddef>
#include <cstdint>
+#include "CLucene/SharedHeader.h"
+#include "CLucene/CLConfig.h"
+#include "CLucene/store/IndexOutput.h"
+#include "CLucene/store/IndexInput.h"
+#include <vector>
+CL_NS_DEF(util)
size_t P4DEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t P4NZDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict
out);
size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict
out);
-
+void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>&
docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox);
+uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs,
std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead);
+CL_NS_END
diff --git a/src/ext/for/CMakeLists.txt b/src/ext/for/CMakeLists.txt
index 3b14781f69e..9c139a6f2a6 100644
--- a/src/ext/for/CMakeLists.txt
+++ b/src/ext/for/CMakeLists.txt
@@ -84,8 +84,24 @@ foreach(SRC_FILE ${SRC_FILES})
endif()
endforeach()
+add_executable(test_bitd1unpack test_bitd1unpack.cpp)
+
+target_link_libraries(test_bitd1unpack ic)
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+ target_compile_options(test_bitd1unpack PRIVATE ${AVX2} -DAVX2_ON ${DEBUG})
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+ target_compile_options(test_bitd1unpack PRIVATE -march=armv8-a ${DEBUG})
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+ target_compile_options(test_bitd1unpack PRIVATE -mcpu=power9 -mtune=power9
-D__SSSE3__ ${DEBUG})
+endif()
+
set(LIB_DESTINATION ../)
install(TARGETS ic
DESTINATION ${LIB_DESTINATION}
COMPONENT ext)
+
+install(TARGETS test_bitd1unpack
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+ COMPONENT tests)
diff --git a/src/ext/for/bitpack.h b/src/ext/for/bitpack.h
index b0b9e02275a..b8f91ad5690 100644
--- a/src/ext/for/bitpack.h
+++ b/src/ext/for/bitpack.h
@@ -30,6 +30,7 @@
#include <stdint.h>
#endif
#include <stddef.h>
+#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
@@ -271,6 +272,7 @@ unsigned char *bitunpack128v64( const unsigned char
*__restrict in, unsigned n,
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzunpack256scalarv32( const unsigned char *__restrict in,
unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -299,10 +301,25 @@ unsigned char *_bitd1unpack128h32(const unsigned char
*__restrict in, unsigned n
unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned
char *bb);
unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned
n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned
char *bb);
+unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in,
unsigned n,
+ unsigned* __restrict out, unsigned start,
unsigned b);
+unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in,
unsigned n,
+ unsigned* __restrict out, unsigned start,
unsigned b,
+ unsigned* __restrict pex, unsigned char* bb);
+unsigned char *bitunpack256scalarv32(const unsigned char *__restrict in,
unsigned n,
+ unsigned *__restrict out, unsigned b);
+unsigned char *_bitunpack256scalarv32(const unsigned char *__restrict in,
+ unsigned n,
+ unsigned *__restrict out,
+ unsigned b,
+ unsigned *__restrict pex,
+ unsigned char *bb,
+ bool isZigZag);
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned
char *bb);
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict
pex, unsigned char *bb);
unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict
pex, unsigned char *bb);
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict
pex, unsigned char *bb);
+unsigned char *_bitzunpack256scalarv32( const unsigned char *__restrict in,
unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned
*__restrict pex, unsigned char *bb);
#ifdef __cplusplus
}
#endif
diff --git a/src/ext/for/bitunpack.c b/src/ext/for/bitunpack.c
index 1dd78003ada..26817ac55bc 100644
--- a/src/ext/for/bitunpack.c
+++ b/src/ext/for/bitunpack.c
@@ -28,6 +28,9 @@
#include "bitutil.h"
#include "bitpack.h"
#include "vint.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
#define PAD8(_x_) (((_x_)+7)/8)
@@ -690,6 +693,1205 @@ unsigned char *bitunpack256w32( const unsigned char
*__restrict in, unsigned n,
BITUNPACK128V32(in, b, out, sv);
return (unsigned char *)_in+PAD8(256*b);
}
+static void applyException_8bits(uint8_t xm8, uint32_t** pPEX, int nb,
uint32_t ov[8]) {
+ uint32_t* ex = *pPEX;
+ for (int j = 0; j < 8; j++) {
+ if ((xm8 >> j) & 1) {
+ ov[j] += (ex[0] << nb);
+ ex++;
+ }
+ }
+ *pPEX = ex;
+}
+static inline uint32_t zigzagDecode_scalar(uint32_t x) {
+ // (x>>1) ^ -((x & 1) )
+ return (x >> 1) ^ -(x & 1);
+}
+static void bitunblk256v32_scalar_template(uint32_t** pIn, uint32_t** pOut,
int expansions_count,
+ const uint8_t* SHIFT_HI, const
uint8_t* SHIFT_LO,
+ const uint8_t* READ_FLAG, uint32_t
mask, int nb,
+ bool isZigZag) {
+ const uint32_t* oldp = NULL; // pointer to current block data
+ uint32_t ov[8], tmp[8];
+
+ for (int k = 0; k < expansions_count; k++) {
+ if (k == 0) {
+ // Step 0: Load input block and directly take the lower nb bits
+ oldp = *pIn;
+ *pIn += 8;
+ for (int j = 0; j < 8; j++) {
+ ov[j] = oldp[j] & mask;
+ }
+ } else {
+ // First right shift the current block data by SHIFT_HI[k]
+ for (int j = 0; j < 8; j++) {
+ ov[j] = oldp[j] >> SHIFT_HI[k];
+ }
+ if (READ_FLAG[k]) {
+ // Need to load a new block: left shift the new block data by
SHIFT_LO[k], then merge with ov
+ const uint32_t* newp = *pIn;
+ *pIn += 8;
+ for (int j = 0; j < 8; j++) {
+ uint32_t part_lo = (newp[j] << SHIFT_LO[k]) & mask;
+ ov[j] |= part_lo;
+ }
+ // Update current block pointer
+ oldp = newp;
+ } else {
+ // No need to load a new block, ensure the result is within mask
range
+ for (int j = 0; j < 8; j++) {
+ ov[j] &= mask;
+ }
+ }
+ }
+ // Write out the current 8 results
+ uint32_t* outp = *pOut;
+ for (int j = 0; j < 8; j++) {
+ if (isZigZag) {
+ outp[j] = zigzagDecode_scalar(ov[j]);
+ } else {
+ outp[j] = ov[j];
+ }
+ }
+ *pOut += 8;
+ }
+}
+/**
+ * Scalar unpack kernel with exception patching.
+ *
+ * Same stepping scheme as the plain kernel: `expansions_count` steps of 8 lanes each,
+ * driven by per-step tables.
+ *
+ * Parameters:
+ *  - expansions_count: number of steps to run (8 values are written per step)
+ *  - SHIFT_HI[k]: right shift applied to the current block in step k (k > 0)
+ *  - SHIFT_LO[k]: left shift applied to the freshly loaded block in step k
+ *  - READ_FLAG[k]: non-zero when step k has to load a new 8-word block
+ *  - mask: bit mask applied to every decoded value
+ *  - nb: base bit width, forwarded to applyException_8bits
+ *  - pBB: in/out cursor over bitmap bytes; exactly one byte is consumed per step
+ *  - pPEX: exception cursor handed to applyException_8bits
+ *          (presumably advanced there per patched lane - confirm against that helper)
+ *  - isZigZag: when true every value goes through zigzagDecode_scalar before it is stored
+ */
+static void bitunblk256v32_scalarBlock_ex_template(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                                   unsigned char** pBB, int expansions_count,
+                                                   const uint8_t* SHIFT_HI, const uint8_t* SHIFT_LO,
+                                                   const uint8_t* READ_FLAG, uint32_t mask, int nb,
+                                                   bool isZigZag) {
+    const uint32_t* cur = NULL; // block whose remaining bits are still being consumed
+    int step;
+    int lane;
+
+    for (step = 0; step < expansions_count; ++step) {
+        uint32_t vals[8];
+
+        if (step == 0) {
+            // Initial load: take the masked low bits of the first block.
+            cur = *pIn;
+            *pIn += 8;
+            for (lane = 0; lane < 8; ++lane) {
+                vals[lane] = cur[lane] & mask;
+            }
+        } else if (READ_FLAG[step]) {
+            // Combine the leftover high bits with the low bits of the next block.
+            const uint8_t down = SHIFT_HI[step];
+            const uint8_t up = SHIFT_LO[step];
+            const uint32_t* next = *pIn;
+            *pIn += 8;
+            for (lane = 0; lane < 8; ++lane) {
+                vals[lane] = (cur[lane] >> down) | ((next[lane] << up) & mask);
+            }
+            cur = next; // the new block is the leftover for the following steps
+        } else {
+            // Everything needed is still inside the current block.
+            const uint8_t down = SHIFT_HI[step];
+            for (lane = 0; lane < 8; ++lane) {
+                vals[lane] = (cur[lane] >> down) & mask;
+            }
+        }
+
+        // One bitmap byte per step selects the lanes handed to the exception helper.
+        {
+            const uint8_t bitmap = **pBB;
+            ++(*pBB);
+            applyException_8bits(bitmap, pPEX, nb, vals);
+        }
+
+        // Emit the 8 lanes of this step.
+        {
+            uint32_t* const dst = *pOut;
+            for (lane = 0; lane < 8; ++lane) {
+                dst[lane] = isZigZag ? zigzagDecode_scalar(vals[lane]) : vals[lane];
+            }
+            *pOut = dst + 8;
+        }
+    }
+}
+/* Zero-bit-width case: nothing is read from pIn. Emits 32 groups of 8 values that start
+ * out as zero; when a bitmap stream is supplied one byte is consumed per group, and a
+ * non-zero byte together with a non-NULL pPEX hands the group to applyException_8bits. */
+static void bitunpack256v32_0_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    uint32_t* dst = *pOut;
+    int remaining;
+    (void)pIn;
+
+    for (remaining = 32; remaining > 0; --remaining) {
+        uint32_t lanes[8] = {0};
+        uint8_t bitmap = 0;
+
+        // Without a bitmap stream every group is treated as having no exceptions.
+        if (pBB != NULL) {
+            bitmap = **pBB;
+            ++(*pBB);
+        }
+        if (bitmap != 0 && pPEX != NULL) {
+            applyException_8bits(bitmap, pPEX, 0, lanes);
+        }
+
+        for (int lane = 0; lane < 8; ++lane) {
+            dst[lane] = isZigZag ? zigzagDecode_scalar(lanes[lane]) : lanes[lane];
+        }
+        dst += 8;
+    }
+    *pOut = dst;
+}
+
+/* 1-bit values: a single 8-word block is loaded and then shifted through all 32 bit
+ * positions (32 steps x 8 lanes). The exception-aware kernel is used only when both
+ * pPEX and pBB are non-NULL. */
+static void bitunpack256v32_1_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 1, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  1,  2,  3,  4,  5,  6,  7,
+                                             8,  9,  10, 11, 12, 13, 14, 15,
+                                             16, 17, 18, 19, 20, 21, 22, 23,
+                                             24, 25, 26, 27, 28, 29, 30, 31};
+    static const uint8_t kShiftLo[kSteps] = {0}; // all zero: no value straddles two blocks
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 0, 0, 0,
+                                               0, 0, 0, 0, 0, 0, 0, 0,
+                                               0, 0, 0, 0, 0, 0, 0, 0,
+                                               0, 0, 0, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 2-bit values: each kernel pass runs 16 steps x 8 lanes from one 8-word block; two
+ * passes are made. The exception-aware kernel is used only when both pPEX and pBB are
+ * non-NULL. */
+static void bitunpack256v32_2_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 2, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  2,  4,  6,  8,  10, 12, 14,
+                                             16, 18, 20, 22, 24, 26, 28, 30};
+    static const uint8_t kShiftLo[kSteps] = {0}; // all zero: no value straddles two blocks
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 0, 0, 0,
+                                               0, 0, 0, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 3-bit values: one kernel pass of 32 steps x 8 lanes. Steps 10 and 21 straddle a block
+ * boundary and therefore load a new block. The exception-aware kernel is used only when
+ * both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_3_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 3, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  3,  6,  9,  12, 15, 18, 21,
+                                             24, 27, 30, 1,  4,  7,  10, 13,
+                                             16, 19, 22, 25, 28, 31, 2,  5,
+                                             8,  11, 14, 17, 20, 23, 26, 29};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 0, 0, 0, 0, 0,
+                                             0, 0, 2, 0, 0, 0, 0, 0,
+                                             0, 0, 0, 0, 0, 1, 0, 0,
+                                             0, 0, 0, 0, 0, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 0, 0, 0,
+                                               0, 0, 1, 0, 0, 0, 0, 0,
+                                               0, 0, 0, 0, 0, 1, 0, 0,
+                                               0, 0, 0, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 4-bit values: each kernel pass runs 8 steps x 8 lanes from one 8-word block; four
+ * passes are made. The exception-aware kernel is used only when both pPEX and pBB are
+ * non-NULL. */
+static void bitunpack256v32_4_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 4, kSteps = 8, kPasses = 4 };
+    static const uint8_t kShiftHi[kSteps] = {0, 4, 8, 12, 16, 20, 24, 28};
+    static const uint8_t kShiftLo[kSteps] = {0}; // all zero: no value straddles two blocks
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 5-bit values: one kernel pass of 32 steps x 8 lanes. Steps 6, 12, 19 and 25 straddle a
+ * block boundary and load a new block. The exception-aware kernel is used only when both
+ * pPEX and pBB are non-NULL. */
+static void bitunpack256v32_5_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 5, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  5,  10, 15, 20, 25, 30, 3,
+                                             8,  13, 18, 23, 28, 1,  6,  11,
+                                             16, 21, 26, 31, 4,  9,  14, 19,
+                                             24, 29, 2,  7,  12, 17, 22, 27};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 0, 0, 0, 2, 0,
+                                             0, 0, 0, 0, 4, 0, 0, 0,
+                                             0, 0, 0, 1, 0, 0, 0, 0,
+                                             0, 3, 0, 0, 0, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 0, 1, 0,
+                                               0, 0, 0, 0, 1, 0, 0, 0,
+                                               0, 0, 0, 1, 0, 0, 0, 0,
+                                               0, 1, 0, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1F
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+/* 6-bit values: each kernel pass runs 16 steps x 8 lanes (new blocks loaded at steps 5
+ * and 10); two passes are made. The exception-aware kernel is used only when both pPEX
+ * and pBB are non-NULL. */
+static void bitunpack256v32_6_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 6, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  6,  12, 18, 24, 30, 4,  10,
+                                             16, 22, 28, 2,  8,  14, 20, 26};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 0, 0, 2, 0, 0,
+                                             0, 0, 4, 0, 0, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 0, 1, 0, 0,
+                                               0, 0, 1, 0, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3F
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+/* 7-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_7_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 7, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  7,  14, 21, 28, 3,  10, 17,
+                                             24, 31, 6,  13, 20, 27, 2,  9,
+                                             16, 23, 30, 5,  12, 19, 26, 1,
+                                             8,  15, 22, 29, 4,  11, 18, 25};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 0, 4, 0, 0, 0,
+                                             0, 1, 0, 0, 0, 5, 0, 0,
+                                             0, 0, 2, 0, 0, 0, 6, 0,
+                                             0, 0, 0, 3, 0, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0, 1, 0, 0, 0,
+                                               0, 1, 0, 0, 0, 1, 0, 0,
+                                               0, 0, 1, 0, 0, 0, 1, 0,
+                                               0, 0, 0, 1, 0, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7F
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+/* 8-bit values: each kernel pass runs 4 steps x 8 lanes from one 8-word block; eight
+ * passes are made. The exception-aware kernel is used only when both pPEX and pBB are
+ * non-NULL. */
+static void bitunpack256v32_8_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 8, kSteps = 4, kPasses = 8 };
+    static const uint8_t kShiftHi[kSteps] = {0, 8, 16, 24};
+    static const uint8_t kShiftLo[kSteps] = {0}; // all zero: no value straddles two blocks
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 9-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_9_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 9, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  9,  18, 27, 4,  13, 22, 31,
+                                             8,  17, 26, 3,  12, 21, 30, 7,
+                                             16, 25, 2,  11, 20, 29, 6,  15,
+                                             24, 1,  10, 19, 28, 5,  14, 23};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 5, 0, 0, 0, 1,
+                                             0, 0, 6, 0, 0, 0, 2, 0,
+                                             0, 7, 0, 0, 0, 3, 0, 0,
+                                             8, 0, 0, 0, 4, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 1, 0, 0, 0, 1,
+                                               0, 0, 1, 0, 0, 0, 1, 0,
+                                               0, 1, 0, 0, 0, 1, 0, 0,
+                                               1, 0, 0, 0, 1, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1FF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+/* 10-bit values: each kernel pass runs 16 steps x 8 lanes; two passes are made. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_10_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 10, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  10, 20, 30, 8,  18, 28, 6,
+                                             16, 26, 4,  14, 24, 2,  12, 22};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 0, 2, 0, 0, 4, 0,
+                                             0, 6, 0, 0, 8, 0, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 0, 1, 0, 0, 1, 0,
+                                               0, 1, 0, 0, 1, 0, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3FF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 11-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_11_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 11, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  11, 22, 1,  12, 23, 2,  13,
+                                             24, 3,  14, 25, 4,  15, 26, 5,
+                                             16, 27, 6,  17, 28, 7,  18, 29,
+                                             8,  19, 30, 9,  20, 31, 10, 21};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 10, 0, 0, 9, 0, 0,
+                                             8, 0, 0,  7, 0, 0, 6, 0,
+                                             0, 5, 0,  0, 4, 0, 0, 3,
+                                             0, 0, 2,  0, 0, 1, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 1, 0, 0, 1, 0, 0,
+                                               1, 0, 0, 1, 0, 0, 1, 0,
+                                               0, 1, 0, 0, 1, 0, 0, 1,
+                                               0, 0, 1, 0, 0, 1, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7FF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 12-bit values: each kernel pass runs 8 steps x 8 lanes (new blocks loaded at steps 2
+ * and 5); four passes are made. The exception-aware kernel is used only when both pPEX
+ * and pBB are non-NULL. */
+static void bitunpack256v32_12_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 12, kSteps = 8, kPasses = 4 };
+    static const uint8_t kShiftHi[kSteps] = {0, 12, 24, 4, 16, 28, 8, 20};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 8, 0, 0, 4, 0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 1, 0, 0, 1, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 13-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_13_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 13, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  13, 26, 7,  20, 1,  14, 27,
+                                             8,  21, 2,  15, 28, 9,  22, 3,
+                                             16, 29, 10, 23, 4,  17, 30, 11,
+                                             24, 5,  18, 31, 12, 25, 6,  19};
+    static const uint8_t kShiftLo[kSteps] = {0, 0,  6, 0, 12, 0, 0,  5,
+                                             0, 11, 0, 0, 4,  0, 10, 0,
+                                             0, 3,  0, 9, 0,  0, 2,  0,
+                                             8, 0,  0, 1, 0,  7, 0,  0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 1, 0, 1, 0, 0, 1,
+                                               0, 1, 0, 0, 1, 0, 1, 0,
+                                               0, 1, 0, 1, 0, 0, 1, 0,
+                                               1, 0, 0, 1, 0, 1, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1FFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 14-bit values: each kernel pass runs 16 steps x 8 lanes; two passes are made. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_14_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 14, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  14, 28, 10, 24, 6,  20, 2,
+                                             16, 30, 12, 26, 8,  22, 4,  18};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 4, 0, 8, 0,  12, 0,
+                                             0, 2, 0, 6, 0, 10, 0,  0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 1, 0, 1, 0, 1, 0,
+                                               0, 1, 0, 1, 0, 1, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3FFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 15-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_15_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 15, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  15, 30, 13, 28, 11, 26, 9,
+                                             24, 7,  22, 5,  20, 3,  18, 1,
+                                             16, 31, 14, 29, 12, 27, 10, 25,
+                                             8,  23, 6,  21, 4,  19, 2,  17};
+    static const uint8_t kShiftLo[kSteps] = {0, 0, 2,  0,  4,  0,  6,  0,
+                                             8, 0, 10, 0,  12, 0,  14, 0,
+                                             0, 1, 0,  3,  0,  5,  0,  7,
+                                             0, 9, 0,  11, 0,  13, 0,  0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0, 1, 0, 1, 0, 1, 0,
+                                               1, 0, 1, 0, 1, 0, 1, 0,
+                                               0, 1, 0, 1, 0, 1, 0, 1,
+                                               0, 1, 0, 1, 0, 1, 0, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7FFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 16-bit values: each kernel pass runs 2 steps x 8 lanes (low halves, then high halves of
+ * one 8-word block); sixteen passes are made. The exception-aware kernel is used only
+ * when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_16_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 16, kSteps = 2, kPasses = 16 };
+    static const uint8_t kShiftHi[kSteps] = {0, 16};
+    static const uint8_t kShiftLo[kSteps] = {0, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xFFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 17-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_17_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 17, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  17, 2,  19, 4,  21, 6,  23,
+                                             8,  25, 10, 27, 12, 29, 14, 31,
+                                             16, 1,  18, 3,  20, 5,  22, 7,
+                                             24, 9,  26, 11, 28, 13, 30, 15};
+    static const uint8_t kShiftLo[kSteps] = {0,  15, 0,  13, 0,  11, 0,  9,
+                                             0,  7,  0,  5,  0,  3,  0,  1,
+                                             16, 0,  14, 0,  12, 0,  10, 0,
+                                             8,  0,  6,  0,  4,  0,  2,  0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 0, 1, 0, 1, 0, 1,
+                                               0, 1, 0, 1, 0, 1, 0, 1,
+                                               1, 0, 1, 0, 1, 0, 1, 0,
+                                               1, 0, 1, 0, 1, 0, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1FFFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 18-bit values: each kernel pass runs 16 steps x 8 lanes; two passes are made. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_18_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 18, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  18, 4,  22, 8,  26, 12, 30,
+                                             16, 2,  20, 6,  24, 10, 28, 14};
+    static const uint8_t kShiftLo[kSteps] = {0,  14, 0,  10, 0, 6, 0, 2,
+                                             16, 0,  12, 0,  8, 0, 4, 0};
+    // Steps 2, 4, 6, 9, 11, 13 and 15 stay inside the current block; all others load.
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 0, 1, 0, 1, 0, 1,
+                                               1, 0, 1, 0, 1, 0, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3FFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 19-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_19_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 19, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  19, 6,  25, 12, 31, 18, 5,
+                                             24, 11, 30, 17, 4,  23, 10, 29,
+                                             16, 3,  22, 9,  28, 15, 2,  21,
+                                             8,  27, 14, 1,  20, 7,  26, 13};
+    static const uint8_t kShiftLo[kSteps] = {0,  13, 0,  7,  0,  1,  14, 0,
+                                             8,  0,  2,  15, 0,  9,  0,  3,
+                                             16, 0,  10, 0,  4,  17, 0,  11,
+                                             0,  5,  18, 0,  12, 0,  6,  0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 0, 1, 0, 1, 1, 0,
+                                               1, 0, 1, 1, 0, 1, 0, 1,
+                                               1, 0, 1, 0, 1, 1, 0, 1,
+                                               0, 1, 1, 0, 1, 0, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7FFFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 20-bit values: each kernel pass runs 8 steps x 8 lanes; four passes are made. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_20_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 20, kSteps = 8, kPasses = 4 };
+    static const uint8_t kShiftHi[kSteps] = {0, 20, 8, 28, 16, 4, 24, 12};
+    static const uint8_t kShiftLo[kSteps] = {0, 12, 0, 4, 16, 0, 8, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 0, 1, 1, 0, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xFFFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 21-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_21_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 21, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  21, 10, 31, 20, 9,  30, 19,
+                                             8,  29, 18, 7,  28, 17, 6,  27,
+                                             16, 5,  26, 15, 4,  25, 14, 3,
+                                             24, 13, 2,  23, 12, 1,  22, 11};
+    static const uint8_t kShiftLo[kSteps] = {0,  11, 0,  1,  12, 0, 2,  13,
+                                             0,  3,  14, 0,  4,  15, 0, 5,
+                                             16, 0,  6,  17, 0,  7, 18, 0,
+                                             8,  19, 0,  9,  20, 0, 10, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 0, 1, 1, 0, 1, 1,
+                                               0, 1, 1, 0, 1, 1, 0, 1,
+                                               1, 0, 1, 1, 0, 1, 1, 0,
+                                               1, 1, 0, 1, 1, 0, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x1FFFFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+
+/* 22-bit values: each kernel pass runs 16 steps x 8 lanes (128 values), so two passes
+ * are made. The exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_22_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 22, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  22, 12, 2,  24, 14, 4,  26,
+                                             16, 6,  28, 18, 8,  30, 20, 10};
+    static const uint8_t kShiftLo[kSteps] = {0,  10, 20, 0,  8, 18, 0,  6,
+                                             16, 0,  4,  14, 0, 2,  12, 0};
+    // Steps 3, 6, 9, 12 and 15 stay inside the current block; all others load.
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 1, 0, 1, 1, 0, 1,
+                                               1, 0, 1, 1, 0, 1, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3FFFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 23-bit values: one kernel pass of 32 steps x 8 lanes; a new block is loaded wherever
+ * the load table is non-zero. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL. */
+static void bitunpack256v32_23_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 23, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  23, 14, 5,  28, 19, 10, 1,
+                                             24, 15, 6,  29, 20, 11, 2,  25,
+                                             16, 7,  30, 21, 12, 3,  26, 17,
+                                             8,  31, 22, 13, 4,  27, 18, 9};
+    static const uint8_t kShiftLo[kSteps] = {0,  9,  18, 0,  4,  13, 22, 0,
+                                             8,  17, 0,  3,  12, 21, 0,  7,
+                                             16, 0,  2,  11, 20, 0,  6,  15,
+                                             0,  1,  10, 19, 0,  5,  14, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 1, 0, 1, 1, 1, 0,
+                                               1, 1, 0, 1, 1, 1, 0, 1,
+                                               1, 0, 1, 1, 1, 0, 1, 1,
+                                               0, 1, 1, 1, 0, 1, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7FFFFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+/* 24-bit values: each kernel pass runs 4 steps x 8 lanes (32 values) and consumes three
+ * 8-word blocks; eight passes are made. The exception-aware kernel is used only when
+ * both pPEX and pBB are non-NULL.
+ *
+ * Per-step layout:
+ *   step 0: load, no shift
+ *   step 1: current >> 24, next << 8
+ *   step 2: current >> 16, next << 16
+ *   step 3: current >> 8,  no load */
+static void bitunpack256v32_24_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 24, kSteps = 4, kPasses = 8 };
+    static const uint8_t kShiftHi[kSteps] = {0, 24, 16, 8};
+    static const uint8_t kShiftLo[kSteps] = {0, 8, 16, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0xFFFFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 25-bit values: one kernel pass of 32 steps x 8 lanes, i.e. the full 256 values.
+ * Steps 4, 9, 13, 18, 22, 27 and 31 stay inside the current block; every other step
+ * loads a new 8-word block. The exception-aware kernel is used only when both pPEX and
+ * pBB are non-NULL.
+ *
+ * Both branches call their kernel exactly once: a single 32-step pass already produces
+ * 256 values, so a second pass would decode 512 values and run past the input, bitmap,
+ * exception and output buffers. */
+static void bitunpack256v32_25_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    // mask & base bits
+    const uint32_t mask25 = (1u << 25) - 1; // 0x1FFFFFF
+    const int nb25 = 25;
+
+    // 32 steps of 8 lanes each
+    const int expansions_count_25 = 32;
+
+    // Right shift applied to the current block at each step.
+    static const uint8_t SHIFT_HI_25[32] = {0,  25, 18, 11, 4,  29, 22, 15,
+                                            8,  1,  26, 19, 12, 5,  30, 23,
+                                            16, 9,  2,  27, 20, 13, 6,  31,
+                                            24, 17, 10, 3,  28, 21, 14, 7};
+
+    // Left shift applied to the newly loaded block at each step (0 where nothing is loaded).
+    static const uint8_t SHIFT_LO_25[32] = {0,  7,  14, 21, 0,  3,  10, 17,
+                                            24, 0,  6,  13, 20, 0,  2,  9,
+                                            16, 23, 0,  5,  12, 19, 0,  1,
+                                            8,  15, 22, 0,  4,  11, 18, 0};
+
+    // 1 where the step loads a new block, 0 where it reuses the current one.
+    static const uint8_t READ_FLAG_25[32] = {1, 1, 1, 1, 0, 1, 1, 1,
+                                             1, 0, 1, 1, 1, 0, 1, 1,
+                                             1, 1, 0, 1, 1, 1, 0, 1,
+                                             1, 1, 1, 0, 1, 1, 1, 0};
+
+    if (pPEX != NULL && pBB != NULL) {
+        bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_25,
+                                               SHIFT_HI_25, SHIFT_LO_25, READ_FLAG_25, mask25,
+                                               nb25, isZigZag);
+    } else {
+        bitunblk256v32_scalar_template(pIn, pOut, expansions_count_25, SHIFT_HI_25, SHIFT_LO_25,
+                                       READ_FLAG_25, mask25, nb25, isZigZag);
+    }
+}
+
+/* 26-bit values: each kernel pass runs 16 steps x 8 lanes; two passes are made. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_26_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 26, kSteps = 16, kPasses = 2 };
+    static const uint8_t kShiftHi[kSteps] = {0,  26, 20, 14, 8,  2,  28, 22,
+                                             16, 10, 4,  30, 24, 18, 12, 6};
+    static const uint8_t kShiftLo[kSteps] = {0,  6,  12, 18, 24, 0,  4,  10,
+                                             16, 22, 0,  2,  8,  14, 20, 0};
+    // Steps 5, 10 and 15 stay inside the current block; all others load.
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 1, 1, 1, 0, 1, 1,
+                                               1, 1, 0, 1, 1, 1, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x3FFFFFF
+    const bool patched = (pPEX != NULL) && (pBB != NULL);
+
+    for (int pass = 0; pass < kPasses; ++pass) {
+        if (patched) {
+            bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi,
+                                                   kShiftLo, kNeedsLoad, kMask, kBits, isZigZag);
+        } else {
+            bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                           kMask, kBits, isZigZag);
+        }
+    }
+}
+
+/* 27-bit values: one kernel pass of 32 steps x 8 lanes. Steps 6, 12, 19, 25 and 31 stay
+ * inside the current block; every other step loads a new 8-word block. The
+ * exception-aware kernel is used only when both pPEX and pBB are non-NULL. */
+static void bitunpack256v32_27_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    enum { kBits = 27, kSteps = 32 };
+    static const uint8_t kShiftHi[kSteps] = {0,  27, 22, 17, 12, 7,  2,  29,
+                                             24, 19, 14, 9,  4,  31, 26, 21,
+                                             16, 11, 6,  1,  28, 23, 18, 13,
+                                             8,  3,  30, 25, 20, 15, 10, 5};
+    static const uint8_t kShiftLo[kSteps] = {0,  5,  10, 15, 20, 25, 0,  3,
+                                             8,  13, 18, 23, 0,  1,  6,  11,
+                                             16, 21, 26, 0,  4,  9,  14, 19,
+                                             24, 0,  2,  7,  12, 17, 22, 0};
+    static const uint8_t kNeedsLoad[kSteps] = {1, 1, 1, 1, 1, 1, 0, 1,
+                                               1, 1, 1, 1, 0, 1, 1, 1,
+                                               1, 1, 1, 0, 1, 1, 1, 1,
+                                               1, 0, 1, 1, 1, 1, 1, 0};
+    const uint32_t kMask = (1u << kBits) - 1; // 0x7FFFFFF
+
+    if (pPEX == NULL || pBB == NULL) {
+        bitunblk256v32_scalar_template(pIn, pOut, kSteps, kShiftHi, kShiftLo, kNeedsLoad,
+                                       kMask, kBits, isZigZag);
+        return;
+    }
+    bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kSteps, kShiftHi, kShiftLo,
+                                           kNeedsLoad, kMask, kBits, isZigZag);
+}
+static void bitunpack256v32_28_scalar(
+ uint32_t** pIn, uint32_t** pOut,
+ uint32_t** pPEX, // Optional parameter, non-NULL
for extended version
+ unsigned char** pBB, bool isZigZag) // Optional parameter, non-NULL
for extended version
+{
+ /*
+ * Scalar (non-SIMD) unpacker for the 28-bit width.
+ * The 28-bit pattern repeats after 8 steps, so the tables hold 8 entries
+ * and the chosen template helper is invoked 4 times in a row. pIn, pOut
+ * and isZigZag are forwarded unchanged on every call.
+ */
+ const uint32_t mask28 = (1u << 28) - 1; // 0xFFFFFFF
+ const int nb28 = 28;
+ const int expansions_count_28 = 8;
+ static const uint8_t SHIFT_HI_28[8] = {0, 28, 24, 20, 16, 12, 8, 4}; // (28 * k) mod 32
+ static const uint8_t SHIFT_LO_28[8] = {0, 4, 8, 12, 16, 20, 24, 0};
+ static const uint8_t READ_FLAG_28[8] = {1, 1, 1, 1, 1, 1, 1, 0}; // last step loads no new input
+
+ // Choose template based on whether extension parameters are provided
+ if (pPEX != NULL && pBB != NULL) {
+ // Call extended template, each call outputs 64 values, loop 4 times to
get 256
+ for (int i = 0; i < 4; i++) {
+ bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB,
expansions_count_28,
+ SHIFT_HI_28, SHIFT_LO_28,
READ_FLAG_28, mask28,
+ nb28, isZigZag);
+ }
+ } else {
+ // Call non-extended template, also each call outputs 64 values, loop 4
times to get 256
+ for (int i = 0; i < 4; i++) {
+ bitunblk256v32_scalar_template(pIn, pOut, expansions_count_28,
SHIFT_HI_28, SHIFT_LO_28,
+ READ_FLAG_28, mask28, nb28, isZigZag);
+ }
+ }
+}
+/*
+ * Scalar (non-SIMD) unpacker for the 29-bit width.
+ *
+ * Passes a 29-bit mask and three 32-entry per-step tables to the
+ * exception-aware template when both pPEX and pBB are non-NULL, and to the
+ * plain template otherwise. The helper is called once with 32 steps.
+ *
+ * Table values: SHIFT_HI_29[k] equals (29 * k) mod 32; READ_FLAG_29 is 0 at
+ * steps 10, 21 and 31, the same steps where SHIFT_LO_29 is 0 (besides step 0).
+ */
+static void bitunpack256v32_29_scalar(uint32_t** pIn, uint32_t** pOut,
uint32_t** pPEX,
+ unsigned char** pBB, bool isZigZag) {
+ const uint32_t mask29 = (1U << 29) - 1; // 0x1FFFFFFF
+ const int expansions_count = 32;
+ static const uint8_t SHIFT_HI_29[32] = {0, 29, 26, 23, 20, 17, 14, 11, 8,
5, 2,
+ 31, 28, 25, 22, 19, 16, 13, 10, 7,
4, 1,
+ 30, 27, 24, 21, 18, 15, 12, 9, 6,
3};
+ static const uint8_t SHIFT_LO_29[32] = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27,
0,
+ 1, 4, 7, 10, 13, 16, 19, 22, 25, 28,
0,
+ 2, 5, 8, 11, 14, 17, 20, 23, 26, 0};
+ static const uint8_t READ_FLAG_29[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0};
+ if (pPEX != NULL && pBB != NULL) { // exception-aware path
+ bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB,
expansions_count, SHIFT_HI_29,
+ SHIFT_LO_29, READ_FLAG_29,
mask29, 29, isZigZag);
+ } else { // plain path, no exception data
+ bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_29,
SHIFT_LO_29,
+ READ_FLAG_29, mask29, 29, isZigZag);
+ }
+}
+/*
+ * Scalar (non-SIMD) unpacker for the 30-bit width.
+ *
+ * The 30-bit pattern repeats after 16 steps, so the tables hold 16 entries
+ * and the chosen template helper is invoked twice in a row. The
+ * exception-aware template is used when both pPEX and pBB are non-NULL.
+ *
+ * Table values: SHIFT_HI_30[k] equals (30 * k) mod 32; only the last step
+ * has READ_FLAG_30 == 0.
+ */
+static void bitunpack256v32_30_scalar(uint32_t** pIn, uint32_t** pOut,
uint32_t** pPEX,
+ unsigned char** pBB, bool isZigZag) {
+ const uint32_t mask30 = (1U << 30) - 1; // 0x3FFFFFFF
+ const int expansions_count = 16;
+ static const uint8_t SHIFT_HI_30[16] = {0, 30, 28, 26, 24, 22, 20, 18,
+ 16, 14, 12, 10, 8, 6, 4, 2};
+ static const uint8_t SHIFT_LO_30[16] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18,
20, 22, 24, 26, 28, 0};
+ static const uint8_t READ_FLAG_30[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0};
+
+ if (pPEX != NULL && pBB != NULL) { // exception-aware path, two passes
+ for (int i = 0; i < 2; i++) {
+ bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB,
expansions_count,
+ SHIFT_HI_30, SHIFT_LO_30,
READ_FLAG_30, mask30, 30,
+ isZigZag);
+ }
+ } else { // plain path, two passes
+ for (int i = 0; i < 2; i++) {
+ bitunblk256v32_scalar_template(pIn, pOut, expansions_count,
SHIFT_HI_30, SHIFT_LO_30,
+ READ_FLAG_30, mask30, 30, isZigZag);
+ }
+ }
+}
+
+/*
+ * Scalar (non-SIMD) unpacker for the 31-bit width.
+ *
+ * Passes a 31-bit mask and three 32-entry per-step tables to the
+ * exception-aware template when both pPEX and pBB are non-NULL, and to the
+ * plain template otherwise. pIn, pOut and isZigZag are forwarded unchanged.
+ *
+ * Each table must hold exactly 32 entries (one per step). An extra
+ * initializer is invalid for a 32-element array, and compilers that merely
+ * warn drop the trailing value, which leaves step 31 with SHIFT_LO = 31 and
+ * READ_FLAG = 1 instead of the intended 0 / 0.
+ */
+static void bitunpack256v32_31_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+    const uint32_t mask31 = (1U << 31) - 1; // 0x7FFFFFFF
+    const int expansions_count = 32;
+    // Per-step parameters:
+    //   k == 0      : SHIFT_HI = 0,      SHIFT_LO = 0, READ_FLAG = 1
+    //   k = 1 .. 30 : SHIFT_HI = 32 - k, SHIFT_LO = k, READ_FLAG = 1
+    //   k == 31     : SHIFT_HI = 1,      SHIFT_LO = 0, READ_FLAG = 0
+    static const uint8_t SHIFT_HI[32] = {0,  31, 30, 29, 28, 27, 26, 25,
+                                         24, 23, 22, 21, 20, 19, 18, 17,
+                                         16, 15, 14, 13, 12, 11, 10, 9,
+                                         8,  7,  6,  5,  4,  3,  2,  1};
+    static const uint8_t SHIFT_LO[32] = {0,  1,  2,  3,  4,  5,  6,  7,
+                                         8,  9,  10, 11, 12, 13, 14, 15,
+                                         16, 17, 18, 19, 20, 21, 22, 23,
+                                         24, 25, 26, 27, 28, 29, 30, 0};
+    // The last step takes its value from the word already loaded, so no read.
+    static const uint8_t READ_FLAG[32] = {1, 1, 1, 1, 1, 1, 1, 1,
+                                          1, 1, 1, 1, 1, 1, 1, 1,
+                                          1, 1, 1, 1, 1, 1, 1, 1,
+                                          1, 1, 1, 1, 1, 1, 1, 0};
+    if (pPEX != NULL && pBB != NULL) {
+        bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI,
+                                               SHIFT_LO, READ_FLAG, mask31, 31, isZigZag);
+    } else {
+        bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI, SHIFT_LO, READ_FLAG,
+                                       mask31, 31, isZigZag);
+    }
+}
+
+/*
+ * Scalar unpacker for the 32-bit width, where every input word already is
+ * one output value.
+ *
+ * Copies 32 groups of 8 words from *pIn to *pOut. When both pPEX and pBB are
+ * non-NULL, one mask byte is consumed from *pBB per group and, if it is
+ * non-zero, applyException_8bits() patches that group. When isZigZag is set,
+ * each value of the group is then passed through zigzagDecode_scalar().
+ * On return *pIn and *pOut have both advanced by 256 words.
+ */
+static void bitunpack256v32_32_scalar(uint32_t** pIn, uint32_t** pOut,
+                                      uint32_t** pPEX,     // optional, non-NULL for extended version
+                                      unsigned char** pBB, // optional, non-NULL for extended version
+                                      bool isZigZag) {
+    const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+    uint32_t* src = *pIn;
+    uint32_t* dst = *pOut;
+
+    for (int group = 0; group < 32; ++group, src += 8, dst += 8) {
+        // Plain word-for-word copy of the 8 values of this group
+        for (int lane = 0; lane < 8; ++lane) {
+            dst[lane] = src[lane];
+        }
+
+        if (withExceptions) {
+            // One mask byte per group; the cursor always advances
+            const uint8_t groupMask = *(*pBB)++;
+            if (groupMask != 0) {
+                applyException_8bits(groupMask, pPEX, 32, dst);
+            }
+        }
+
+        if (isZigZag) {
+            for (int lane = 0; lane < 8; ++lane) {
+                dst[lane] = zigzagDecode_scalar(dst[lane]);
+            }
+        }
+    }
+
+    *pIn = src;
+    *pOut = dst;
+}
+
+/*
+ * Function-pointer type shared by every per-width scalar unpacker.
+ * Arguments, in the order the unpackers in this file declare them: input
+ * cursor, output cursor, optional exception-value cursor (pPEX), optional
+ * exception-mask cursor (pBB), zigzag flag.
+ * NOTE(review): the third parameter is spelled unsigned** here while the
+ * unpackers declare uint32_t**; the two only agree where uint32_t is
+ * unsigned int.
+ */
+typedef void (*unpack_func_t)(uint32_t**, uint32_t**, unsigned**, unsigned
char**, bool);
+
+/*
+ * Dispatch table indexed by bit width: entry w is bitunpack256v32_<w>_scalar
+ * for w = 0 .. 32 (33 entries). Callers must range-check the index.
+ */
+static unpack_func_t unpack_funcs[33] = {
+ bitunpack256v32_0_scalar, bitunpack256v32_1_scalar,
bitunpack256v32_2_scalar,
+ bitunpack256v32_3_scalar, bitunpack256v32_4_scalar,
bitunpack256v32_5_scalar,
+ bitunpack256v32_6_scalar, bitunpack256v32_7_scalar,
bitunpack256v32_8_scalar,
+ bitunpack256v32_9_scalar, bitunpack256v32_10_scalar,
bitunpack256v32_11_scalar,
+ bitunpack256v32_12_scalar, bitunpack256v32_13_scalar,
bitunpack256v32_14_scalar,
+ bitunpack256v32_15_scalar, bitunpack256v32_16_scalar,
bitunpack256v32_17_scalar,
+ bitunpack256v32_18_scalar, bitunpack256v32_19_scalar,
bitunpack256v32_20_scalar,
+ bitunpack256v32_21_scalar, bitunpack256v32_22_scalar,
bitunpack256v32_23_scalar,
+ bitunpack256v32_24_scalar, bitunpack256v32_25_scalar,
bitunpack256v32_26_scalar,
+ bitunpack256v32_27_scalar, bitunpack256v32_28_scalar,
bitunpack256v32_29_scalar,
+ bitunpack256v32_30_scalar, bitunpack256v32_31_scalar,
bitunpack256v32_32_scalar};
+/**
+ * Unpacks one bit-packed block with scalar (non-SIMD) code, without
+ * exception handling, optionally zigzag-decoding every value.
+ *
+ * @param in       Compressed data input stream
+ * @param n        Not used by this function
+ * @param out      Output buffer for the decompressed 32-bit integers (must
+ *                 accommodate at least 256 32-bit integers)
+ * @param b        Bit-width byte; its low 6 bits select the unpacker (0..32)
+ * @param isZigZag Forwarded to the per-width unpacker
+ * @return in + PAD8(256 * b), the next readable input position
+ *
+ * If the low 6 bits of b are greater than 32, nothing is written to out; the
+ * returned pointer is computed the same way regardless.
+ */
+unsigned char* bitunpack256scalarv32_withzigzag(const unsigned char* __restrict in, unsigned n,
+                                                unsigned* __restrict out, unsigned b,
+                                                bool isZigZag) {
+    (void)n;
+
+    // The return value depends only on in and b, so compute it up front.
+    unsigned char* ip = (unsigned char*)(in + PAD8(256 * b));
+
+    // Cursors handed to the per-width unpacker (advanced by the callee)
+    uint32_t* pIn32 = (uint32_t*)in;
+    uint32_t* pOut32 = (uint32_t*)out;
+
+    // unpack_funcs has 33 entries (widths 0..32); guard the index.
+    const unsigned bits = b & 0x3f;
+    if (bits <= 32) {
+        unpack_funcs[bits](&pIn32, &pOut32, NULL, NULL, isZigZag);
+    }
+
+    return ip;
+}
+/*
+ * Scalar unpack of one block without exceptions and without zigzag decoding.
+ *
+ * Thin wrapper: delegates to bitunpack256scalarv32_withzigzag() with
+ * isZigZag == false. That function returns in + PAD8(256 * b), which is the
+ * value this wrapper returns as well.
+ */
+unsigned char* bitunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                     unsigned* __restrict out, unsigned b) {
+    return bitunpack256scalarv32_withzigzag(in, n, out, b, false);
+}
+/*
+ * Delta-1 unpacking of one block with exception handling (scalar code).
+ *
+ * Unpacks the block into a scratch buffer via _bitunpack256scalarv32() and
+ * then writes the running sum out[i] = start + sum_{j <= i} (delta[j] + 1).
+ *
+ * The unpackers always produce a full block of 256 values regardless of n,
+ * so the scratch buffer is a fixed 256-entry array; sizing it by n would be
+ * overrun whenever n < 256. At most min(n, 256) values are written to out.
+ *
+ * @return the input position returned by _bitunpack256scalarv32()
+ */
+unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                        unsigned* __restrict out, unsigned start, unsigned b,
+                                        unsigned* __restrict pex, unsigned char* bb) {
+    enum { BLOCK_VALUES = 256 };
+    unsigned deltas[BLOCK_VALUES];
+    const unsigned count = n < (unsigned)BLOCK_VALUES ? n : (unsigned)BLOCK_VALUES;
+
+    unsigned char* next = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, false);
+
+    // Prefix sum; the +1 is the implicit minimum step of delta-1 coding
+    unsigned running_sum = start;
+    for (unsigned i = 0; i < count; ++i) {
+        running_sum += deltas[i] + 1;
+        out[i] = running_sum;
+    }
+
+    return next;
+}
+
+/*
+ * Delta-1 unpacking of one block without exceptions (scalar code).
+ *
+ * Unpacks the block into a scratch buffer via bitunpack256scalarv32() and
+ * then writes the running sum out[i] = start + sum_{j <= i} (delta[j] + 1).
+ *
+ * The unpackers always produce a full block of 256 values regardless of n,
+ * so the scratch buffer is a fixed 256-entry array; a variable-length array
+ * of n elements would be overrun for n < 256 and is undefined for n == 0.
+ * At most min(n, 256) values are written to out.
+ *
+ * @return the input position returned by bitunpack256scalarv32()
+ */
+unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                       unsigned* __restrict out, unsigned start, unsigned b) {
+    enum { BLOCK_VALUES = 256 };
+    unsigned deltas[BLOCK_VALUES];
+    const unsigned count = n < (unsigned)BLOCK_VALUES ? n : (unsigned)BLOCK_VALUES;
+
+    unsigned char* next = bitunpack256scalarv32(in, n, deltas, b);
+
+    // Prefix sum; the +1 is the implicit minimum step of delta-1 coding
+    unsigned running_sum = start;
+    for (unsigned i = 0; i < count; ++i) {
+        running_sum += deltas[i] + 1;
+        out[i] = running_sum;
+    }
+
+    return next;
+}
+/*
+ * Exception-aware scalar unpack of one block.
+ *
+ * Copies pex and bb into local cursors and passes the ADDRESSES of those
+ * locals to the per-width unpacker. Those addresses are never NULL, so the
+ * unpackers' "pPEX != NULL && pBB != NULL" test always selects the
+ * exception-aware path here; pex and bb themselves must therefore be valid.
+ *
+ * Returns in + PAD8(256 * b). If the low 6 bits of b exceed 32, nothing is
+ * written to out.
+ */
+unsigned char* _bitunpack256scalarv32(const unsigned char* __restrict in,
unsigned n,
+ unsigned* __restrict out, unsigned b,
+ unsigned* __restrict pex, unsigned char*
bb, bool isZigZag) {
+ // Debug output (optional, can be removed in production)
+ //printf("_bitunpack256scalarv32 bits=%d isZigZag=%d\n", b & 0x3f, isZigZag);
+
+ // Return value: depends only on in and b
+ unsigned char* ip = (unsigned char*)(in + PAD8(256 * b));
+
+ // Local cursors; the callee advances these, the caller's pex/bb are untouched
+ unsigned* pPEX = pex;
+ unsigned char* pBB = bb;
+ uint32_t* pIn32 = (uint32_t*)in;
+ uint32_t* pOut32 = (uint32_t*)out;
+
+ unsigned bits = b & 0x3f;
+ // unpack_funcs has 33 entries (widths 0..32); guard the index
+ if (bits <= 32) {
+ unpack_funcs[bits](&pIn32, &pOut32, &pPEX, &pBB, isZigZag);
+ }
+
+ return ip;
+}
+
+/*
+ * Zigzag-delta unpacking of one block without exceptions (scalar code).
+ *
+ * Unpacks and zigzag-decodes the block into a scratch buffer via
+ * bitunpack256scalarv32_withzigzag() and then writes the running sum
+ * out[i] = start + sum_{j <= i} delta[j].
+ *
+ * The unpackers always produce a full block of 256 values regardless of n,
+ * so the scratch buffer is a fixed 256-entry array; a variable-length array
+ * of n elements would be overrun for n < 256 and is undefined for n == 0.
+ * At most min(n, 256) values are written to out.
+ *
+ * @return the input position returned by bitunpack256scalarv32_withzigzag()
+ */
+unsigned char* bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                      unsigned* __restrict out, unsigned start, unsigned b) {
+    enum { BLOCK_VALUES = 256 };
+    unsigned deltas[BLOCK_VALUES];
+    const unsigned count = n < (unsigned)BLOCK_VALUES ? n : (unsigned)BLOCK_VALUES;
+
+    unsigned char* next = bitunpack256scalarv32_withzigzag(in, n, deltas, b, true);
+
+    // Prefix sum over the decoded (signed, wrapped into unsigned) deltas
+    unsigned running_sum = start;
+    for (unsigned i = 0; i < count; ++i) {
+        running_sum += deltas[i];
+        out[i] = running_sum;
+    }
+
+    return next;
+}
+/*
+ * Zigzag-delta unpacking of one block with exception handling (scalar code).
+ *
+ * Unpacks and zigzag-decodes the block into a scratch buffer via
+ * _bitunpack256scalarv32() and then writes the running sum
+ * out[i] = start + sum_{j <= i} delta[j].
+ *
+ * The unpackers always produce a full block of 256 values regardless of n,
+ * so the scratch buffer is a fixed 256-entry array; sizing it by n would be
+ * overrun whenever n < 256. At most min(n, 256) values are written to out.
+ *
+ * @return the input position returned by _bitunpack256scalarv32()
+ */
+unsigned char* _bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                       unsigned* __restrict out, unsigned start, unsigned b,
+                                       unsigned* __restrict pex, unsigned char* bb) {
+    enum { BLOCK_VALUES = 256 };
+    unsigned deltas[BLOCK_VALUES];
+    const unsigned count = n < (unsigned)BLOCK_VALUES ? n : (unsigned)BLOCK_VALUES;
+
+    unsigned char* next = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, true);
+
+    // Prefix sum over the decoded (signed, wrapped into unsigned) deltas
+    unsigned running_sum = start;
+    for (unsigned i = 0; i < count; ++i) {
+        running_sum += deltas[i];
+        out[i] = running_sum;
+    }
+
+    return next;
+}
#define STOZ64(_op_, _ov_) _mm_storeu_si128(_op_++, _ov_);
_mm_storeu_si128(_op_++, _ov_)
#define STO64( _op_, _ov_, _zv_) _mm_storeu_si128(_op_++,
_mm_unpacklo_epi32(_ov_,_zv_));_mm_storeu_si128(_op_++,
_mm_unpacklo_epi32(_mm_srli_si128(_ov_,8),_zv_))
diff --git a/src/ext/for/test_bitd1unpack.cpp b/src/ext/for/test_bitd1unpack.cpp
new file mode 100644
index 00000000000..23484a7f50e
--- /dev/null
+++ b/src/ext/for/test_bitd1unpack.cpp
@@ -0,0 +1,399 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <vp4.h>
+#include <vint.h>
+#include "conf.h"
+
+// Define PAD8 (bit count rounded up to whole bytes) unless a header already provides it
+#ifndef PAD8
+#define PAD8(_x_) (((_x_) + 7) / 8)
+#endif
+
+const unsigned TEST_SIZE = 512; // file-wide default; the test functions below declare their own TEST_SIZE
+
+#ifdef __AVX2__
+void generate_test_data(unsigned* raw_values, unsigned n, unsigned char*
encoded_data,
+ unsigned* out_size) {
+ // Encode the n raw values into encoded_data with p4nd1enc256v32
+ size_t end_ptr = p4nd1enc256v32(raw_values, n, encoded_data);
+
+ // Report the encoder's return value as the encoded size (narrowed from size_t to unsigned)
+ *out_size = end_ptr;
+}
+#endif
+#define _1vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\
+ if(!(_x_ & 0x80u)) {
_act_;}\
+ else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++;
_act_;}\
+ else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_);
_ip_ += 2; _act_;}\
+ else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu);
_ip_ += 3; _act_;}\
+ else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 |
ctou32(_ip_); _ip_ += 4; _act_;}\
+} while(0)
+#define xvbxget32(_ip_, _x_) _1vbxget32(_ip_, _x_, ;)
+/*
+ * Notes on the two macros above.
+ * _1vbxget32 reads a variable-length integer at byte pointer _ip_ into _x_,
+ * advances _ip_ past it and then runs the statement _act_. The leading bits
+ * of the first byte select the length: 0xxxxxxx consumes 1 byte, 10xxxxxx
+ * 2 bytes, 110xxxxx 3 bytes, 1110xxxx 4 bytes, anything else 5 bytes.
+ * In the 5-byte branch the value is formed as a 64-bit expression; when _x_
+ * is a 32-bit variable (as in this file) the bits above 31 are dropped on
+ * assignment.
+ * xvbxget32 is the same read with an empty action.
+ */
+// Lookup table of 10^k for k = 0..10, so callers never need to call pow()
+// Note: 10^10 = 10000000000 does not fit in 32 bits, hence the 64-bit element type
+static const uint64_t g_pow10[] = {
+ 1ULL, // 10^0
+ 10ULL, // 10^1
+ 100ULL, // 10^2
+ 1000ULL, // 10^3
+ 10000ULL, // 10^4
+ 100000ULL, // 10^5
+ 1000000ULL, // 10^6
+ 10000000ULL, // 10^7
+ 100000000ULL, // 10^8
+ 1000000000ULL,// 10^9
+ 10000000000ULL// 10^10
+};
+
+// Returns 10^(floor(b/3)) from g_pow10; indices past the table end yield the last entry, 10^10
+static inline uint64_t get_pow10_for_b(unsigned b) {
+ // Table index: floor(b/3)
+ unsigned idx = b / 3;
+ if (idx >= sizeof(g_pow10)/sizeof(g_pow10[0])) {
+ // Past the largest precomputed power: clamp to 10^10
+ return g_pow10[10];
+ }
+ return g_pow10[idx];
+}
+
+// Returns 2^b as an unsigned value
+static inline unsigned power2(unsigned b) {
+ // NOTE: (1U << b) is not well defined once b reaches the width of unsigned (e.g. b=32); not guarded here
+ return (1U << b);
+}
+
+
+/**
+ * Fills values[0..n) with pseudo-random test data (uses rand()) whose
+ * magnitude is scaled by the requested bit width.
+ *
+ * with_exception == 0: a strictly increasing sequence starting at a random
+ * base below 1000, with every step in [1, inc_range].
+ * with_exception != 0: independent random values in [0, val_range).
+ *
+ * @param values output array
+ * @param n number of values to generate (nothing is written when 0)
+ * @param b nominal bit width used to pick the value range
+ * @param with_exception 0 = no exceptions, 1 = with exceptions
+ */
+void generate_raw_data_for_bitwidth(unsigned* values, unsigned n,
+ unsigned b, int with_exception)
+{
+ if (n == 0) return;
+
+ if (!with_exception) {
+ // =====================================
+ // No-exception mode: increasing sequence
+ // =====================================
+ // 1) Start from a random base value below 1000
+ unsigned base = rand() % 1000;
+ values[0] = base;
+
+ // 2) Pick the maximum increment according to b
+ unsigned inc_range;
+ if (b < 4) {
+ // b=0 => 2^0=1, b=1 =>2, b=2 =>4, b=3=>8
+ inc_range = power2(b);
+ } else {
+ // b>=4 => use 10^(floor(b/3)) => 10,100,1000,...
+ uint64_t r = get_pow10_for_b(b);
+ // r can exceed the unsigned range (10^10 for large b),
+ // so clamp it before narrowing to unsigned
+ if (r > 0xFFFFFFFFULL) {
+ r = 0xFFFFFFFFULL; // avoid overflow
+ }
+ inc_range = (unsigned)r;
+ }
+
+ // 3) Generate the increasing sequence
+ for (unsigned i = 1; i < n; i++) {
+ // The +1 avoids a zero increment
+ unsigned inc = 1 + rand() % inc_range;
+ base += inc;
+ values[i] = base;
+ }
+
+ } else {
+ // =====================================
+ // Exception mode: plain random values
+ // =====================================
+
+ // Ranges used, by example:
+ // - b=0 => rand()%2
+ // - b=2 => rand()%4
+ // - b=3 => rand()%10
+ // - b=7 => rand()%100
+ // - b=10 => rand()%1000
+ // - b=13 => rand()%10000
+ // => rule: for b >= 3 use 10^(floor(b/3)); b < 3 is special-cased
+
+ uint64_t val_range = 0; // held in 64 bits, narrowed to unsigned at the point of use
+
+ if (b == 0) {
+ val_range = 2; // 0..1
+ }
+ else if (b == 1) {
+ // No dedicated example for b=1; treated the same as b=0 => range=2
+ val_range = 2; // 0..1
+ }
+ else if (b == 2) {
+ val_range = 4; // 0..3
+ }
+ else {
+ // b>=3 => use 10^(floor(b/3))
+ val_range = get_pow10_for_b(b);
+ // Clamp to the unsigned range here as well
+ if (val_range > 0xFFFFFFFFULL) {
+ val_range = 0xFFFFFFFFULL;
+ }
+ }
+
+ // Independent random values
+ for (unsigned i = 0; i < n; i++) {
+ unsigned x = (unsigned)(rand() % (unsigned)val_range);
+ values[i] = x;
+ }
+ }
+}
+
+/**
+ * Fills values[0..n) with pseudo-random signed values (uses rand()), stored
+ * into the unsigned array by implicit conversion.
+ *
+ * Magnitudes are drawn from [0, val_range):
+ * - b<3: val_range = 1<<b
+ * - b>=3: val_range = 10^(floor(b/3)), clamped to 0x7fffffff
+ *
+ * with_exception only changes how often a value is negated:
+ * with_exception=0 => negated when rand()%10 == 0
+ * with_exception=1 => negated when the low bit of rand() is 1
+ */
+static void generate_raw_signed_data_for_zigzag(unsigned *values,
+ unsigned n,
+ unsigned b,
+ int with_exception)
+{
+ if (n == 0) return;
+
+ // srand(...) is expected to have been called once by the caller
+ uint64_t val_range = 1;
+ if (b < 3) {
+ // val_range is 1, 2, 4 for b=0, 1, 2
+ val_range = (1ULL << b);
+ } else {
+ // b>=3 => use get_pow10_for_b(b) => 10^(floor(b/3))
+ val_range = get_pow10_for_b(b); // same scaling as the delta test data generator
+ if(val_range > 0x7fffffffULL) {
+ val_range = 0x7fffffffULL; // keep the magnitude within int32_t
+ }
+ }
+
+ for(unsigned i=0; i<n; i++){
+ // Draw a magnitude in 0..val_range-1
+ int32_t x = (int32_t)(rand() % (unsigned)val_range);
+ // Randomly choose the sign
+ if(with_exception) {
+ // Negate when the low bit of rand() is set
+ if((rand() & 1) == 1) x = -x;
+ } else {
+ // No-exception mode => mostly positive, negated when rand()%10 == 0
+ if((rand()%10)==0) x = -x;
+ }
+ values[i] = x;
+ }
+}
+#ifdef __AVX2__
+void run_testZigzag(unsigned b, // nominal bit width used to scale the random input
+ int with_exception, // selects the sign distribution of the generated data
+ unsigned TEST_SIZE, // number of values to encode/decode
+ unsigned *raw_values, // scratch: generated input, TEST_SIZE entries
+ unsigned char *encoded_data, // scratch: encoder output buffer
+ unsigned *decoded1, // scratch: output of p4nzdec256v32
+ unsigned *decoded2) // scratch: output of p4nzdec256scalarv32
+{ // Encodes random signed data with p4nzenc256v32, decodes it with both decoders and prints the comparison
+ printf("Zigzag 测试: 位宽 b=%u, with_exception=%d\n", b, with_exception);
+
+ // 1) Generate raw data containing positive and negative values
+ generate_raw_signed_data_for_zigzag(raw_values, TEST_SIZE, b,
with_exception);
+ unsigned encoded_size = p4nzenc256v32(raw_values, TEST_SIZE, encoded_data);
+
+ // Read the encoded header for the log line (start value, then the width byte)
+ unsigned start;
+ unsigned char* copy = encoded_data;
+ xvbxget32(copy, start);
+ unsigned char encoded_b = copy[0]; // byte following the start value, treated as the bit-width byte
+ if((encoded_b & 0x40)) {
+ encoded_b &= 0x3f;
+ } else {
+ if(encoded_b & 0x80) {
+ encoded_b &= 0x7f;
+ }
+ }
+ printf(" 编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start,
encoded_size);
+
+ // 3) decode => two versions for cross-check
+ // (here we define "decoded1" from "bitzunpack256v32...??" and
"decoded2" from "bitzunpack256scalarv32Zigzag"??)
+ memset(decoded1,0,TEST_SIZE*sizeof(unsigned));
+ memset(decoded2,0,TEST_SIZE*sizeof(unsigned));
+
+ // "decoded1" => maybe vector version if you have it? e.g.
"bitzunpack256v32(in,b, out,??)"
+ // "decoded2" => scalar version ?
+
+ // Decode the same buffer with both decoders so the results can be compared
+ p4nzdec256v32(encoded_data, TEST_SIZE, decoded1);
+ p4nzdec256scalarv32(encoded_data, TEST_SIZE, decoded2);
+
+ // 4) Compare the two decodes; only the first 10 mismatches are printed
+ int mismatch=0;
+ for(unsigned i=0;i<TEST_SIZE;i++){
+ if(decoded1[i] != decoded2[i]){
+ if(mismatch<10)
+ printf(" mismatch at i=%u: dec1=%d, dec2=%d\n", i,
decoded1[i], decoded2[i]);
+ mismatch++;
+ }
+ }
+ if(mismatch==0){
+ printf(" decode1 & decode2 match!\n");
+ // Both decoders agree: also check decoded1 against the original input
+ int error=0;
+ for(unsigned i=0;i<TEST_SIZE;i++){
+ if(decoded1[i] != raw_values[i]){
+ if(error<10)
+ printf(" raw mismatch at i=%u: raw=%d, dec=%d\n",
i,raw_values[i], decoded1[i]);
+ error++;
+ }
+ }
+ if(error==0) printf(" and match raw data!\n");
+ else printf(" total %d raw mismatch\n", error);
+ } else {
+ printf(" total mismatch=%d\n", mismatch);
+ }
+ printf("\n");
+}
+/*
+ * One delta-1 round trip for a given nominal bit width.
+ *
+ * Generates TEST_SIZE values, encodes them through generate_test_data()
+ * (p4nd1enc256v32), decodes the result with both p4nd1dec256v32 and
+ * p4nd1dec256scalarv32, and prints whether the two decodes agree and, if
+ * they do, whether they reproduce the input. Results are only printed;
+ * nothing is returned or asserted. All buffers are caller-provided scratch.
+ */
+void run_test(unsigned b, int with_exception, unsigned TEST_SIZE,
+ unsigned* raw_values, unsigned char* encoded_data,
+ unsigned* decoded1, unsigned* decoded2) {
+ printf("测试: 位宽 b=%u, 异常%s\n", b, (with_exception ? "有" : "无"));
+
+ // Generate raw data matching the current b and exception mode
+ generate_raw_data_for_bitwidth(raw_values, TEST_SIZE, b, with_exception);
+
+ unsigned encoded_size;
+ generate_test_data(raw_values, TEST_SIZE, encoded_data, &encoded_size);
+
+ // Read the encoded header for the log line (start value, then the width byte)
+ unsigned start;
+ unsigned char* copy = encoded_data;
+ xvbxget32(copy, start);
+ unsigned char encoded_b = copy[0]; // byte following the start value, treated as the bit-width byte
+ if((encoded_b & 0x40)) {
+ encoded_b &= 0x3f;
+ } else {
+ if(encoded_b & 0x80) {
+ encoded_b &= 0x7f;
+ }
+ }
+ printf(" 编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start,
encoded_size);
+
+ // Clear the decode buffers
+ memset(decoded1, 0, TEST_SIZE * sizeof(unsigned));
+ memset(decoded2, 0, TEST_SIZE * sizeof(unsigned));
+
+ // Decode the same buffer with both decoders
+ p4nd1dec256v32(encoded_data, TEST_SIZE, decoded1);
+ p4nd1dec256scalarv32(encoded_data, TEST_SIZE, decoded2);
+
+ // Compare the two decodes; only the first 10 mismatches are printed
+ int mismatch = 0;
+ for (unsigned i = 0; i < TEST_SIZE; i++) {
+ if (decoded1[i] != decoded2[i]) {
+ if (mismatch < 10)
+ printf(" 不匹配: 索引 %u, 原始值=%u, 原始解码=%u, 标量解码=%u\n",
+ i, raw_values[i], decoded1[i], decoded2[i]);
+ mismatch++;
+ }
+ }
+ if (mismatch == 0) {
+ printf(" 通过: 所有解码值匹配!\n");
+ // Check the decoded values against the original data (stops after 10 errors)
+ int error = 0;
+ for (unsigned i = 0; i < TEST_SIZE && error < 10; i++) {
+ if (decoded1[i] != raw_values[i]) {
+ printf(" 编码/解码错误: 索引 %u, 原始值=%u, 解码值=%u\n",
+ i, raw_values[i], decoded1[i]);
+ error++;
+ }
+ }
+ if (error == 0)
+ printf(" 验证通过: 解码结果与原始数据一致\n");
+ } else { // dump the first 16 entries of each array to help diagnose
+ printf(" 失败: 有 %d 个值不匹配\n", mismatch);
+ printf(" 原始数据 (前16个): ");
+ for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++)
+ printf("%u ", raw_values[i]);
+ printf("...\n");
+ printf(" 原始解码 (前16个): ");
+ for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++)
+ printf("%u ", decoded1[i]);
+ printf("...\n");
+ printf(" 标量解码 (前16个): ");
+ for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++)
+ printf("%u ", decoded2[i]);
+ printf("...\n");
+ }
+ printf("\n");
+}
+
+/*
+ * Runs the zigzag round-trip comparison (run_testZigzag) for every nominal
+ * bit width 0..32, once per with_exception mode, reusing one set of buffers.
+ */
+void testZigZag()
+{
+    const unsigned TEST_SIZE = 512;
+    unsigned* raw_values = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+    unsigned* decoded1 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+    unsigned* decoded2 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+    // Encoded output can be larger than the raw input (per-block headers and
+    // exceptions), so use the same 2x head-room as test_p4nd1dec256v32()
+    // instead of the former "TEST_SIZE*4 + 10" guess.
+    unsigned char* encoded_data = (unsigned char*)malloc(TEST_SIZE * sizeof(unsigned) * 2);
+
+    if (!raw_values || !decoded1 || !decoded2 || !encoded_data) {
+        fprintf(stderr, "testZigZag: out of memory\n");
+        free(raw_values);
+        free(decoded1);
+        free(decoded2);
+        free(encoded_data);
+        return;
+    }
+
+    srand((unsigned)time(NULL));
+    printf("开始测试 p4nzdec256v32...\n");
+
+    for (unsigned b = 0; b <= 32; b++) {
+        run_testZigzag(b, 0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+        run_testZigzag(b, 1, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+    }
+
+    free(raw_values);
+    free(decoded1);
+    free(decoded2);
+    free(encoded_data);
+}
+/*
+ * Runs the delta-1 round-trip comparison (run_test) for nominal bit widths
+ * 0..31 in both modes and for width 32 in no-exception mode only, reusing
+ * one set of heap buffers. malloc results are not checked.
+ */
+void test_p4nd1dec256v32() {
+ const unsigned TEST_SIZE = 512;
+
+ // Allocate the buffers (the encode buffer gets twice the raw size)
+ unsigned* raw_values = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+ unsigned char* encoded_data = (unsigned char*)malloc(TEST_SIZE *
sizeof(unsigned) * 2);
+ unsigned* decoded1 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+ unsigned* decoded2 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned));
+
+ srand((unsigned)time(NULL));
+ printf("开始测试 p4nd1dec256v32...\n");
+
+ // For b = 0 to 31, test both modes: without and with exceptions
+ for (unsigned b = 0; b < 32; b++) {
+ run_test(b, 0, TEST_SIZE, raw_values, encoded_data, decoded1,
decoded2);
+ run_test(b, 1, TEST_SIZE, raw_values, encoded_data, decoded1,
decoded2);
+ }
+ // For b == 32, test only the no-exception case
+ run_test(32, 0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+
+ free(raw_values);
+ free(encoded_data);
+ free(decoded1);
+ free(decoded2);
+
+ printf("测试完成!\n");
+}
+#endif
+int main() { // Runs both comparison suites when built with AVX2; otherwise does nothing
+#ifdef __AVX2__
+ test_p4nd1dec256v32(); // delta-1 decoders
+ testZigZag(); // zigzag decoders
+ //test_until_b1_achieved_improved();
+#endif
+ return 0; // always 0: the suites only print their results
+}
diff --git a/src/ext/for/vp4.h b/src/ext/for/vp4.h
index fae28df8d45..39460b614c4 100644
--- a/src/ext/for/vp4.h
+++ b/src/ext/for/vp4.h
@@ -99,6 +99,7 @@ size_t p4nd1dec32( unsigned char *__restrict in, size_t n,
uint32_t *__restri
size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t
*__restrict out);
size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t
*__restrict out);
size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t
*__restrict out);
+size_t p4nd1dec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t*
__restrict out); // scalar (non-SIMD) counterpart of p4nd1dec256v32
size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t
*__restrict out);
//Zigzag
size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t
*__restrict out);
@@ -107,6 +108,7 @@ size_t p4nzdec32( unsigned char *__restrict in, size_t
n, uint32_t *__restri
size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t
*__restrict out);
size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t
*__restrict out);
size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t
*__restrict out);
+size_t p4nzdec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t*
__restrict out); // scalar (non-SIMD) counterpart of p4nzdec256v32
size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t
*__restrict out);
//************** Low level API - n limited to 128/256
***************************************
diff --git a/src/ext/for/vp4d.c b/src/ext/for/vp4d.c
index a255fc2a2f0..e1bbacde3b6 100644
--- a/src/ext/for/vp4d.c
+++ b/src/ext/for/vp4d.c
@@ -253,6 +253,36 @@ extern char _shuffle_16[256][16];
#define BITUNPACK bitunpack256w
#define BITUNPACKD bitunpack256w
#define _BITUNPACKD _bitunpack256w
+#include "vp4d.c" /* self-include: instantiates the template body with the macro set defined just above */
+#define P4DELTA(a) ,a
+#define P4DELTA_(a) a
+#define DELTA
+/* Drop the names bound for the previous instantiation before rebinding them. */
+#undef _P4DEC
+#undef P4DEC
+#undef P4NDEC
+#undef BITUNPACKD
+#undef _BITUNPACKD
+/*
+ * Scalar (non-SIMD) delta-1 decoders for 256-value blocks: the template is
+ * bound to the *256scalarv names and to the bitd1unpack256scalarv /
+ * _bitd1unpack256scalarv unpackers, then instantiated by the self-include.
+ * NOTE(review): the final function names presumably get a "32" suffix from
+ * USIZE inside the template (e.g. p4nd1dec256scalarv32) - confirm there.
+ */
+
+#define _P4DEC _p4d1dec256scalarv
+#define P4DEC p4d1dec256scalarv
+#define P4NDEC p4nd1dec256scalarv
+#define P4NDECS p4d1dec
+#define BITUNPACK bitunpack256scalarv
+#define BITUNPACKD bitd1unpack256scalarv
+#define _BITUNPACKD _bitd1unpack256scalarv
+#define BITUNDD bitd1dec
+#include "vp4d.c"
+/*
+ * Scalar zigzag decoders: same pattern, bound to the bitzunpack256scalarv /
+ * _bitzunpack256scalarv unpackers. BITUNPACK is not rebound in this group.
+ * NOTE(review): no #undef precedes these defines, so this relies on the
+ * template include having undefined the previous bindings - confirm.
+ */
+#define _P4DEC _p4zdec256scalarv
+#define P4DEC p4zdec256scalarv
+#define P4NDEC p4nzdec256scalarv
+#define P4NDECS p4zdec
+#define BITUNPACKD bitzunpack256scalarv
+#define _BITUNPACKD _bitzunpack256scalarv
+#define BITUNDD bitzdec
+#define USIZE 32
#include "vp4d.c"
#endif
#undef DELTA
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 5392cc79188..4b12bb6f9c3 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -16,7 +16,7 @@ SOURCE_GROUP("search" ./search/*)
SOURCE_GROUP("search-spans" ./search/spans/*)
SOURCE_GROUP("store" ./store/*)
SOURCE_GROUP("util" ./util/*)
-
+# testPFOR.cpp includes "for/vp4.h", which lives under src/ext. Include paths
+# are not glob-expanded, so name the directory itself rather than "src/ext/*".
+INCLUDE_DIRECTORIES( ${clucene_SOURCE_DIR}/src/ext )
IF (BUILD_CONTRIBS_LIB)
SET(test_contribs_lib_files ./contribs-lib/analysis/testChinese.cpp)
SET(EXTRA_LIBS ${EXTRA_LIBS} clucene-contribs-lib)
@@ -106,6 +106,7 @@ SET(test_files ./tests.cpp
./util/TestStrConvert.cpp
./query/TestMultiPhraseQuery.cpp
./store/TestUTF8Chars.cpp
+ ./store/testPFOR.cpp
${test_HEADERS})
IF (USE_SHARED_OBJECT_FILES)
GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat
new file mode 100644
index 00000000000..e823aef9968
Binary files /dev/null and
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat differ
diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat
new file mode 100644
index 00000000000..a27177e3090
Binary files /dev/null and
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat differ
diff --git a/src/test/store/testPFOR.cpp b/src/test/store/testPFOR.cpp
new file mode 100644
index 00000000000..a8ddbc871b5
--- /dev/null
+++ b/src/test/store/testPFOR.cpp
@@ -0,0 +1,546 @@
+#include "test.h"
+
+#include <memory.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "CLucene/index/CodeMode.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/IndexInput.h"
+#include "CLucene/store/IndexOutput.h"
+#include "CLucene/util/PFORUtil.h"
+#include "CuTest.h"
+#include "for/vp4.h"
+
+using namespace lucene::store;
+// Helper macro (evaluates `condition` twice) that prints "Assertion failed: <message>" to stdout before CuAssertTrue when
assertions fail
+#define CuAssertTrueWithMessage(tc, message, condition) \
+ do { \
+ if (!(condition)) { \
+ printf("Assertion failed: %s\n", message); \
+ } \
+ CuAssertTrue(tc, condition); \
+ } while (0)
+/*
+ * Lookup table of 10^k for k = 0..10. The element type is 64-bit because
+ * the last entry, 10^10, does not fit in 32 bits.
+ */
+static const uint64_t g_pow10[] = {
+ 1ULL, // 10^0
+ 10ULL, // 10^1
+ 100ULL, // 10^2
+ 1000ULL, // 10^3
+ 10000ULL, // 10^4
+ 100000ULL, // 10^5
+ 1000000ULL, // 10^6
+ 10000000ULL, // 10^7
+ 100000000ULL, // 10^8
+ 1000000000ULL, // 10^9
+ 10000000000ULL // 10^10
+};
+
+// Returns 10^(floor(b/3)) from g_pow10; indices past the table end yield the last entry, 10^10
+static inline uint64_t get_pow10_for_b(unsigned b) {
+ // Table index: floor(b/3)
+ unsigned idx = b / 3;
+ if (idx >= sizeof(g_pow10) / sizeof(g_pow10[0])) {
+ // Past the largest precomputed power: clamp to 10^10
+ return g_pow10[10];
+ }
+ return g_pow10[idx];
+}
+
+// Returns 2^b as an unsigned value
+static inline unsigned power2(unsigned b) {
+ // NOTE: (1U << b) is not well defined once b reaches the width of unsigned (e.g. b=32); not guarded here
+ return (1U << b);
+}
+
+/**
+ * Fills values[0..n) with pseudo-random test data (uses rand()) whose
+ * magnitude is scaled by the requested bit width.
+ *
+ * with_exception == 0: a strictly increasing sequence starting at a random
+ * base below 1000, with every step in [1, inc_range].
+ * with_exception != 0: independent random values in [0, val_range).
+ *
+ * @param values output array
+ * @param n number of values to generate (nothing is written when 0)
+ * @param b nominal bit width used to pick the value range
+ * @param with_exception 0 = no exceptions, 1 = with exceptions
+ */
+void generate_raw_data_for_bitwidth(unsigned* values, unsigned n, unsigned b,
int with_exception) {
+ if (n == 0) return;
+
+ if (!with_exception) {
+ // =====================================
+ // No-exception mode: increasing sequence
+ // =====================================
+ // 1) Start from a random base value below 1000
+ unsigned base = rand() % 1000;
+ values[0] = base;
+
+ // 2) Pick the maximum increment according to b
+ unsigned inc_range;
+ if (b < 4) {
+ // b=0 => 2^0=1, b=1 =>2, b=2 =>4, b=3=>8
+ inc_range = power2(b);
+ } else {
+ // b>=4 => use 10^(floor(b/3)) => 10,100,1000,...
+ uint64_t r = get_pow10_for_b(b);
+ // r can exceed the unsigned range (10^10 for large b),
+ // so clamp it before narrowing to unsigned
+ if (r > 0xFFFFFFFFULL) {
+ r = 0xFFFFFFFFULL; // avoid overflow
+ }
+ inc_range = (unsigned)r;
+ }
+
+ // 3) Generate the increasing sequence
+ for (unsigned i = 1; i < n; i++) {
+ // The +1 avoids a zero increment
+ unsigned inc = 1 + rand() % inc_range;
+ base += inc;
+ values[i] = base;
+ }
+
+ } else {
+ // =====================================
+ // Exception mode: plain random values
+ // =====================================
+
+ // Ranges used, by example:
+ // - b=0 => rand()%2
+ // - b=2 => rand()%4
+ // - b=3 => rand()%10
+ // - b=7 => rand()%100
+ // - b=10 => rand()%1000
+ // - b=13 => rand()%10000
+ // => rule: for b >= 3 use 10^(floor(b/3)); b < 3 is special-cased
+
+ uint64_t val_range = 0; // held in 64 bits, narrowed to unsigned at the point of use
+
+ if (b == 0) {
+ val_range = 2; // 0..1
+ } else if (b == 1) {
+ // No dedicated example for b=1; treated the same as b=0 => range=2
+ val_range = 2; // 0..1
+ } else if (b == 2) {
+ val_range = 4; // 0..3
+ } else {
+ // b>=3 => use 10^(floor(b/3))
+ val_range = get_pow10_for_b(b);
+ // Clamp to the unsigned range here as well
+ if (val_range > 0xFFFFFFFFULL) {
+ val_range = 0xFFFFFFFFULL;
+ }
+ }
+
+ // Independent random values
+ for (unsigned i = 0; i < n; i++) {
+ unsigned x = (unsigned)(rand() % (unsigned)val_range);
+ values[i] = x;
+ }
+ }
+}
+
+void test_pfor_has_prox(CuTest* tc) {  // round-trips data through pfor_encode / pfor_decode with the flag (presumably has_prox) set to true
+ const unsigned TEST_SIZE = 512;
+ const char* testFileName = "pfor.dat";
+
+ // Allocate buffers; freqBuffer stays zero-filled and encoded_data is never used below
+ std::vector<unsigned> docDeltaBuffer(TEST_SIZE);
+ std::vector<unsigned> freqBuffer(TEST_SIZE);
+ std::vector<unsigned> encoded_data(TEST_SIZE * 2);
+ std::vector<unsigned> decoded1(TEST_SIZE);
+ std::vector<unsigned> decoded2(TEST_SIZE);
+
+ srand((unsigned)time(NULL));
+ printf("开始测试 p4nd1dec256v32...\n");
+
+ {
+ generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32,
0);
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+
+ auto* output = dir->createOutput(testFileName);
+
+ lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, true);
+ output->close();
+ _CLDELETE(output);
+ dir->close();
+ _CLDELETE(dir);
+ }
+ {
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ bool result = dir->openInput(testFileName, input, error);  // NOTE(review): result is never checked before input is used
+ lucene::util::pfor_decode(input, decoded1, decoded2, true, false);
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"),
docDeltaBuffer[i],
+ decoded1[i]);
+ CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"),
freqBuffer[i],
+ decoded2[i]);
+ }
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+ printf("测试完成!\n");
+}
+
+void test_pfor_no_prox(CuTest* tc) {  // same round trip as test_pfor_has_prox, but the flag (presumably has_prox) is false
+ const unsigned TEST_SIZE = 512;
+ const char* testFileName = "pfor.dat";
+
+ // Allocate buffers; freqBuffer stays zero-filled and encoded_data is never used below
+ std::vector<unsigned> docDeltaBuffer(TEST_SIZE);
+ std::vector<unsigned> freqBuffer(TEST_SIZE);
+ std::vector<unsigned> encoded_data(TEST_SIZE * 2);
+ std::vector<unsigned> decoded1(TEST_SIZE);
+ std::vector<unsigned> decoded2(TEST_SIZE);
+
+ srand((unsigned)time(NULL));
+ printf("开始测试 p4nd1dec256v32...\n");
+
+ {
+ generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32,
0);
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+
+ auto* output = dir->createOutput(testFileName);
+
+ lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, false);
+ output->close();
+ _CLDELETE(output);
+ dir->close();
+ _CLDELETE(dir);
+ }
+ {
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ bool result = dir->openInput(testFileName, input, error);  // NOTE(review): result is never checked before input is used
+ lucene::util::pfor_decode(input, decoded1, decoded2, false, false);
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"),
docDeltaBuffer[i],
+ decoded1[i]);
+ CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"),
freqBuffer[i],
+ decoded2[i]);
+ }
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+ printf("测试完成!\n");
+}
+
+// Writes data encoded directly with P4ENC / P4NZENC and checks that pfor_decode reads it back unchanged
+void test_p4dec_p4enc_compat(CuTest* tc) {
+ const unsigned TEST_SIZE = 512;
+ const char* testFileName = "pfor_p4enc.dat";
+
+ // Allocate buffers
+ std::vector<uint32_t> originalData(TEST_SIZE);
+ std::vector<uint32_t> decodedData(TEST_SIZE);
+ std::vector<uint32_t> freqs(TEST_SIZE);
+ std::vector<uint32_t> decodedFreqs(TEST_SIZE);
+
+ srand((unsigned)time(NULL));
+ printf("Testing P4ENC and pfor_decode compatibility...\n");
+
+ // Generate test data: docs use the no-exception generator mode, freqs use the random (exception) mode
+ generate_raw_data_for_bitwidth(originalData.data(), TEST_SIZE, 32, 0);
+ generate_raw_data_for_bitwidth(freqs.data(), TEST_SIZE, 32, 1);
+
+ auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool
isDoc) {
+ std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE);
+ size_t size = 0;
+ if (isDoc) {
+ size = P4ENC(buffer.data(), buffer.size(), compress.data());
+ } else {
+ size = P4NZENC(buffer.data(), buffer.size(), compress.data());
+ }
+ out->writeVInt(size);  // the encoded byte count is written before the payload
+ out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()),
size);
+ };
+ // Step 1: encode the data with P4ENC / P4NZENC and write it to the file
+ {
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ auto* output = dir->createOutput(testFileName);
+
+ // Write the code mode byte and the element count
+ output->writeByte((char)lucene::index::CodeMode::kPfor);
+ output->writeVInt(TEST_SIZE);
+
+ // Encode and write the payloads (docs first, then freqs)
+ encode(output, originalData, true);
+ encode(output, freqs, false);
+
+ output->close();
+ _CLDELETE(output);
+ dir->close();
+ _CLDELETE(dir);
+ }
+
+ // Step 2: decode the file with pfor_decode
+ {
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ bool result = dir->openInput(testFileName, input, error);  // NOTE(review): result is never checked
+
+ // Decode with pfor_decode; the last two flags passed below are (true, false)
+ uint32_t decoded_size =
+ lucene::util::pfor_decode(input, decodedData, decodedFreqs,
true, false);
+
+ // Verify the decoded element count
+ CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE,
decoded_size);
+
+ // Verify the decoded data matches the original data
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ //printf("freqs[%zu] = %u, decodedFreqs[%zu] = %u\n", i, freqs[i],
i, decodedFreqs[i]);
+ //printf("originalData[%zu] = %u, decodedData[%zu] = %u\n", i,
originalData[i], i, decodedData[i]);
+ CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original",
+ originalData[i] == decodedData[i]);
+ CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original",
+ freqs[i] == decodedFreqs[i]);
+ }
+
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+
+ printf("P4ENC/pfor_decode compatibility test completed successfully!\n");
+}
+
+// Round-trips one array through lucene::util::P4ENC and lucene::util::P4DEC via a file on disk
+void test_cross_platform_compat(CuTest* tc) {
+ const unsigned TEST_SIZE = 512;
+ const char* testFileName = "pfor_cross_platform.dat";
+
+ // Allocate buffers
+ std::vector<uint32_t> originalData(TEST_SIZE);
+ std::vector<uint32_t> decodedData(TEST_SIZE);
+
+ srand((unsigned)time(NULL));
+ printf("Testing cross-platform compatibility...\n");
+
+ // Generate test data: every 10th value is drawn from a larger range than the rest
+ for (unsigned i = 0; i < TEST_SIZE; i++) {
+ // Mix of small and large values to test different bit widths
+ if (i % 10 == 0) {
+ originalData[i] = rand() % 1000000; // Occasional large value
+ } else {
+ originalData[i] = rand() % 100; // Mostly small values
+ }
+ }
+
+ // Part 1: Write encoded data to file using PFOR encoding
+ {
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ auto* output = dir->createOutput(testFileName);
+
+ // Write encoding mode and size
+ output->writeByte((char)lucene::index::CodeMode::kPfor);
+ output->writeVInt(TEST_SIZE);
+
+ // Encode and write the data
+ std::vector<uint8_t> compress(4 * TEST_SIZE + PFOR_BLOCK_SIZE);
+ size_t size = lucene::util::P4ENC(originalData.data(), TEST_SIZE,
compress.data());
+ output->writeVInt(size);
+ output->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()),
size);
+
+ output->close();
+ _CLDELETE(output);
+ dir->close();
+ _CLDELETE(dir);
+ }
+
+ // Part 2: Read the encoded bytes back and decode them with P4DEC directly; this block never passes
compatibleRead=true
+ {
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir = lucene::store::FSDirectory::getDirectory("./");
+ bool result = dir->openInput(testFileName, input, error);  // NOTE(review): result is never checked
+
+ // Read the header back in the order it was written (mode byte, then element count)
+ char mode = input->readByte();  // read to advance the stream; the value is not checked
+ uint32_t arraySize = input->readVInt();
+ CuAssertIntEquals(tc, _T("Array size mismatch"), TEST_SIZE, arraySize);
+
+ // Read, decode and verify
+ uint32_t SerializedSize = input->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ input->readBytes(buf.data(), SerializedSize);
+
+ // Use P4DEC for decoding, simulating cross-platform read
+ lucene::util::P4DEC(buf.data(), arraySize, decodedData.data());
+
+ // Verify decoded data matches original
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ CuAssertIntEquals(tc, _T("Cross-platform decoded data mismatch at
%zu"),
+ originalData[i], decodedData[i]);
+ }
+
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+
+ printf("Cross-platform compatibility test completed successfully!\n");
+}
+
+// Checks that fixture files produced by the old version still decode correctly, e.g. ARM data read by an
x86 new version with compatible mode
+void test_p4ndx_compatibility(CuTest* tc) {
+ const unsigned TEST_SIZE = 512;
+ const char* testFileName1 =
"pfor_p4ndx_compat_gen_by_old_version_x86_64.dat";
+ const char* testFileName2 = "pfor_p4ndx_compat_gen_by_old_version_arm.dat";
+
+ // Expected values: the decoded fixture contents are compared against these two arrays
+ std::vector<uint32_t> docDeltaBuffer = {
+ 635, 1188, 1795, 2109, 2694, 3612, 3714, 4511,
5072, 5352, 5526,
+ 5894, 6706, 6891, 6979, 7080, 7586, 7789, 8530,
9065, 9704, 9949,
+ 10377, 10678, 11516, 11921, 12226, 13133, 13417, 13854,
14215, 14486, 15476,
+ 16444, 17380, 18306, 19191, 19580, 20302, 21099, 21119,
22014, 22178, 22361,
+ 22440, 23043, 23326, 24262, 25067, 25443, 26265, 27061,
27681, 27931, 28027,
+ 28837, 28843, 29595, 29663, 29953, 30494, 30922, 31834,
32364, 33111, 33311,
+ 33766, 34749, 35689, 36217, 36348, 36660, 37083, 37378,
37872, 38725, 39622,
+ 40399, 40540, 40594, 41098, 42060, 42909, 43032, 43243,
43539, 43823, 44040,
+ 44439, 44790, 45648, 46587, 46718, 46839, 47307, 47536,
48208, 48482, 49046,
+ 49658, 50460, 51154, 51429, 52005, 52993, 53761, 54541,
54778, 55674, 56594,
+ 57236, 57635, 58517, 59007, 59881, 60325, 60462, 60971,
60983, 61518, 62378,
+ 63247, 64073, 64415, 64757, 65050, 65620, 65633, 66552,
66685, 67661, 67733,
+ 67912, 68514, 69161, 69327, 69697, 70475, 71229, 71846,
71896, 72291, 72659,
+ 72942, 73178, 73419, 74145, 74517, 75266, 75356, 75615,
76575, 77533, 77617,
+ 77918, 78569, 79297, 79520, 79536, 80534, 81241, 81584,
81653, 82538, 83483,
+ 83550, 83601, 84267, 85112, 85268, 85550, 86444, 87347,
87996, 88172, 88310,
+ 88551, 88804, 89666, 90008, 90350, 90822, 91475, 92127,
93034, 93340, 93994,
+ 94628, 95156, 95825, 96457, 96691, 96703, 96755, 96874,
97182, 97301, 97822,
+ 98795, 99758, 100434, 101040, 101248, 101826, 102081, 102816,
102884, 103731, 104070,
+ 104999, 105539, 106220, 106972, 107165, 107849, 108507, 109005,
109342, 109633, 109658,
+ 110016, 110290, 110900, 111621, 111947, 112675, 112703, 113499,
114099, 114451, 115209,
+ 115837, 116794, 117111, 117668, 118231, 118634, 119258, 119668,
120409, 121313, 122262,
+ 123035, 123690, 124183, 124991, 125303, 126293, 126790, 127745,
128111, 128965, 129545,
+ 129873, 130447, 130704, 130759, 131712, 131764, 131771, 132075,
132236, 132870, 133482,
+ 134311, 134501, 134676, 134907, 135073, 136009, 136333, 136402,
137286, 137734, 137810,
+ 138539, 138795, 139534, 139604, 140356, 140401, 141189, 142146,
142771, 142886, 143416,
+ 143649, 144170, 145004, 145289, 145816, 146305, 146750, 146910,
147010, 147636, 147986,
+ 148612, 149468, 150335, 150896, 151427, 152362, 153159, 154138,
154500, 155025, 155259,
+ 155360, 155954, 156291, 156436, 157169, 157462, 157583, 158430,
158604, 158958, 159326,
+ 159333, 159971, 160865, 161712, 162146, 162552, 162850, 162909,
163016, 163940, 164207,
+ 165180, 165664, 166461, 167368, 167648, 168423, 168692, 168848,
169208, 169929, 170679,
+ 171375, 172240, 172722, 173710, 174696, 175377, 175890, 176581,
176629, 177148, 177476,
+ 177769, 178486, 178599, 179297, 179312, 179836, 180640, 180930,
181072, 181848, 182621,
+ 183559, 183594, 183999, 184064, 184719, 185279, 186055, 186430,
187091, 187915, 188154,
+ 188649, 188812, 189388, 189563, 190239, 190505, 190727, 190921,
191866, 192732, 192995,
+ 193405, 193969, 194246, 195179, 195546, 196464, 196890, 197385,
198075, 198790, 199319,
+ 199765, 199896, 200079, 200085, 200992, 201549, 202215, 202945,
203092, 203252, 204144,
+ 204219, 204905, 205472, 205812, 206071, 206184, 206821, 206946,
207673, 207719, 208407,
+ 208762, 209092, 209498, 209770, 209877, 210129, 210442, 211263,
212043, 212802, 213754,
+ 214068, 214832, 215690, 215912, 216693, 217632, 218001, 218942,
219124, 219567, 220545,
+ 220646, 220780, 221017, 221582, 222352, 223065, 223356, 223523,
224275, 224920, 225768,
+ 225925, 226841, 227795, 228556, 228784, 229559, 230099, 231085,
231163, 231369, 231470,
+ 231757, 232184, 233066, 233291, 234086, 234260, 235018, 235607,
235758, 236616, 237339,
+ 238078, 238500, 239344, 239795, 239859, 240222, 240424, 241132,
241694, 242405, 243380,
+ 243896, 244367, 244922, 245564, 245926, 246818, 247537, 248104,
249097, 249102, 249448,
+ 249674, 250255, 250395, 250794, 251132, 251213, 252114, 252662,
252817, 253457, 253778,
+ 254128, 254570, 254955, 255667, 256311, 256755};
+ std::vector<uint32_t> freqBuffer = {
+ 73, 5, 18, 40, 27, 24, 33, 88, 15, 51, 7, 59, 7, 4, 84, 39,
43, 34, 28, 75, 35, 75,
+ 29, 26, 48, 79, 67, 32, 42, 10, 75, 67, 67, 45, 7, 94, 21, 40,
35, 37, 43, 94, 48, 2,
+ 98, 85, 41, 93, 19, 22, 69, 54, 49, 50, 32, 97, 81, 0, 29, 24,
62, 57, 91, 30, 54, 51,
+ 76, 76, 91, 63, 65, 87, 57, 13, 89, 7, 98, 31, 1, 70, 5, 22,
76, 54, 24, 9, 52, 6,
+ 9, 33, 82, 71, 4, 2, 25, 53, 97, 76, 30, 25, 20, 93, 90, 7, 3,
55, 96, 10, 6, 79,
+ 63, 76, 84, 85, 52, 39, 10, 13, 91, 68, 22, 76, 50, 46, 19, 75,
99, 16, 4, 81, 41, 24,
+ 27, 83, 31, 30, 38, 27, 92, 44, 59, 56, 20, 43, 93, 25, 82, 55,
38, 25, 23, 13, 2, 25,
+ 59, 21, 53, 10, 89, 57, 44, 82, 81, 71, 17, 64, 53, 55, 43, 97, 0,
2, 53, 72, 46, 99,
+ 49, 28, 54, 40, 6, 30, 53, 8, 5, 5, 64, 81, 60, 74, 22, 17,
18, 4, 50, 41, 21, 14,
+ 94, 28, 58, 92, 80, 60, 97, 5, 58, 96, 54, 87, 3, 46, 45, 33,
99, 53, 40, 15, 86, 1,
+ 90, 8, 18, 60, 64, 21, 54, 37, 87, 48, 65, 45, 92, 98, 58, 42, 3,
16, 90, 9, 55, 93,
+ 56, 0, 26, 7, 5, 67, 23, 91, 20, 65, 99, 38, 77, 15, 11, 31,
52, 99, 32, 18, 96, 76,
+ 68, 54, 18, 71, 23, 9, 32, 78, 2, 88, 31, 81, 48, 88, 0, 23,
80, 20, 40, 31, 10, 17,
+ 47, 22, 49, 99, 21, 81, 69, 17, 57, 37, 24, 28, 60, 47, 37, 93,
77, 91, 33, 8, 72, 33,
+ 97, 72, 56, 29, 92, 96, 60, 55, 14, 59, 77, 15, 11, 98, 48, 32,
67, 57, 70, 91, 85, 82,
+ 90, 74, 75, 68, 66, 61, 28, 90, 94, 77, 15, 3, 6, 59, 99, 19,
14, 65, 30, 91, 32, 41,
+ 41, 80, 74, 9, 38, 96, 52, 75, 78, 43, 2, 6, 63, 68, 19, 91,
10, 13, 69, 25, 16, 27,
+ 85, 68, 98, 99, 33, 81, 91, 66, 74, 84, 98, 48, 93, 88, 96, 98,
16, 27, 93, 18, 85, 56,
+ 38, 4, 47, 48, 69, 68, 74, 38, 48, 11, 6, 98, 10, 91, 31, 53, 9,
6, 38, 60, 54, 83,
+ 48, 3, 33, 64, 30, 26, 34, 67, 82, 72, 71, 82, 21, 92, 2, 47,
30, 50, 58, 88, 1, 20,
+ 32, 32, 74, 93, 38, 64, 53, 45, 99, 54, 48, 33, 70, 30, 59, 5,
97, 94, 29, 20, 76, 50,
+ 12, 78, 49, 95, 81, 7, 83, 34, 80, 67, 18, 6, 13, 57, 70, 18,
54, 69, 72, 54, 2, 95,
+ 36, 14, 52, 33, 8, 81, 5, 36, 84, 17, 14, 33, 12, 47, 93, 48,
81, 25, 67, 52, 31, 80,
+ 9, 1, 99, 15, 22, 23, 69, 25};
+ std::vector<uint32_t> decodedDocs(TEST_SIZE);
+ std::vector<uint32_t> decodedFreqs(TEST_SIZE);
+
+ srand((unsigned)time(NULL));  // NOTE(review): rand() is not called anywhere in this test
+ printf("Testing pfor_decode compatibility...\n");
+
+#if defined(__AVX2__)
+ // AVX2 build: decode the ARM-generated fixture (testFileName2) using pfor_decode in compatible mode
(compatibleRead=true)
+ {
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir =
lucene::store::FSDirectory::getDirectory(clucene_data_location);
+ bool result = dir->openInput(testFileName2, input, error);  // NOTE(review): result is never checked
+
+ // Use pfor_decode with compatibleRead=true
+ uint32_t decoded_size =
+ lucene::util::pfor_decode(input, decodedDocs, decodedFreqs,
true, true);
+
+ // Verify decoded size
+ CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE,
decoded_size);
+
+ // Verify decoded data matches the expected arrays above
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original",
+ docDeltaBuffer[i] == decodedDocs[i]);
+ CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original",
+ freqBuffer[i] == decodedFreqs[i]);
+ }
+
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+#elif defined(__ARM_NEON) || defined(__SSSE3__)
+ {  // NEON / SSSE3 build (no AVX2): decode the x86_64-generated fixture (testFileName1) instead
+ IndexInput* input = nullptr;
+ CLuceneError error;
+ auto* dir =
lucene::store::FSDirectory::getDirectory(clucene_data_location);
+ bool result = dir->openInput(testFileName1, input, error);  // NOTE(review): result is never checked
+
+ // Use pfor_decode with compatibleRead=true
+ uint32_t decoded_size =
+ lucene::util::pfor_decode(input, decodedDocs, decodedFreqs,
true, true);
+
+ // Verify decoded size
+ CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE,
decoded_size);
+
+ // Verify decoded data matches the expected arrays above
+ for (size_t i = 0; i < TEST_SIZE; i++) {
+ CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original",
+ docDeltaBuffer[i] == decodedDocs[i]);
+ CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original",
+ freqBuffer[i] == decodedFreqs[i]);
+ }
+
+ input->close();
+ _CLDELETE(input);
+ dir->close();
+ _CLDELETE(dir);
+ }
+#endif  // when none of the macros above is defined, nothing is decoded or asserted
+ printf("compatibility test completed successfully!\n");
+}
+CuSuite* testPFORSuite() {  // builds and returns the CuTest suite holding the five PFOR tests of this file
+ CuSuite* suite = CuSuiteNew(_T("PFOR Test Suite"));
+
+ SUITE_ADD_TEST(suite, test_pfor_has_prox);
+ SUITE_ADD_TEST(suite, test_pfor_no_prox);
+ SUITE_ADD_TEST(suite, test_p4dec_p4enc_compat);
+ SUITE_ADD_TEST(suite, test_cross_platform_compat);
+ SUITE_ADD_TEST(suite, test_p4ndx_compatibility);
+
+ return suite;
+}
\ No newline at end of file
diff --git a/src/test/test.h b/src/test/test.h
index 19f37e81243..7f9bd908ee8 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -86,7 +86,7 @@ CuSuite *testMultiPhraseQuery(void);
CuSuite *testIndexCompaction(void);
CuSuite *testStringReader(void);
CuSuite *testUTF8CharsSuite(void);
-
+CuSuite *testPFORSuite(void);
#ifdef TEST_CONTRIB_LIBS
//CuSuite *testGermanAnalyzer(void);
CuSuite *testchinese(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 7cd9f657385..e0ee6055f32 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -20,7 +20,8 @@ unittest tests[] = {
{"IndexCompaction", testIndexCompaction},
{"testStringReader", testStringReader},
{"TestUTF8Chars", testUTF8CharsSuite},
+ {"testPFOR", testPFORSuite},
#ifdef TEST_CONTRIB_LIBS
- {"chinese", testchinese},
+ //{"chinese", testchinese},
#endif
{"LastTest", NULL}};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]