This is an automated email from the ASF dual-hosted git repository.
jiangtian pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/tsfile.git
The following commit(s) were added to refs/heads/develop by this push:
new 74aa61f8 Import Lzokay and Implement it's compressor, while lz4 was
moved into third_party. (#266)
74aa61f8 is described below
commit 74aa61f8496a5c8d70239a35294011843956b722
Author: Yukim1 <[email protected]>
AuthorDate: Fri Oct 18 14:22:17 2024 +0800
Import Lzokay and Implement it's compressor, while lz4 was moved into
third_party. (#266)
* Import https://github.com/google/snappy third-party library, implementing
the Snappy compression method.
* Exclude rat check for third_party
* Import snappy as source code instead of submodule
* turn on -fpic
* Remove some unnecessary files from third_party/google_snappy
* Fix problems mentioned in review
* Fix the mistake of error code
* Fix the error code in lz4 and improve compress test
* Remove launch.json and wrong static
* Remove unnecessary file
* Optimized the size of Snappy memory allocation
* Revert "Optimized the size of Snappy memory allocation"
This reverts commit 9a28e43b44b316e0baefcdb475b81d85d3dafe06.
* Import lzokay and Implement it's compressor, while lz4 is moved into
third_party.
* Fix for ubuntu
* Add some infomations of third_party lib in License and remove debug log.
---------
Co-authored-by: root <root@LAPTOP-FT8O6IRD>
---
LICENSE | 41 ++
cpp/CMakeLists.txt | 6 +-
cpp/src/CMakeLists.txt | 13 +-
cpp/src/compress/compressor_factory.h | 3 +-
cpp/src/compress/lzo_compressor.cc | 127 ++++++
cpp/src/compress/lzo_compressor.h | 62 +++
cpp/test/compress/lzo_compressor_test.cc | 129 ++++++
cpp/third_party/CMakeLists.txt | 2 +
cpp/third_party/lz4/CMakeLists.txt | 3 +
cpp/{src/compress => third_party/lz4}/lz4.c | 0
cpp/{src/compress => third_party/lz4}/lz4.h | 0
cpp/third_party/lzokay/CMakeLists.txt | 3 +
cpp/third_party/lzokay/LICENSE | 22 +
cpp/third_party/lzokay/lzokay.cpp | 647 ++++++++++++++++++++++++++++
cpp/third_party/lzokay/lzokay.hpp | 79 ++++
15 files changed, 1129 insertions(+), 8 deletions(-)
diff --git a/LICENSE b/LICENSE
index b37af99e..26968535 100644
--- a/LICENSE
+++ b/LICENSE
@@ -261,3 +261,44 @@ The following files include code modified from Eclipse
Collections project.
Copyright: 2021 Goldman Sachs
Project page: https://www.eclipse.org/collections
License:
https://github.com/eclipse/eclipse-collections/blob/master/LICENSE-EDL-1.0.txt
+
+--------------------------------------------------------------------------------
+
+The following files include code modified from Snappy project.
+
+./tsfile/cpp/third_party/google_snappy/snappy.h
+./tsfile/cpp/third_party/google_snappy/snappy.cc
+./tsfile/cpp/third_party/google_snappy/snappy-internal.h
+./tsfile/cpp/third_party/google_snappy/snappy-sinksource.cc
+./tsfile/cpp/third_party/google_snappy/snappy-sinksource.h
+./tsfile/cpp/third_party/google_snappy/snappy-stubs-internal.cc
+./tsfile/cpp/third_party/google_snappy/snappy-stubs-internal.h
+./tsfile/cpp/third_party/google_snappy/snappy-stubs-public.h.in
+./tsfile/cpp/third_party/google_snappy/CMakeLists.txt
+
+Copyright: 2011, Google Inc.
+Project page: https://github.com/google/snappy
+License: https://github.com/google/snappy/blob/main/COPYING
+
+--------------------------------------------------------------------------------
+
+The following files include code is copied from lz4 project.
+
+./tsfile/cpp/third_party/lz4/lz4.c
+./tsfile/cpp/third_party/lz4/lz4.h
+
+Copyright: (C) 2011-2020, Yann Collet.
+Project page: http://www.lz4.org
+License: https://github.com/lz4/lz4/blob/dev/LICENSE
+
+--------------------------------------------------------------------------------
+
+The following files include code is copied from lzokay project.
+
+./tsfile/cpp/third_party/lzokay/lzokay.cpp
+./tsfile/cpp/third_party/lzokay/lzokay.hpp
+./tsfile/cpp/third_party/lzokay/LICENSE
+
+Copyright: (c) 2018 Jack Andersen
+Project page: https://github.com/AxioDL/lzokay
+License: https://github.com/AxioDL/lzokay/blob/master/LICENSE
\ No newline at end of file
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index dabc4db7..dcf8f6e3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -68,7 +68,11 @@ endif()
message("CMAKE DEBUG: CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
-set(PROJECT_INCLUDE_DIR ${PROJECT_INCLUDE_DIR} ${PROJECT_SOURCE_DIR}/src)
+set(PROJECT_INCLUDE_DIR ${PROJECT_INCLUDE_DIR}
+ ${PROJECT_SOURCE_DIR}/src
+ ${PROJECT_SOURCE_DIR}/third_party/lz4
+ ${PROJECT_SOURCE_DIR}/third_party/lzokay
+)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
include_directories(${PROJECT_INCLUDE_DIR})
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 31898991..dd024a5d 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -47,13 +47,14 @@ set_target_properties(tsfile PROPERTIES SOVERSION
${LIBTSFILE_SO_VERSION})
set(LIBTSFILE_SDK_DIR ${LIBRARY_OUTPUT_PATH})
install(TARGETS tsfile LIBRARY DESTINATION ${LIBTSFILE_SDK_DIR})
-set(SNAPPY_INCLUDE_DIR "/third_party/snappy")
-include_directories(${SNAPPY_INCLUDE_DIR})
set(SNAPPY_LIB_NAME "snappy")
-target_link_libraries(compress_obj ${SNAPPY_LIB_NAME})
-target_link_libraries(common_obj ${SNAPPY_LIB_NAME})
-target_link_libraries(read_obj ${SNAPPY_LIB_NAME})
-target_link_libraries(write_obj ${SNAPPY_LIB_NAME})
+set(LZ4_LIB_NAME "LZ4")
+set(LZO_LIB_NAME "lzokay")
+
+target_link_libraries(compress_obj ${SNAPPY_LIB_NAME} ${LZ4_LIB_NAME}
${LZO_LIB_NAME})
+target_link_libraries(common_obj ${SNAPPY_LIB_NAME} ${LZ4_LIB_NAME}
${LZO_LIB_NAME})
+target_link_libraries(read_obj ${SNAPPY_LIB_NAME} ${LZ4_LIB_NAME}
${LZO_LIB_NAME})
+target_link_libraries(write_obj ${SNAPPY_LIB_NAME} ${LZ4_LIB_NAME}
${LZO_LIB_NAME})
# set(CMAKE_PREFIX_PATH ../../third-party/zlib-1.2.13/install/lib)
# set(ZLIB_LIB_DIR ../../third-party/zlib-1.2.13/install/lib)
diff --git a/cpp/src/compress/compressor_factory.h
b/cpp/src/compress/compressor_factory.h
index 13c50d4f..b467fe93 100644
--- a/cpp/src/compress/compressor_factory.h
+++ b/cpp/src/compress/compressor_factory.h
@@ -22,6 +22,7 @@
#include "gzip_compressor.h"
#include "lz4_compressor.h"
+#include "lzo_compressor.h"
#include "snappy_compressor.h"
#include "uncompressed_compressor.h"
@@ -50,7 +51,7 @@ class CompressorFactory {
// ALLOC_AND_RETURN_COMPRESSPR(GZIPCompressor);
return nullptr;
} else if (type == common::LZO) {
- return nullptr;
+ ALLOC_AND_RETURN_COMPRESSPR(LZOCompressor);
} else if (type == common::SDT) {
return nullptr;
} else if (type == common::PAA) {
diff --git a/cpp/src/compress/lzo_compressor.cc
b/cpp/src/compress/lzo_compressor.cc
new file mode 100644
index 00000000..0ccde9f6
--- /dev/null
+++ b/cpp/src/compress/lzo_compressor.cc
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "lzo_compressor.h"
+
+#include "common/allocator/alloc_base.h"
+
+using namespace common;
+
+namespace storage {
+
+void LZOCompressor::destroy() {
+ if (compressed_buf_ != nullptr) {
+ mem_free(compressed_buf_);
+ compressed_buf_ = nullptr;
+ }
+
+ if (uncompressed_buf_ != nullptr) {
+ mem_free(uncompressed_buf_);
+ uncompressed_buf_ = nullptr;
+ }
+}
+
+int LZOCompressor::reset(bool for_compress) {
+ // Nothing to do
+ return E_OK;
+}
+
+int LZOCompressor::compress(char *uncompressed_buf,
+ uint32_t uncompressed_buf_len,
+ char *&compressed_buf,
+ uint32_t &compressed_buf_len) {
+ int ret = E_OK;
+ size_t max_dst_size = lzokay::compress_worst_size(uncompressed_buf_len);
+ compressed_buf = (char *)mem_alloc(max_dst_size, MOD_COMPRESSOR_OBJ);
+ if (compressed_buf == nullptr) {
+ ret = E_OOM;
+ } else {
+ size_t compressed_len = 0;
+ uint8_t *srcUint8 = reinterpret_cast<uint8_t *>(uncompressed_buf);
+ uint8_t *dstUint8 = reinterpret_cast<uint8_t *>(compressed_buf);
+ lzokay::EResult compress_result =
+ lzokay::compress(srcUint8, uncompressed_buf_len, dstUint8,
+ max_dst_size, compressed_len);
+ if (compress_result == lzokay::EResult::Success) {
+ char *compress_data = (char *)mem_realloc(dstUint8,
compressed_len);
+ if (compress_data == nullptr) {
+ ret = E_OOM;
+ } else {
+ compressed_buf = compress_data;
+ compressed_buf_ = compress_data;
+ compressed_buf_len = compressed_len;
+ }
+ } else {
+ ret = E_COMPRESS_ERR;
+ }
+ }
+ return ret;
+}
+
+void LZOCompressor::after_compress(char *compressed_buf) {
+ if (compressed_buf != nullptr) {
+ mem_free(compressed_buf);
+ }
+}
+
+int LZOCompressor::uncompress(char *compressed_buf, uint32_t
compressed_buf_len,
+ char *&uncompressed_buf,
+ uint32_t &uncompressed_buf_len) {
+ int ret = E_OK;
+ char *regen_buffer = nullptr;
+ size_t ulength;
+ constexpr float ratio[] = {1.5, 2.5, 3.5, 4.5, 255};
+ for (uint8_t i = 0; i < UNCOMPRESSED_TIME; ++i) {
+ regen_buffer = (char *)mem_alloc(compressed_buf_len * ratio[i],
+ MOD_COMPRESSOR_OBJ);
+ if (regen_buffer == nullptr) {
+ ret = E_OOM;
+ } else {
+ lzokay::EResult result = lzokay::decompress(
+ reinterpret_cast<uint8_t *>(compressed_buf),
compressed_buf_len,
+ reinterpret_cast<uint8_t *>(regen_buffer),
+ compressed_buf_len * ratio[i], ulength);
+ if (result != lzokay::EResult::Success) {
+ mem_free(regen_buffer);
+ regen_buffer = nullptr;
+ ret = E_COMPRESS_ERR;
+ } else {
+ char *compress_data =
+ (char *)mem_realloc(regen_buffer, ulength);
+ if (regen_buffer == nullptr) {
+ ret = E_OOM;
+ } else {
+ ret = E_OK;
+ uncompressed_buf_len = ulength;
+ uncompressed_buf_ = compress_data;
+ uncompressed_buf = compress_data;
+ break;
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+void LZOCompressor::after_uncompress(char *uncompressed_buf) {
+ if (uncompressed_buf != nullptr) {
+ mem_free(uncompressed_buf);
+ }
+}
+
+} // end namespace storage
\ No newline at end of file
diff --git a/cpp/src/compress/lzo_compressor.h
b/cpp/src/compress/lzo_compressor.h
new file mode 100644
index 00000000..aba0e208
--- /dev/null
+++ b/cpp/src/compress/lzo_compressor.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef COMPRESS_LZO_COMPRESSOR_H
+#define COMPRESS_LZO_COMPRESSOR_H
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/allocator/byte_stream.h"
+#include "common/logger/elog.h"
+#include "compressor.h"
+#include "lzokay.hpp"
+#include "utils/errno_define.h"
+#include "utils/util_define.h"
+
+#define UNCOMPRESSED_TIME 4
+
+namespace storage {
+
+class LZOCompressor : public Compressor {
+ public:
+ LZOCompressor() : compressed_buf_(nullptr), uncompressed_buf_(nullptr) {};
+ ~LZOCompressor() {};
+ // @for_compress
+ // true - for compressiom
+ // false - for uncompression
+ int reset(bool for_compress) OVERRIDE;
+ void destroy() OVERRIDE;
+ int compress(char *uncompressed_buf, uint32_t uncompressed_buf_len,
+ char *&compressed_buf, uint32_t &compressed_buf_len) OVERRIDE;
+ void after_compress(char *compressed_buf) OVERRIDE;
+ int uncompress(char *compressed_buf, uint32_t compressed_buf_len,
+ char *&uncompressed_buf,
+ uint32_t &uncompressed_buf_len) OVERRIDE;
+ void after_uncompress(char *uncompressed_buf) OVERRIDE;
+
+ private:
+ char *compressed_buf_;
+ char *uncompressed_buf_;
+};
+
+} // end namespace storage
+#endif // COMPRESS_SNAPPY_COMPRESSOR_H
diff --git a/cpp/test/compress/lzo_compressor_test.cc
b/cpp/test/compress/lzo_compressor_test.cc
new file mode 100644
index 00000000..918ccb41
--- /dev/null
+++ b/cpp/test/compress/lzo_compressor_test.cc
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License a
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "compress/lzo_compressor.h"
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+namespace {
+
+class LZOTest : public ::testing::Test {
+ protected:
+ void SetUp() override {}
+
+ void TearDown() override {}
+
+ std::string RandomString(int length) {
+ static std::random_device rd;
+ static std::mt19937 generator(rd());
+ static std::uniform_int_distribution<> dis(33, 127);
+
+ std::string result;
+ result.reserve(length);
+ for (int i = 0; i < length; ++i) {
+ result.push_back(static_cast<char>(dis(generator)));
+ }
+ return result;
+ }
+};
+
+TEST_F(LZOTest, TestBytes1) {
+ std::string input = RandomString(2000000);
+ std::vector<char> uncompressed(input.begin(), input.end());
+
+ storage::LZOCompressor compressor;
+ compressor.reset(true);
+
+ char *compressed_buf = nullptr;
+ uint32_t compressed_buf_len = 0;
+ auto start_time = std::chrono::high_resolution_clock::now();
+ compressor.compress(uncompressed.data(), uncompressed.size(),
+ compressed_buf, compressed_buf_len);
+ auto end_time = std::chrono::high_resolution_clock::now();
+ auto compression_duration =
+ std::chrono::duration_cast<std::chrono::milliseconds>(end_time -
+ start_time)
+ .count();
+
+ std::cout << "Compression time cost: " << compression_duration << " ms"
+ << std::endl;
+ std::cout << "Ratio: "
+ << static_cast<double>(compressed_buf_len) / uncompressed.size()
+ << std::endl;
+
+ char *decompressed_buf = nullptr;
+ uint32_t decompressed_buf_len = uncompressed.size();
+ compressor.reset(false);
+ start_time = std::chrono::high_resolution_clock::now();
+ compressor.uncompress(compressed_buf, compressed_buf_len, decompressed_buf,
+ decompressed_buf_len);
+ end_time = std::chrono::high_resolution_clock::now();
+ auto decompression_duration =
+ std::chrono::duration_cast<std::chrono::milliseconds>(end_time -
+ start_time)
+ .count();
+
+ std::cout << "Decompression time cost: " << decompression_duration << " ms"
+ << std::endl;
+ std::vector<char> decompressed(decompressed_buf,
+ decompressed_buf + decompressed_buf_len);
+ EXPECT_EQ(uncompressed, decompressed);
+
+ compressor.after_compress(compressed_buf);
+ compressor.after_uncompress(decompressed_buf);
+}
+
+TEST_F(LZOTest, TestBytes2) {
+ storage::LZOCompressor compressor;
+
+ int n = 500000;
+ std::string input = RandomString(n);
+ std::vector<char> uncompressed(input.begin(), input.end());
+
+ char *compressed_buf = nullptr;
+ uint32_t compressed_buf_len = 0;
+ uint32_t compressed_buf_len_new = 0;
+ compressor.reset(true);
+ compressor.compress(uncompressed.data(), uncompressed.size(),
+ compressed_buf, compressed_buf_len);
+ compressor.after_compress(compressed_buf);
+
+ compressor.compress(uncompressed.data(), uncompressed.size(),
+ compressed_buf, compressed_buf_len_new);
+ EXPECT_EQ(compressed_buf_len_new, compressed_buf_len);
+
+ char *decompressed_buf = nullptr;
+ uint32_t decompressed_buf_len = uncompressed.size();
+ compressor.reset(false);
+ compressor.uncompress(compressed_buf, compressed_buf_len, decompressed_buf,
+ decompressed_buf_len);
+
+ std::vector<char> decompressed(decompressed_buf,
+ decompressed_buf + decompressed_buf_len);
+ EXPECT_EQ(uncompressed, decompressed);
+
+ compressor.after_compress(compressed_buf);
+ compressor.after_uncompress(decompressed_buf);
+}
+} // namespace
diff --git a/cpp/third_party/CMakeLists.txt b/cpp/third_party/CMakeLists.txt
index a89e0af8..bdce1efe 100755
--- a/cpp/third_party/CMakeLists.txt
+++ b/cpp/third_party/CMakeLists.txt
@@ -17,4 +17,6 @@ specific language governing permissions and limitations
under the License.
]]
add_subdirectory(google_snappy)
+add_subdirectory(lz4)
+add_subdirectory(lzokay)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
\ No newline at end of file
diff --git a/cpp/third_party/lz4/CMakeLists.txt
b/cpp/third_party/lz4/CMakeLists.txt
new file mode 100644
index 00000000..ec81bba6
--- /dev/null
+++ b/cpp/third_party/lz4/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.11)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_library(LZ4 STATIC lz4.c)
\ No newline at end of file
diff --git a/cpp/src/compress/lz4.c b/cpp/third_party/lz4/lz4.c
similarity index 100%
rename from cpp/src/compress/lz4.c
rename to cpp/third_party/lz4/lz4.c
diff --git a/cpp/src/compress/lz4.h b/cpp/third_party/lz4/lz4.h
similarity index 100%
rename from cpp/src/compress/lz4.h
rename to cpp/third_party/lz4/lz4.h
diff --git a/cpp/third_party/lzokay/CMakeLists.txt
b/cpp/third_party/lzokay/CMakeLists.txt
new file mode 100644
index 00000000..b2f9b3a6
--- /dev/null
+++ b/cpp/third_party/lzokay/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.11)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_library(lzokay STATIC lzokay.cpp)
\ No newline at end of file
diff --git a/cpp/third_party/lzokay/LICENSE b/cpp/third_party/lzokay/LICENSE
new file mode 100755
index 00000000..e5fdd74a
--- /dev/null
+++ b/cpp/third_party/lzokay/LICENSE
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 2018 Jack Andersen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/cpp/third_party/lzokay/lzokay.cpp
b/cpp/third_party/lzokay/lzokay.cpp
new file mode 100755
index 00000000..83a0086e
--- /dev/null
+++ b/cpp/third_party/lzokay/lzokay.cpp
@@ -0,0 +1,647 @@
+#include "lzokay.hpp"
+#include <cstring>
+#include <algorithm>
+#include <iterator>
+
+/*
+ * Based on documentation from the Linux sources: Documentation/lzo.txt
+ *
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/lzo.txt
+ */
+
+namespace lzokay {
+
+#if _WIN32
+#define HOST_BIG_ENDIAN 0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define HOST_BIG_ENDIAN 1
+#else
+#define HOST_BIG_ENDIAN 0
+#endif
+
+#if HOST_BIG_ENDIAN
+static uint16_t get_le16(const uint8_t* p) {
+ uint16_t val = *reinterpret_cast<const uint16_t*>(p);
+#if __GNUC__
+ return __builtin_bswap16(val);
+#elif _WIN32
+ return _byteswap_ushort(val);
+#else
+ return (val = (val << 8) | ((val >> 8) & 0xFF));
+#endif
+}
+#else
+static uint16_t get_le16(const uint8_t* p) {
+ return *reinterpret_cast<const uint16_t*>(p);
+}
+#endif
+
+constexpr std::size_t Max255Count = std::size_t(~0) / 255 - 2;
+
+#define NEEDS_IN(count) \
+ if (inp + (count) > inp_end) { \
+ dst_size = outp - dst; \
+ return EResult::InputOverrun; \
+ }
+
+#define NEEDS_OUT(count) \
+ if (outp + (count) > outp_end) { \
+ dst_size = outp - dst; \
+ return EResult::OutputOverrun; \
+ }
+
+#define CONSUME_ZERO_BYTE_LENGTH \
+ std::size_t offset; \
+ { \
+ const uint8_t *old_inp = inp; \
+ while (*inp == 0) ++inp; \
+ offset = inp - old_inp; \
+ if (offset > Max255Count) { \
+ dst_size = outp - dst; \
+ return EResult::Error; \
+ } \
+ }
+
+#define WRITE_ZERO_BYTE_LENGTH(length) \
+ { \
+ std::size_t l; \
+ for (l = length; l > 255; l -= 255) { *outp++ = 0; } \
+ *outp++ = l; \
+ }
+
+constexpr uint32_t M1MaxOffset = 0x0400;
+constexpr uint32_t M2MaxOffset = 0x0800;
+constexpr uint32_t M3MaxOffset = 0x4000;
+// constexpr uint32_t M4MaxOffset = 0xbfff;
+
+// constexpr uint32_t M1MinLen = 2;
+// constexpr uint32_t M1MaxLen = 2;
+constexpr uint32_t M2MinLen = 3;
+constexpr uint32_t M2MaxLen = 8;
+// constexpr uint32_t M3MinLen = 3;
+constexpr uint32_t M3MaxLen = 33;
+// constexpr uint32_t M4MinLen = 3;
+constexpr uint32_t M4MaxLen = 9;
+
+constexpr uint32_t M1Marker = 0x0;
+// constexpr uint32_t M2Marker = 0x40;
+constexpr uint32_t M3Marker = 0x20;
+constexpr uint32_t M4Marker = 0x10;
+
+constexpr uint32_t MaxMatchByLengthLen = 34; /* Max M3 len + 1 */
+
+EResult decompress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t init_dst_size,
+ std::size_t& dst_size) {
+ dst_size = init_dst_size;
+
+ if (src_size < 3) {
+ dst_size = 0;
+ return EResult::InputOverrun;
+ }
+
+ const uint8_t* inp = src;
+ const uint8_t* inp_end = src + src_size;
+ uint8_t* outp = dst;
+ uint8_t* outp_end = dst + dst_size;
+ uint8_t* lbcur;
+ std::size_t lblen;
+ std::size_t state = 0;
+ std::size_t nstate = 0;
+
+ /* First byte encoding */
+ if (*inp >= 22) {
+ /* 22..255 : copy literal string
+ * length = (byte - 17) = 4..238
+ * state = 4 [ don't copy extra literals ]
+ * skip byte
+ */
+ std::size_t len = *inp++ - uint8_t(17);
+ NEEDS_IN(len)
+ NEEDS_OUT(len)
+ for (std::size_t i = 0; i < len; ++i)
+ *outp++ = *inp++;
+ state = 4;
+ } else if (*inp >= 18) {
+ /* 18..21 : copy 0..3 literals
+ * state = (byte - 17) = 0..3 [ copy <state> literals ]
+ * skip byte
+ */
+ nstate = *inp++ - uint8_t(17);
+ state = nstate;
+ NEEDS_IN(nstate)
+ NEEDS_OUT(nstate)
+ for (std::size_t i = 0; i < nstate; ++i)
+ *outp++ = *inp++;
+ }
+ /* 0..17 : follow regular instruction encoding, see below. It is worth
+ * noting that codes 16 and 17 will represent a block copy from
+ * the dictionary which is empty, and that they will always be
+ * invalid at this place.
+ */
+
+ while (true) {
+ NEEDS_IN(1)
+ uint8_t inst = *inp++;
+ if (inst & 0xC0) {
+ /* [M2]
+ * 1 L L D D D S S (128..255)
+ * Copy 5-8 bytes from block within 2kB distance
+ * state = S (copy S literals after this block)
+ * length = 5 + L
+ * Always followed by exactly one byte : H H H H H H H H
+ * distance = (H << 3) + D + 1
+ *
+ * 0 1 L D D D S S (64..127)
+ * Copy 3-4 bytes from block within 2kB distance
+ * state = S (copy S literals after this block)
+ * length = 3 + L
+ * Always followed by exactly one byte : H H H H H H H H
+ * distance = (H << 3) + D + 1
+ */
+ NEEDS_IN(1)
+ lbcur = outp - ((*inp++ << 3) + ((inst >> 2) & 0x7) + 1);
+ lblen = std::size_t(inst >> 5) + 1;
+ nstate = inst & uint8_t(0x3);
+ } else if (inst & M3Marker) {
+ /* [M3]
+ * 0 0 1 L L L L L (32..63)
+ * Copy of small block within 16kB distance (preferably less than 34B)
+ * length = 2 + (L ?: 31 + (zero_bytes * 255) + non_zero_byte)
+ * Always followed by exactly one LE16 : D D D D D D D D : D D D D D D
S S
+ * distance = D + 1
+ * state = S (copy S literals after this block)
+ */
+ lblen = std::size_t(inst & uint8_t(0x1f)) + 2;
+ if (lblen == 2) {
+ CONSUME_ZERO_BYTE_LENGTH
+ NEEDS_IN(1)
+ lblen += offset * 255 + 31 + *inp++;
+ }
+ NEEDS_IN(2)
+ nstate = get_le16(inp);
+ inp += 2;
+ lbcur = outp - ((nstate >> 2) + 1);
+ nstate &= 0x3;
+ } else if (inst & M4Marker) {
+ /* [M4]
+ * 0 0 0 1 H L L L (16..31)
+ * Copy of a block within 16..48kB distance (preferably less than 10B)
+ * length = 2 + (L ?: 7 + (zero_bytes * 255) + non_zero_byte)
+ * Always followed by exactly one LE16 : D D D D D D D D : D D D D D D
S S
+ * distance = 16384 + (H << 14) + D
+ * state = S (copy S literals after this block)
+ * End of stream is reached if distance == 16384
+ */
+ lblen = std::size_t(inst & uint8_t(0x7)) + 2;
+ if (lblen == 2) {
+ CONSUME_ZERO_BYTE_LENGTH
+ NEEDS_IN(1)
+ lblen += offset * 255 + 7 + *inp++;
+ }
+ NEEDS_IN(2)
+ nstate = get_le16(inp);
+ inp += 2;
+ lbcur = outp - (((inst & 0x8) << 11) + (nstate >> 2));
+ nstate &= 0x3;
+ if (lbcur == outp)
+ break; /* Stream finished */
+ lbcur -= 16384;
+ } else {
+ /* [M1] Depends on the number of literals copied by the last
instruction. */
+ if (state == 0) {
+ /* If last instruction did not copy any literal (state == 0), this
+ * encoding will be a copy of 4 or more literal, and must be
interpreted
+ * like this :
+ *
+ * 0 0 0 0 L L L L (0..15) : copy long literal string
+ * length = 3 + (L ?: 15 + (zero_bytes * 255) + non_zero_byte)
+ * state = 4 (no extra literals are copied)
+ */
+ std::size_t len = inst + 3;
+ if (len == 3) {
+ CONSUME_ZERO_BYTE_LENGTH
+ NEEDS_IN(1)
+ len += offset * 255 + 15 + *inp++;
+ }
+ /* copy_literal_run */
+ NEEDS_IN(len)
+ NEEDS_OUT(len)
+ for (std::size_t i = 0; i < len; ++i)
+ *outp++ = *inp++;
+ state = 4;
+ continue;
+ } else if (state != 4) {
+ /* If last instruction used to copy between 1 to 3 literals (encoded in
+ * the instruction's opcode or distance), the instruction is a copy of
a
+ * 2-byte block from the dictionary within a 1kB distance. It is worth
+ * noting that this instruction provides little savings since it uses 2
+ * bytes to encode a copy of 2 other bytes but it encodes the number of
+ * following literals for free. It must be interpreted like this :
+ *
+ * 0 0 0 0 D D S S (0..15) : copy 2 bytes from <= 1kB distance
+ * length = 2
+ * state = S (copy S literals after this block)
+ * Always followed by exactly one byte : H H H H H H H H
+ * distance = (H << 2) + D + 1
+ */
+ NEEDS_IN(1)
+ nstate = inst & uint8_t(0x3);
+ lbcur = outp - ((inst >> 2) + (*inp++ << 2) + 1);
+ lblen = 2;
+ } else {
+ /* If last instruction used to copy 4 or more literals (as detected by
+ * state == 4), the instruction becomes a copy of a 3-byte block from
the
+ * dictionary from a 2..3kB distance, and must be interpreted like
this :
+ *
+ * 0 0 0 0 D D S S (0..15) : copy 3 bytes from 2..3 kB distance
+ * length = 3
+ * state = S (copy S literals after this block)
+ * Always followed by exactly one byte : H H H H H H H H
+ * distance = (H << 2) + D + 2049
+ */
+ NEEDS_IN(1)
+ nstate = inst & uint8_t(0x3);
+ lbcur = outp - ((inst >> 2) + (*inp++ << 2) + 2049);
+ lblen = 3;
+ }
+ }
+ if (lbcur < dst) {
+ dst_size = outp - dst;
+ return EResult::LookbehindOverrun;
+ }
+ NEEDS_IN(nstate)
+ NEEDS_OUT(lblen + nstate)
+ /* Copy lookbehind */
+ for (std::size_t i = 0; i < lblen; ++i)
+ *outp++ = *lbcur++;
+ state = nstate;
+ /* Copy literal */
+ for (std::size_t i = 0; i < nstate; ++i)
+ *outp++ = *inp++;
+ }
+
+ dst_size = outp - dst;
+ if (lblen != 3) /* Ensure terminating M4 was encountered */
+ return EResult::Error;
+ if (inp == inp_end)
+ return EResult::Success;
+ else if (inp < inp_end)
+ return EResult::InputNotConsumed;
+ else
+ return EResult::InputOverrun;
+}
+
+struct State {
+ const uint8_t* src;
+ const uint8_t* src_end;
+ const uint8_t* inp;
+ uint32_t wind_sz;
+ uint32_t wind_b;
+ uint32_t wind_e;
+ uint32_t cycle1_countdown;
+
+ const uint8_t* bufp;
+ uint32_t buf_sz;
+
+ /* Access next input byte and advance both ends of circular buffer */
+ void get_byte(uint8_t* buf) {
+ if (inp >= src_end) {
+ if (wind_sz > 0)
+ --wind_sz;
+ buf[wind_e] = 0;
+ if (wind_e < DictBase::MaxMatchLen)
+ buf[DictBase::BufSize + wind_e] = 0;
+ } else {
+ buf[wind_e] = *inp;
+ if (wind_e < DictBase::MaxMatchLen)
+ buf[DictBase::BufSize + wind_e] = *inp;
+ ++inp;
+ }
+ if (++wind_e == DictBase::BufSize)
+ wind_e = 0;
+ if (++wind_b == DictBase::BufSize)
+ wind_b = 0;
+ }
+
+ uint32_t pos2off(uint32_t pos) const {
+ return wind_b > pos ? wind_b - pos : DictBase::BufSize - (pos - wind_b);
+ }
+};
+
+class DictImpl : public DictBase {
+public:
+ struct Match3Impl : DictBase::Match3 {
+ static uint32_t make_key(const uint8_t* data) {
+ return ((0x9f5f * (((uint32_t(data[0]) << 5 ^ uint32_t(data[1])) << 5) ^
data[2])) >> 5) & 0x3fff;
+ }
+
+ uint16_t get_head(uint32_t key) const {
+ return (chain_sz[key] == 0) ? uint16_t(UINT16_MAX) : head[key];
+ }
+
+ void init() {
+ std::fill(std::begin(chain_sz), std::end(chain_sz), 0);
+ }
+
+ void remove(uint32_t pos, const uint8_t* b) {
+ --chain_sz[make_key(b + pos)];
+ }
+
+ void advance(State& s, uint32_t& match_pos, uint32_t& match_count, const
uint8_t* b) {
+ uint32_t key = make_key(b + s.wind_b);
+ match_pos = chain[s.wind_b] = get_head(key);
+ match_count = chain_sz[key]++;
+ if (match_count > DictBase::MaxMatchLen)
+ match_count = DictBase::MaxMatchLen;
+ head[key] = uint16_t(s.wind_b);
+ }
+
+ void skip_advance(State& s, const uint8_t* b) {
+ uint32_t key = make_key(b + s.wind_b);
+ chain[s.wind_b] = get_head(key);
+ head[key] = uint16_t(s.wind_b);
+ best_len[s.wind_b] = uint16_t(DictBase::MaxMatchLen + 1);
+ chain_sz[key]++;
+ }
+ };
+
+ struct Match2Impl : DictBase::Match2 {
+ static uint32_t make_key(const uint8_t* data) {
+ return uint32_t(data[0]) ^ (uint32_t(data[1]) << 8);
+ }
+
+ void init() {
+ std::fill(std::begin(head), std::end(head), UINT16_MAX);
+ }
+
+ void add(uint16_t pos, const uint8_t* b) {
+ head[make_key(b + pos)] = pos;
+ }
+
+ void remove(uint32_t pos, const uint8_t* b) {
+ uint16_t& p = head[make_key(b + pos)];
+ if (p == pos)
+ p = UINT16_MAX;
+ }
+
+ bool search(State& s, uint32_t& lb_pos, uint32_t& lb_len,
+ uint32_t best_pos[MaxMatchByLengthLen], const uint8_t* b)
const {
+ uint16_t pos = head[make_key(b + s.wind_b)];
+ if (pos == UINT16_MAX)
+ return false;
+ if (best_pos[2] == 0)
+ best_pos[2] = pos + 1;
+ if (lb_len < 2) {
+ lb_len = 2;
+ lb_pos = pos;
+ }
+ return true;
+ }
+ };
+
+ void init(State& s, const uint8_t* src, std::size_t src_size) {
+ auto& match3 = static_cast<Match3Impl&>(_storage->match3);
+ auto& match2 = static_cast<Match2Impl&>(_storage->match2);
+
+ s.cycle1_countdown = DictBase::MaxDist;
+ match3.init();
+ match2.init();
+
+ s.src = src;
+ s.src_end = src + src_size;
+ s.inp = src;
+ s.wind_sz = uint32_t(std::min(src_size, std::size_t(MaxMatchLen)));
+ s.wind_b = 0;
+ s.wind_e = s.wind_sz;
+ std::copy_n(s.inp, s.wind_sz, _storage->buffer);
+ s.inp += s.wind_sz;
+
+ if (s.wind_e == DictBase::BufSize)
+ s.wind_e = 0;
+
+ if (s.wind_sz < 3)
+ std::fill_n(_storage->buffer + s.wind_b + s.wind_sz, 3, 0);
+ }
+
+ void reset_next_input_entry(State& s, Match3Impl& match3, Match2Impl&
match2) {
+ /* Remove match from about-to-be-clobbered buffer entry */
+ if (s.cycle1_countdown == 0) {
+ match3.remove(s.wind_e, _storage->buffer);
+ match2.remove(s.wind_e, _storage->buffer);
+ } else {
+ --s.cycle1_countdown;
+ }
+ }
+
+ void advance(State& s, uint32_t& lb_off, uint32_t& lb_len,
+ uint32_t best_off[MaxMatchByLengthLen], bool skip) {
+ auto& match3 = static_cast<Match3Impl&>(_storage->match3);
+ auto& match2 = static_cast<Match2Impl&>(_storage->match2);
+
+ if (skip) {
+ for (uint32_t i = 0; i < lb_len - 1; ++i) {
+ reset_next_input_entry(s, match3, match2);
+ match3.skip_advance(s, _storage->buffer);
+ match2.add(uint16_t(s.wind_b), _storage->buffer);
+ s.get_byte(_storage->buffer);
+ }
+ }
+
+ lb_len = 1;
+ lb_off = 0;
+ uint32_t lb_pos = 0;
+
+ uint32_t best_pos[MaxMatchByLengthLen] = {};
+ uint32_t match_pos, match_count;
+ match3.advance(s, match_pos, match_count, _storage->buffer);
+
+ int best_char = _storage->buffer[s.wind_b];
+ uint32_t best_len = lb_len;
+ if (lb_len >= s.wind_sz) {
+ if (s.wind_sz == 0)
+ best_char = -1;
+ lb_off = 0;
+ match3.best_len[s.wind_b] = DictBase::MaxMatchLen + 1;
+ } else {
+ if (match2.search(s, lb_pos, lb_len, best_pos, _storage->buffer) &&
s.wind_sz >= 3) {
+ for (uint32_t i = 0; i < match_count; ++i, match_pos =
match3.chain[match_pos]) {
+ auto ref_ptr = _storage->buffer + s.wind_b;
+ auto match_ptr = _storage->buffer + match_pos;
+ auto mismatch = std::mismatch(ref_ptr, ref_ptr + s.wind_sz,
match_ptr);
+ auto match_len = uint32_t(mismatch.first - ref_ptr);
+ if (match_len < 2)
+ continue;
+ if (match_len < MaxMatchByLengthLen && best_pos[match_len] == 0)
+ best_pos[match_len] = match_pos + 1;
+ if (match_len > lb_len) {
+ lb_len = match_len;
+ lb_pos = match_pos;
+ if (match_len == s.wind_sz || match_len >
match3.best_len[match_pos])
+ break;
+ }
+ }
+ }
+ if (lb_len > best_len)
+ lb_off = s.pos2off(lb_pos);
+ match3.best_len[s.wind_b] = uint16_t(lb_len);
+ for (auto posit = std::begin(best_pos) + 2, offit = best_off + 2;
+ posit != std::end(best_pos); ++posit, ++offit) {
+ *offit = (*posit > 0) ? s.pos2off(*posit - 1) : 0;
+ }
+ }
+
+ reset_next_input_entry(s, match3, match2);
+
+ match2.add(uint16_t(s.wind_b), _storage->buffer);
+
+ s.get_byte(_storage->buffer);
+
+ if (best_char < 0) {
+ s.buf_sz = 0;
+ lb_len = 0;
+ /* Signal exit */
+ } else {
+ s.buf_sz = s.wind_sz + 1;
+ }
+ s.bufp = s.inp - s.buf_sz;
+ }
+};
+
+static void find_better_match(const uint32_t best_off[MaxMatchByLengthLen],
uint32_t& lb_len, uint32_t& lb_off) {
+ if (lb_len <= M2MinLen || lb_off <= M2MaxOffset)
+ return;
+ if (lb_off > M2MaxOffset && lb_len >= M2MinLen + 1 && lb_len <= M2MaxLen + 1
&&
+ best_off[lb_len - 1] != 0 && best_off[lb_len - 1] <= M2MaxOffset) {
+ lb_len -= 1;
+ lb_off = best_off[lb_len];
+ } else if (lb_off > M3MaxOffset && lb_len >= M4MaxLen + 1 && lb_len <=
M2MaxLen + 2 &&
+ best_off[lb_len - 2] && best_off[lb_len] <= M2MaxOffset) {
+ lb_len -= 2;
+ lb_off = best_off[lb_len];
+ } else if (lb_off > M3MaxOffset && lb_len >= M4MaxLen + 1 && lb_len <=
M3MaxLen + 1 &&
+ best_off[lb_len - 1] != 0 && best_off[lb_len - 2] <= M3MaxOffset)
{
+ lb_len -= 1;
+ lb_off = best_off[lb_len];
+ }
+}
+
+static EResult encode_literal_run(uint8_t*& outp, const uint8_t* outp_end,
const uint8_t* dst, std::size_t& dst_size,
+ const uint8_t* lit_ptr, uint32_t lit_len) {
+ if (outp == dst && lit_len <= 238) {
+ NEEDS_OUT(1);
+ *outp++ = uint8_t(17 + lit_len);
+ } else if (lit_len <= 3) {
+ outp[-2] = uint8_t(outp[-2] | lit_len);
+ } else if (lit_len <= 18) {
+ NEEDS_OUT(1);
+ *outp++ = uint8_t(lit_len - 3);
+ } else {
+ NEEDS_OUT((lit_len - 18) / 255 + 2);
+ *outp++ = 0;
+ WRITE_ZERO_BYTE_LENGTH(lit_len - 18);
+ }
+ NEEDS_OUT(lit_len);
+ outp = std::copy_n(lit_ptr, lit_len, outp);
+ return EResult::Success;
+}
+
+static EResult encode_lookback_match(uint8_t*& outp, const uint8_t* outp_end,
const uint8_t* dst, std::size_t& dst_size,
+ uint32_t lb_len, uint32_t lb_off,
uint32_t last_lit_len) {
+ if (lb_len == 2) {
+ lb_off -= 1;
+ NEEDS_OUT(2);
+ *outp++ = uint8_t(M1Marker | ((lb_off & 0x3) << 2));
+ *outp++ = uint8_t(lb_off >> 2);
+ } else if (lb_len <= M2MaxLen && lb_off <= M2MaxOffset) {
+ lb_off -= 1;
+ NEEDS_OUT(2);
+ *outp++ = uint8_t((lb_len - 1) << 5 | ((lb_off & 0x7) << 2));
+ *outp++ = uint8_t(lb_off >> 3);
+ } else if (lb_len == M2MinLen && lb_off <= M1MaxOffset + M2MaxOffset &&
last_lit_len >= 4) {
+ lb_off -= 1 + M2MaxOffset;
+ NEEDS_OUT(2);
+ *outp++ = uint8_t(M1Marker | ((lb_off & 0x3) << 2));
+ *outp++ = uint8_t(lb_off >> 2);
+ } else if (lb_off <= M3MaxOffset) {
+ lb_off -= 1;
+ if (lb_len <= M3MaxLen) {
+ NEEDS_OUT(1);
+ *outp++ = uint8_t(M3Marker | (lb_len - 2));
+ } else {
+ lb_len -= M3MaxLen;
+ NEEDS_OUT(lb_len / 255 + 2);
+ *outp++ = uint8_t(M3Marker);
+ WRITE_ZERO_BYTE_LENGTH(lb_len);
+ }
+ NEEDS_OUT(2);
+ *outp++ = uint8_t(lb_off << 2);
+ *outp++ = uint8_t(lb_off >> 6);
+ } else {
+ lb_off -= 0x4000;
+ if (lb_len <= M4MaxLen) {
+ NEEDS_OUT(1);
+ *outp++ = uint8_t(M4Marker | ((lb_off & 0x4000) >> 11) | (lb_len - 2));
+ } else {
+ lb_len -= M4MaxLen;
+ NEEDS_OUT(lb_len / 255 + 2);
+ *outp++ = uint8_t(M4Marker | ((lb_off & 0x4000) >> 11));
+ WRITE_ZERO_BYTE_LENGTH(lb_len);
+ }
+ NEEDS_OUT(2);
+ *outp++ = uint8_t(lb_off << 2);
+ *outp++ = uint8_t(lb_off >> 6);
+ }
+ return EResult::Success;
+}
+
+EResult compress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t init_dst_size,
+ std::size_t& dst_size, DictBase& dict) {
+ EResult err;
+ State s;
+ auto& d = static_cast<DictImpl&>(dict);
+ dst_size = init_dst_size;
+ uint8_t* outp = dst;
+ uint8_t* outp_end = dst + dst_size;
+ uint32_t lit_len = 0;
+ uint32_t lb_off, lb_len;
+ uint32_t best_off[MaxMatchByLengthLen];
+ d.init(s, src, src_size);
+ const uint8_t* lit_ptr = s.inp;
+ d.advance(s, lb_off, lb_len, best_off, false);
+ while (s.buf_sz > 0) {
+ if (lit_len == 0)
+ lit_ptr = s.bufp;
+ if (lb_len < 2 || (lb_len == 2 && (lb_off > M1MaxOffset || lit_len == 0 ||
lit_len >= 4)) ||
+ (lb_len == 2 && outp == dst) || (outp == dst && lit_len == 0)) {
+ lb_len = 0;
+ } else if (lb_len == M2MinLen && lb_off > M1MaxOffset + M2MaxOffset &&
lit_len >= 4) {
+ lb_len = 0;
+ }
+ if (lb_len == 0) {
+ ++lit_len;
+ d.advance(s, lb_off, lb_len, best_off, false);
+ continue;
+ }
+ find_better_match(best_off, lb_len, lb_off);
+ if ((err = encode_literal_run(outp, outp_end, dst, dst_size, lit_ptr,
lit_len)) < EResult::Success)
+ return err;
+ if ((err = encode_lookback_match(outp, outp_end, dst, dst_size, lb_len,
lb_off, lit_len)) < EResult::Success)
+ return err;
+ lit_len = 0;
+ d.advance(s, lb_off, lb_len, best_off, true);
+ }
+ if ((err = encode_literal_run(outp, outp_end, dst, dst_size, lit_ptr,
lit_len)) < EResult::Success)
+ return err;
+
+ /* Terminating M4 */
+ NEEDS_OUT(3);
+ *outp++ = M4Marker | 1;
+ *outp++ = 0;
+ *outp++ = 0;
+
+ dst_size = outp - dst;
+ return EResult::Success;
+}
+
+}
\ No newline at end of file
diff --git a/cpp/third_party/lzokay/lzokay.hpp
b/cpp/third_party/lzokay/lzokay.hpp
new file mode 100755
index 00000000..8d9a861f
--- /dev/null
+++ b/cpp/third_party/lzokay/lzokay.hpp
@@ -0,0 +1,79 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace lzokay {
+
+enum class EResult {
+ LookbehindOverrun = -4,
+ OutputOverrun = -3,
+ InputOverrun = -2,
+ Error = -1,
+ Success = 0,
+ InputNotConsumed = 1,
+};
+
+class DictBase {
+protected:
+ static constexpr uint32_t HashSize = 0x4000;
+ static constexpr uint32_t MaxDist = 0xbfff;
+ static constexpr uint32_t MaxMatchLen = 0x800;
+ static constexpr uint32_t BufSize = MaxDist + MaxMatchLen;
+
+ /* List encoding of previous 3-byte data matches */
+ struct Match3 {
+ uint16_t head[HashSize]; /* key -> chain-head-pos */
+ uint16_t chain_sz[HashSize]; /* key -> chain-size */
+ uint16_t chain[BufSize]; /* chain-pos -> next-chain-pos */
+ uint16_t best_len[BufSize]; /* chain-pos -> best-match-length */
+ };
+ /* Encoding of 2-byte data matches */
+ struct Match2 {
+ uint16_t head[1 << 16]; /* 2-byte-data -> head-pos */
+ };
+
+ struct Data {
+ Match3 match3;
+ Match2 match2;
+
+ /* Circular buffer caching enough data to access the maximum lookback
+ * distance of 48K + maximum match length of 2K. An additional 2K is
+ * allocated so the start of the buffer may be replicated at the end,
+ * therefore providing efficient circular access.
+ */
+ uint8_t buffer[BufSize + MaxMatchLen];
+ };
+ using storage_type = Data;
+ storage_type* _storage;
+ DictBase() = default;
+ friend struct State;
+ friend EResult compress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t& dst_size, DictBase& dict);
+};
+template <template<typename> class _Alloc = std::allocator>
+class Dict : public DictBase {
+ _Alloc<DictBase::storage_type> _allocator;
+public:
+ Dict() { _storage = _allocator.allocate(1); }
+ ~Dict() { _allocator.deallocate(_storage, 1); }
+};
+
+EResult decompress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t dst_size,
+ std::size_t& out_size);
+EResult compress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t dst_size,
+ std::size_t& out_size, DictBase& dict);
+inline EResult compress(const uint8_t* src, std::size_t src_size,
+ uint8_t* dst, std::size_t dst_size,
+ std::size_t& out_size) {
+ Dict<> dict;
+ return compress(src, src_size, dst, dst_size, out_size, dict);
+}
+
+constexpr std::size_t compress_worst_size(std::size_t s) {
+ return s + s / 16 + 64 + 3;
+}
+
+}