jt2594838 commented on code in PR #542:
URL: https://github.com/apache/tsfile/pull/542#discussion_r2206653469


##########
cpp/src/encoding/int32_rle_decoder.h:
##########
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef ENCODING_INT32RLE_DECODER_H
+#define ENCODING_INT32RLE_DECODER_H
+
+#include <vector>
+
+#include "common/allocator/alloc_base.h"
+#include "decoder.h"
+#include "encoder.h"
+#include "encoding/encode_utils.h"
+#include "encoding/int32_packer.h"
+
+namespace storage {
+
+class Int32RleDecoder : public Decoder {
+   private:
+    uint32_t length_;
+    uint32_t bit_width_;
+    int bitpacking_num_;
+    bool is_length_and_bitwidth_readed_;
+    int current_count_;
+    common::ByteStream byte_cache_;
+    int32_t *current_buffer_;
+    Int32Packer *packer_;
+    uint8_t *tmp_buf_;
+
+   public:
+    Int32RleDecoder()
+        : length_(0),
+          bit_width_(0),
+          bitpacking_num_(0),
+          is_length_and_bitwidth_readed_(false),
+          current_count_(0),
+          byte_cache_(1024, common::MOD_DECODER_OBJ),
+          current_buffer_(nullptr),
+          packer_(nullptr),
+          tmp_buf_(nullptr) {}
+    ~Int32RleDecoder() override { destroy(); }
+
+    bool has_remaining() override { return has_next_package(); }
+    int read_boolean(bool &ret_value, common::ByteStream &in) {
+        int32_t bool_value;
+        read_int32(bool_value, in);
+        ret_value = bool_value == 0 ? false : true;
+        return common::E_OK;
+    }
+    int read_int32(int32_t &ret_value, common::ByteStream &in) override {
+        ret_value = static_cast<int32_t>(read_int(in));
+        return common::E_OK;
+    }
+    int read_int64(int64_t &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_float(float &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_double(double &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_String(common::String &ret_value, common::PageArena &pa,
+                    common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+
+    void init() {
+        packer_ = nullptr;
+        is_length_and_bitwidth_readed_ = false;
+        length_ = 0;
+        bit_width_ = 0;
+        bitpacking_num_ = 0;
+        current_count_ = 0;
+    }
+
+    bool has_next(common::ByteStream &buffer) {
+        if (current_count_ > 0 || buffer.remaining_size() > 0 ||
+            has_next_package()) {
+            return true;
+        }
+        return false;
+    }
+
+    bool has_next_package() {
+        return current_count_ > 0 || byte_cache_.remaining_size() > 0;
+    }
+
+    int32_t read_int(common::ByteStream &buffer) {
+        if (!is_length_and_bitwidth_readed_) {
+            // start to reader a new rle+bit-packing pattern
+            read_length_and_bitwidth(buffer);
+        }
+        if (current_count_ == 0) {
+            uint8_t header;
+            int ret = common::E_OK;
+            if (RET_FAIL(
+                    common::SerializationUtil::read_ui8(header, byte_cache_))) 
{
+                return ret;
+            }
+            call_read_bit_packing_buffer(header);
+        }
+        --current_count_;
+        int32_t result = current_buffer_[bitpacking_num_ - current_count_ - 1];
+        if (!has_next_package()) {
+            is_length_and_bitwidth_readed_ = false;
+        }
+        return result;
+    }
+
+    int call_read_bit_packing_buffer(uint8_t header) {
+        int bit_packed_group_count = (int)(header >> 1);
+        // in last bit-packing group, there may be some useless value,
+        // lastBitPackedNum indicates how many values is useful
+        uint8_t last_bit_packed_num;
+        int ret = common::E_OK;
+        if (RET_FAIL(common::SerializationUtil::read_ui8(last_bit_packed_num,
+                                                         byte_cache_))) {
+            return ret;
+        }
+        if (bit_packed_group_count > 0) {
+            current_count_ =
+                (bit_packed_group_count - 1) * 8 + last_bit_packed_num;
+            bitpacking_num_ = current_count_;
+        } else {
+            printf(
+                "tsfile-encoding IntRleDecoder: bit_packed_group_count %d, "
+                "smaller "
+                "than 1",
+                bit_packed_group_count);

Review Comment:
   Log or return an error code?



##########
cpp/test/encoding/int32_packer_test.cc:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License a
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "encoding/int32_packer.h"
+
+#include <gtest/gtest.h>
+
+#include <bitset>
+
+namespace storage {
+
+TEST(IntPackerTest, SequentialValues) {
+    for (int width = 4; width < 32; ++width) {

Review Comment:
   Why not [1, 32]?



##########
cpp/src/encoding/decoder_factory.h:
##########
@@ -53,47 +56,86 @@ class DecoderFactory {
         }
     }
 
-    static Decoder *alloc_value_decoder(common::TSEncoding encoding,
+    static Decoder* alloc_value_decoder(common::TSEncoding encoding,
                                         common::TSDataType data_type) {
-        if (encoding == common::PLAIN) {
-            ALLOC_AND_RETURN_DECODER(PlainDecoder);
-        } else if (encoding == common::GORILLA) {
-            if (data_type == common::INT32 || data_type == common::DATE) {
-                ALLOC_AND_RETURN_DECODER(IntGorillaDecoder);
-            } else if (data_type == common::INT64 ||
-                       data_type == common::TIMESTAMP) {
-                ALLOC_AND_RETURN_DECODER(LongGorillaDecoder);
-            } else if (data_type == common::FLOAT) {
-                ALLOC_AND_RETURN_DECODER(FloatGorillaDecoder);
-            } else if (data_type == common::DOUBLE) {
-                ALLOC_AND_RETURN_DECODER(DoubleGorillaDecoder);
-            } else {
+        using namespace common;
+
+        switch (encoding) {
+            case PLAIN:
+                ALLOC_AND_RETURN_DECODER(PlainDecoder);
+
+            case DICTIONARY:
+                switch (data_type) {
+                    case STRING:
+                    case TEXT:
+                        ALLOC_AND_RETURN_DECODER(DictionaryDecoder);
+                    default:
+                        ASSERT(false);
+                }

Review Comment:
   Maybe we should introduce error code for this method.



##########
cpp/test/encoding/int32_packer_test.cc:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License a
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "encoding/int32_packer.h"
+
+#include <gtest/gtest.h>
+
+#include <bitset>
+
+namespace storage {
+
+TEST(IntPackerTest, SequentialValues) {
+    for (int width = 4; width < 32; ++width) {
+        int32_t arr[8];
+        for (int i = 0; i < 8; ++i) arr[i] = i;
+        Int32Packer packer(width);
+        const int bufSize = NUM_OF_INTS * width / 8;
+        std::vector<unsigned char> buf(bufSize, 0);
+        packer.pack_8values(arr, 0, buf.data());
+        int32_t res[8] = {0};
+        packer.unpack_8values(buf.data(), 0, res);
+        for (int i = 0; i < 8; ++i) {
+            EXPECT_EQ(res[i], arr[i]) << "Width=" << width << " Index=" << i;
+        }
+    }
+}
+
+TEST(IntPackerStressTest, PackUnpackRandomPositiveValues) {
+    const int width = 31;
+    const int count = 100000;
+    const int total_values = count * 8;
+
+    Int32Packer packer(width);
+    std::vector<int32_t> pre_values;
+    std::vector<unsigned char> buffer;
+    pre_values.reserve(total_values);
+    buffer.resize(count * width);
+    int idx = 0;
+    std::srand(12345);  // Optional: deterministic seed
+    for (int i = 0; i < count; ++i) {
+        int32_t vs[8];
+        for (int j = 0; j < 8; ++j) {
+            vs[j] = std::rand() &
+                    0x7FFFFFFF;  // ensure non-negative (Java `nextInt`)
+            pre_values.push_back(vs[j]);
+        }
+
+        unsigned char temp_buf[32] = {0};
+        packer.pack_8values(vs, 0, temp_buf);
+        std::memcpy(buffer.data() + idx, temp_buf, width);
+        idx += width;
+    }
+
+    std::vector<int32_t> res(total_values);
+    packer.unpack_all_values(buffer.data(), static_cast<int>(buffer.size()),
+                             res.data());
+    for (int i = 0; i < total_values; ++i) {
+        ASSERT_EQ(res[i], pre_values[i]) << "Mismatch at index " << i;

Review Comment:
   May also give the values to help with debugging.



##########
cpp/test/encoding/dictionary_codec_test.cc:
##########
@@ -80,21 +82,69 @@ TEST_F(DictionaryTest, DictionaryEncoderAndDecoder) {
     ASSERT_EQ(decoder.read_string(stream), "apple");
 }
 
-TEST_F(DictionaryTest, DictionaryEncoderAndDecoderLargeQuantities) {
+TEST_F(DictionaryTest, DictionaryEncoderAndDecoderOneItem) {
     DictionaryEncoder encoder;
     common::ByteStream stream(1024, common::MOD_DICENCODE_OBJ);
     encoder.init();
 
-    for (int64_t value = 1; value < 10000; value++) {
-        encoder.encode(std::to_string(value), stream);
+    encoder.encode("apple", stream);
+    encoder.flush(stream);
+
+    DictionaryDecoder decoder;
+    decoder.init();
+
+    ASSERT_TRUE(decoder.has_next(stream));
+    ASSERT_EQ(decoder.read_string(stream), "apple");
+
+    ASSERT_FALSE(decoder.has_next(stream));
+}
+
+TEST_F(DictionaryTest,
+       DictionaryEncoderAndDecoderLargeQuantitiesWithRandomStrings) {
+    DictionaryEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DICENCODE_OBJ);
+    encoder.init();
+
+    // Prepare random string generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> length_dist(5, 20);  // String length range
+    std::uniform_int_distribution<> char_dist(33,
+                                              126);  // Printable ASCII range
+
+    // Generate 10000 random strings
+    const int num_strings = 10000;
+    std::vector<std::string> test_strings;
+    std::unordered_set<std::string> string_set;  // For ensuring uniqueness
+
+    while (test_strings.size() < num_strings) {
+        int length = length_dist(gen);
+        std::string str;
+        str.reserve(length);
+
+        for (int i = 0; i < length; ++i) {
+            str.push_back(static_cast<char>(char_dist(gen)));
+        }
+
+        // Ensure string uniqueness
+        if (string_set.insert(str).second) {
+            test_strings.push_back(str);
+        }

Review Comment:
   Better to add a test where strings may repeat.



##########
cpp/src/encoding/int32_rle_decoder.h:
##########
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef ENCODING_INT32RLE_DECODER_H
+#define ENCODING_INT32RLE_DECODER_H
+
+#include <vector>
+
+#include "common/allocator/alloc_base.h"
+#include "decoder.h"
+#include "encoder.h"
+#include "encoding/encode_utils.h"
+#include "encoding/int32_packer.h"
+
+namespace storage {
+
+class Int32RleDecoder : public Decoder {
+   private:
+    uint32_t length_;
+    uint32_t bit_width_;
+    int bitpacking_num_;
+    bool is_length_and_bitwidth_readed_;
+    int current_count_;
+    common::ByteStream byte_cache_;
+    int32_t *current_buffer_;
+    Int32Packer *packer_;
+    uint8_t *tmp_buf_;
+
+   public:
+    Int32RleDecoder()
+        : length_(0),
+          bit_width_(0),
+          bitpacking_num_(0),
+          is_length_and_bitwidth_readed_(false),
+          current_count_(0),
+          byte_cache_(1024, common::MOD_DECODER_OBJ),
+          current_buffer_(nullptr),
+          packer_(nullptr),
+          tmp_buf_(nullptr) {}
+    ~Int32RleDecoder() override { destroy(); }
+
+    bool has_remaining() override { return has_next_package(); }
+    int read_boolean(bool &ret_value, common::ByteStream &in) {
+        int32_t bool_value;
+        read_int32(bool_value, in);
+        ret_value = bool_value == 0 ? false : true;
+        return common::E_OK;
+    }
+    int read_int32(int32_t &ret_value, common::ByteStream &in) override {
+        ret_value = static_cast<int32_t>(read_int(in));
+        return common::E_OK;
+    }
+    int read_int64(int64_t &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_float(float &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_double(double &ret_value, common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+    int read_String(common::String &ret_value, common::PageArena &pa,
+                    common::ByteStream &in) override {
+        return common::E_TYPE_NOT_MATCH;
+    }
+
+    void init() {
+        packer_ = nullptr;
+        is_length_and_bitwidth_readed_ = false;
+        length_ = 0;
+        bit_width_ = 0;
+        bitpacking_num_ = 0;
+        current_count_ = 0;
+    }
+
+    bool has_next(common::ByteStream &buffer) {
+        if (current_count_ > 0 || buffer.remaining_size() > 0 ||
+            has_next_package()) {
+            return true;
+        }
+        return false;
+    }
+
+    bool has_next_package() {
+        return current_count_ > 0 || byte_cache_.remaining_size() > 0;
+    }
+
+    int32_t read_int(common::ByteStream &buffer) {
+        if (!is_length_and_bitwidth_readed_) {
+            // start to reader a new rle+bit-packing pattern
+            read_length_and_bitwidth(buffer);
+        }
+        if (current_count_ == 0) {
+            uint8_t header;
+            int ret = common::E_OK;
+            if (RET_FAIL(
+                    common::SerializationUtil::read_ui8(header, byte_cache_))) 
{
+                return ret;
+            }
+            call_read_bit_packing_buffer(header);
+        }
+        --current_count_;
+        int32_t result = current_buffer_[bitpacking_num_ - current_count_ - 1];
+        if (!has_next_package()) {
+            is_length_and_bitwidth_readed_ = false;
+        }
+        return result;
+    }
+
+    int call_read_bit_packing_buffer(uint8_t header) {
+        int bit_packed_group_count = (int)(header >> 1);
+        // in last bit-packing group, there may be some useless value,
+        // lastBitPackedNum indicates how many values is useful
+        uint8_t last_bit_packed_num;
+        int ret = common::E_OK;
+        if (RET_FAIL(common::SerializationUtil::read_ui8(last_bit_packed_num,
+                                                         byte_cache_))) {
+            return ret;
+        }
+        if (bit_packed_group_count > 0) {
+            current_count_ =
+                (bit_packed_group_count - 1) * 8 + last_bit_packed_num;
+            bitpacking_num_ = current_count_;
+        } else {
+            printf(
+                "tsfile-encoding IntRleDecoder: bit_packed_group_count %d, "
+                "smaller "
+                "than 1",
+                bit_packed_group_count);
+        }
+        read_bit_packing_buffer(bit_packed_group_count, last_bit_packed_num);
+        return ret;
+    }
+
+    void read_bit_packing_buffer(int bit_packed_group_count,
+                                 int last_bit_packed_num) {
+        if (current_buffer_ != nullptr) {
+            delete[] current_buffer_;
+        }
+        current_buffer_ = new int32_t[bit_packed_group_count * 8];

Review Comment:
   common::mem_alloc and check OOM?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to