(tsfile) branch develop updated: Fix the bitpack_codec to keep it consistent with the Java version. (#123)

jiangtian Wed, 26 Jun 2024 20:17:06 -0700

This is an automated email from the ASF dual-hosted git repository.

jiangtian pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/tsfile.git



The following commit(s) were added to refs/heads/develop by this push:
     new 1b24436d Fix the bitpack_codec to keep it consistent with the Java version. (#123)
1b24436d is described below

commit 1b24436def419a6720ac37021e58e5906a4db6cb
Author: Hongzhi Gao <[email protected]>
AuthorDate: Thu Jun 27 11:16:36 2024 +0800

    Fix the bitpack_codec to keep it consistent with the Java version. (#123)
    
    * Fix the bitpack_codec to keep it consistent with the Java version.
    
    * fix bitpack codec
---
 cpp/src/encoding/bitpack_decoder.h | 12 +++++++-----
 cpp/src/encoding/bitpack_encoder.h | 16 ++++++++--------
 cpp/src/encoding/intpacker.h       | 27 ++++++++++++++-------------
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/cpp/src/encoding/bitpack_decoder.h b/cpp/src/encoding/bitpack_decoder.h
index 8d431f18..c6aef933 100644
--- a/cpp/src/encoding/bitpack_decoder.h
+++ b/cpp/src/encoding/bitpack_decoder.h
@@ -37,7 +37,7 @@ class BitPackDecoder {
     bool is_length_and_bitwidth_readed_;
     int current_count_;
     common::ByteStream byte_cache_;
-    int *current_buffer_;
+    int64_t *current_buffer_;
     IntPacker *packer_;
     uint8_t *tmp_buf_;
 
@@ -71,7 +71,7 @@ class BitPackDecoder {
         return current_count_ > 0 || byte_cache_.remaining_size() > 0;
     }
 
-    int read_int(common::ByteStream &buffer) {
+    int64_t read_int(common::ByteStream &buffer) {
         if (!is_length_and_bitwidth_readed_) {
             // start to reader a new rle+bit-packing pattern
             read_length_and_bitwidth(buffer);
@@ -86,7 +86,7 @@ class BitPackDecoder {
             call_read_bit_packing_buffer(header);
         }
         --current_count_;
-        int result = current_buffer_[bitpacking_num_ - current_count_ - 1];
+        int64_t result = current_buffer_[bitpacking_num_ - current_count_ - 1];
         if (!has_next_package()) {
             is_length_and_bitwidth_readed_ = false;
         }
@@ -120,7 +120,7 @@ class BitPackDecoder {
 
     void read_bit_packing_buffer(int bit_packed_group_count,
                                  int last_bit_packed_num) {
-        current_buffer_ = new int[bit_packed_group_count * 8];
+        current_buffer_ = new int64_t[bit_packed_group_count * 8];
         unsigned char bytes[bit_packed_group_count * bit_width_];
         int bytes_to_read = bit_packed_group_count * bit_width_;
         if (bytes_to_read > (int)byte_cache_.remaining_size()) {
@@ -155,7 +155,9 @@ class BitPackDecoder {
             }
             byte_cache_.wrap_from((char *)tmp_buf_, length_);
             is_length_and_bitwidth_readed_ = true;
-            common::SerializationUtil::read_ui32(bit_width_, byte_cache_);
+            uint8_t tmp_bit_width;
+            common::SerializationUtil::read_ui8(tmp_bit_width, byte_cache_);
+            bit_width_ = tmp_bit_width;
             init_packer();
         }
         return ret;
diff --git a/cpp/src/encoding/bitpack_encoder.h b/cpp/src/encoding/bitpack_encoder.h
index 1dc3edbe..85a385bf 100644
--- a/cpp/src/encoding/bitpack_encoder.h
+++ b/cpp/src/encoding/bitpack_encoder.h
@@ -37,8 +37,8 @@ class BitPackEncoder {
     int bit_width_;
     IntPacker *packer_;
     common::ByteStream byte_cache_;
-    std::vector<int> values_;  // all data tobe encoded
-    int buffered_values_[8];   // encode each 8 values
+    std::vector<int64_t> values_;  // all data tobe encoded
+    int64_t buffered_values_[8];   // encode each 8 values
     std::vector<unsigned char> bytes_buffer_;
 
    public:
@@ -72,7 +72,7 @@ class BitPackEncoder {
         packer_ = nullptr;
     }
 
-    FORCE_INLINE void encode(int value, common::ByteStream &out) {
+    FORCE_INLINE void encode(int64_t value, common::ByteStream &out) {
         values_.push_back(value);
     }
 
@@ -81,7 +81,7 @@ class BitPackEncoder {
         bit_width_ = get_int_max_bit_width(values_);
         ASSERT(packer_ == nullptr);
         packer_ = new IntPacker(bit_width_);
-        common::SerializationUtil::write_i32(bit_width_, byte_cache_);
+        common::SerializationUtil::write_i8(bit_width_, byte_cache_);
         for (size_t i = 0; i < values_.size(); i++) {
             // encodeValue(value);
             buffered_values_[num_buffered_values_] = values_[i];
@@ -108,9 +108,9 @@ class BitPackEncoder {
         // TODO: put the bytes on the stack instead on the heap
         unsigned char *bytes = (unsigned char *)common::mem_alloc(
             bit_width_, common::MOD_BITENCODE_OBJ);
-        int tmp_buffer[8];
+        int64_t tmp_buffer[8];
         for (int i = 0; i < 8; i++) {
-            tmp_buffer[i] = (int)buffered_values_[i];
+            tmp_buffer[i] = (int64_t)buffered_values_[i];
         }
         packer_->pack_8values(tmp_buffer, 0, bytes);
         // we'll not writer bit-packing group to OutputStream immediately
@@ -121,12 +121,12 @@ class BitPackEncoder {
         common::mem_free(bytes);
     }
 
-    int get_int_max_bit_width(std::vector<int> values) {
+    int get_int_max_bit_width(std::vector<int64_t> values) {
         // TODO: Optimization - find the maximum value first, and then calcuate
         // the bit width
         int max = 1;
         for (size_t i = 0; i < values.size(); i++) {
-            int bitWidth = 32 - number_of_leading_zeros(values[i]);
+            int bitWidth = 64 - number_of_leading_zeros(values[i]);
             if (bitWidth > max) {
                 max = bitWidth;
             }
diff --git a/cpp/src/encoding/intpacker.h b/cpp/src/encoding/intpacker.h
index 27b73e39..483d0c92 100644
--- a/cpp/src/encoding/intpacker.h
+++ b/cpp/src/encoding/intpacker.h
@@ -39,7 +39,7 @@ class IntPacker {
     void reset() { /* do thing for IntPacker */
     }
 
-    void pack_8values(int values[], int offset, unsigned char buf[]) {
+    void pack_8values(int64_t values[], int offset, unsigned char buf[]) {
         int buf_idx = 0;
         int value_idx = offset;
         // remaining bits for the current unfinished Integer
@@ -47,13 +47,13 @@ class IntPacker {
 
         while (value_idx < NUM_OF_INTS + offset) {
             // buffer is used for saving 32 bits as a part of result
-            int buffer = 0;
+            int64_t buffer = 0;
             // remaining size of bits in the 'buffer'
-            int left_size = 32;
+            int left_size = 64;
 
             // encode the left bits of current Integer to 'buffer'
             if (left_bit > 0) {
-                buffer |= (values[value_idx] << (32 - left_bit));
+                buffer |= (values[value_idx] << (64 - left_bit));
                 left_size -= left_bit;
                 left_bit = 0;
                 value_idx++;
@@ -70,18 +70,19 @@ class IntPacker {
             if (left_size > 0 && value_idx < NUM_OF_INTS + offset) {
                 // put the first 'left_size' bits of the Integer into remaining
                 // space of the buffer
-                buffer |= ((unsigned)values[value_idx] >> (width_ - left_size));
+                buffer |= ((uint64_t)values[value_idx] >> (width_ - left_size));
                 left_bit = width_ - left_size;
             }
 
             // put the buffer into the final result
-            for (int j = 0; j < 4; j++) {
+            for (int j = 0; j < 8; j++) {
                 buf[buf_idx] =
-                    (unsigned char)(((unsigned)buffer >> ((3 - j) * 8)) & 0xFF);
+                    (unsigned char)(((uint64_t)buffer >> ((8 - j - 1) * 8)) &
+                                    0xFF);
                 buf_idx++;
                 // width_ is the bit num of each value, but here is means the
                 // max byte num
-                if (buf_idx >= width_) {
+                if (buf_idx >= width_ * 8 / 8) {
                     return;
                 }
             }
@@ -96,9 +97,9 @@ class IntPacker {
      * @param values - decoded result , the length of 'values' should be @{link
      * IntPacker#NUM_OF_INTS}
      */
-    void unpack_8values(unsigned char buf[], int offset, int values[]) {
+    void unpack_8values(unsigned char buf[], int offset, int64_t values[]) {
         int byte_idx = offset;
-        unsigned long buffer = 0;
+        uint64_t buffer = 0;
         // total bits which have reader from 'buf' to 'buffer'. i.e.,
         // number of available bits to be decoded.
         int total_bits = 0;
@@ -133,16 +134,16 @@ class IntPacker {
      * @param length length of bytes to be decoded in buf.
      * @param values decoded result.
      */
-    void unpack_all_values(unsigned char buf[], int length, int values[]) {
+    void unpack_all_values(unsigned char buf[], int length, int64_t values[]) {
         int idx = 0;
         int k = 0;
         while (idx < length) {
-            int tv[8];
+            int64_t tv[8];
             // decode 8 values one time, current result will be saved in the
             // array named 'tv'
             unpack_8values(buf, idx, tv);
             // System.arraycopy(tv, 0, values, k, 8);
-            std::memmove(values + k, tv, 8 * sizeof(int));
+            std::memmove(values + k, tv, 8 * sizeof(int64_t));
             idx += width_;
             k += 8;
         }

(tsfile) branch develop updated: Fix the bitpack_codec to keep it consistent with the Java version. (#123)

Reply via email to