Re: [PR] Fix/tree query and rle decoder [tsfile]

via GitHub Wed, 01 Apr 2026 03:47:53 -0700


Copilot commented on code in PR #764:
URL: https://github.com/apache/tsfile/pull/764#discussion_r3021291002



##########
cpp/test/reader/tree_view/tsfile_reader_tree_test.cc:
##########
@@ -425,3 +426,99 @@ TEST_F(TsFileTreeReaderTest, ExtendedRowsAndColumnsTest) {
         delete measurement;
     }
 }
+
+// Regression test: query_table_on_tree on a device path with three or more
+// dot-segments (e.g. "root.sensors.TH") previously SEGVed because:
+// 1. StringArrayDeviceID split "root.sensors.TH" into ["root","sensors","TH"]
+//    instead of the correct ["root.sensors","TH"], so get_table_name() 
returned
+//    "root" instead of "root.sensors".
+// 2. load_device_index_entry used operator[] on the table map which inserted a
+//    null entry, then asserted on it.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeDeepDevicePath) {
+    TsFileTreeWriter writer(&write_file_);
+    // Device paths with 3 dot-segments: table_name="root.sensors", device="TH"
+    std::string device_id = "root.sensors.TH";
+    std::string m_temp = "temperature";
+    std::string m_humi = "humidity";
+    auto* ms_temp = new MeasurementSchema(m_temp, INT32);
+    auto* ms_humi = new MeasurementSchema(m_humi, INT32);
+    ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_temp));
+    ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_humi));
+    delete ms_temp;
+    delete ms_humi;
+
+    for (int ts = 0; ts < 5; ts++) {
+        TsRecord rec(device_id, ts);
+        rec.add_point(m_temp, static_cast<int32_t>(20 + ts));
+        rec.add_point(m_humi, static_cast<int32_t>(50 + ts));
+        ASSERT_EQ(E_OK, writer.write(rec));
+    }
+    writer.flush();
+    writer.close();
+
+    TsFileReader reader;
+    ASSERT_EQ(E_OK, reader.open(file_name_));
+    ResultSet* result;
+    // query_table_on_tree used to SEGV here due to wrong table-name lookup
+    ASSERT_EQ(E_OK,
+              reader.query_table_on_tree({m_temp, m_humi}, INT64_MIN,
+                                         INT64_MAX, result));

Review Comment:
   For consistency with the other test in this file (and to avoid any 
accidental use of an uninitialized pointer if this code is modified later), 
initialize `result` to `nullptr` before passing it to `query_table_on_tree`.



##########
cpp/test/reader/tree_view/tsfile_reader_tree_test.cc:
##########
@@ -425,3 +426,99 @@ TEST_F(TsFileTreeReaderTest, ExtendedRowsAndColumnsTest) {
         delete measurement;
     }
 }
+
+// Regression test: query_table_on_tree on a device path with three or more
+// dot-segments (e.g. "root.sensors.TH") previously SEGVed because:
+// 1. StringArrayDeviceID split "root.sensors.TH" into ["root","sensors","TH"]
+//    instead of the correct ["root.sensors","TH"], so get_table_name() 
returned
+//    "root" instead of "root.sensors".
+// 2. load_device_index_entry used operator[] on the table map which inserted a
+//    null entry, then asserted on it.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeDeepDevicePath) {
+    TsFileTreeWriter writer(&write_file_);
+    // Device paths with 3 dot-segments: table_name="root.sensors", device="TH"
+    std::string device_id = "root.sensors.TH";
+    std::string m_temp = "temperature";
+    std::string m_humi = "humidity";
+    auto* ms_temp = new MeasurementSchema(m_temp, INT32);
+    auto* ms_humi = new MeasurementSchema(m_humi, INT32);
+    ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_temp));
+    ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_humi));
+    delete ms_temp;
+    delete ms_humi;
+
+    for (int ts = 0; ts < 5; ts++) {
+        TsRecord rec(device_id, ts);
+        rec.add_point(m_temp, static_cast<int32_t>(20 + ts));
+        rec.add_point(m_humi, static_cast<int32_t>(50 + ts));
+        ASSERT_EQ(E_OK, writer.write(rec));
+    }
+    writer.flush();
+    writer.close();
+
+    TsFileReader reader;
+    ASSERT_EQ(E_OK, reader.open(file_name_));
+    ResultSet* result;
+    // query_table_on_tree used to SEGV here due to wrong table-name lookup
+    ASSERT_EQ(E_OK,
+              reader.query_table_on_tree({m_temp, m_humi}, INT64_MIN,
+                                         INT64_MAX, result));
+
+    auto* trs = static_cast<storage::TableResultSet*>(result);
+    bool has_next = false;
+    int row_cnt = 0;
+    while (IS_SUCC(trs->next(has_next)) && has_next) {
+        row_cnt++;
+    }
+    EXPECT_EQ(row_cnt, 5);
+    reader.destroy_query_data_set(result);
+    reader.close();
+}
+
+// Regression test: load_device_index_entry previously used operator[] to look
+// up the table node, which silently inserted a null entry and then asserted.
+// After the fix it uses find() and returns E_DEVICE_NOT_EXIST gracefully.
+// This is triggered when querying a measurement that no device in the file 
has.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeMissingMeasurement) {
+    // Use the same multi-device setup as ReadTreeByTable to ensure a valid 
file.
+    TsFileTreeWriter writer(&write_file_);
+    std::vector<std::string> device_ids = {"root.db1.t1", "root.db2.t1"};
+    std::string m_temp = "temperature";
+    for (auto dev : device_ids) {
+        auto* ms = new MeasurementSchema(m_temp, INT32);
+        ASSERT_EQ(E_OK, writer.register_timeseries(dev, ms));
+        delete ms;
+        TsRecord rec(dev, 0);
+        rec.add_point(m_temp, static_cast<int32_t>(25));
+        ASSERT_EQ(E_OK, writer.write(rec));
+    }
+    writer.flush();
+    writer.close();
+
+    TsFileReader reader;
+    ASSERT_EQ(E_OK, reader.open(file_name_));
+    ResultSet* result = nullptr;
+    // "nonexistent" is not present in any device. Before the fix,
+    // load_device_index_entry used operator[] which inserted null and crashed.
+    // After the fix it returns E_DEVICE_NOT_EXIST or E_COLUMN_NOT_EXIST.
+    int ret = reader.query_table_on_tree({"nonexistent"}, INT64_MIN,
+                                          INT64_MAX, result);
+    EXPECT_NE(ret, E_OK);  // Must not succeed (measurement not found)
+    if (result != nullptr) {
+        reader.destroy_query_data_set(result);
+    }
+    reader.close();
+}
+
+TEST_F(TsFileTreeReaderTest, simpletest) {
+    TsFileReader reader;
+    
reader.open("/Users/colin/Library/Containers/com.tencent.xinWeChat/Data/Documents/xwechat_files/wxid_197w1jpv66ag22_cc63/msg/file/2026-03/1761643915818-1-0-0.tsfile");
+    ResultSet* result;
+    int ret = reader.query_table_on_tree({"t", "h"}, INT64_MIN,
+                                         INT64_MAX, result);
+    ASSERT_EQ(ret, E_OK);
+
+    auto* table_result_set = (storage::TableResultSet*)result;
+    bool has_next = false;
+    print_table_result_set((table_result_set));
+}

Review Comment:
   This unit test is not hermetic: it depends on an absolute, machine-specific 
path and will fail in CI/other dev machines. It also doesn’t clean up the 
`ResultSet` via `destroy_query_data_set(...)` or `reader.close()` on success. 
Please remove this test, or convert it into a disabled/manual test that uses 
repo testdata (checked-in fixture) and performs proper cleanup.
   ```suggestion
   
   ```



##########
cpp/src/encoding/int64_rle_decoder.h:
##########
@@ -122,6 +126,34 @@ class Int64RleDecoder : public Decoder {
         return result;
     }
 
+    int call_read_rle_run(uint8_t header) {
+        int ret = common::E_OK;
+        int run_length = (int)(header >> 1);

Review Comment:
   Because `call_read_rle_run` takes `uint8_t header`, `run_length` is capped 
to 127 and cannot represent larger runs even if the encoded header is a 
multi-byte varint. This should take the full decoded varint header value (e.g., 
`uint32_t`/`uint64_t header_value`) and compute `run_length = header_value >> 
1` to support large run counts.



##########
cpp/src/encoding/int64_rle_decoder.h:
##########
@@ -112,7 +112,11 @@ class Int64RleDecoder : public Decoder {
                     common::SerializationUtil::read_ui8(header, byte_cache_))) 
{
                 return ret;
             }
-            call_read_bit_packing_buffer(header);
+            if (header & 1) {
+                call_read_bit_packing_buffer(header);
+            } else {
+                call_read_rle_run(header);
+            }

Review Comment:
   The Int64 decoder still reads the run header as a single byte (`read_ui8`). 
In the RLE/bit-packing hybrid format, the group header is a *varint*, so run 
counts like 64+ require multi-byte headers (same regression you fixed in 
`Int32RleDecoder`). This will mis-decode/lose alignment for larger run counts. 
Update this to read an unsigned varint header value (e.g., `read_var_uint(...)` 
into a wider integer type) and pass that through to both the RLE-run and 
bit-packing paths.



##########
cpp/test/encoding/int32_rle_codec_test.cc:
##########
@@ -164,4 +164,131 @@ TEST_F(Int32RleEncoderTest, EncodeFlushWithoutData) {
     EXPECT_EQ(stream.total_size(), 0u);
 }
 
+// Helper: write a manually crafted RLE segment (Java/Parquet hybrid RLE 
format):
+//   [length_varint] [bit_width] [group_header_varint] [value_bytes...]
+// run_count must be the actual count (written as (run_count<<1)|0 varint).
+static void write_rle_segment(common::ByteStream& stream, uint8_t bit_width,
+                               uint32_t run_count, int32_t value) {
+    common::ByteStream content(32, common::MOD_ENCODER_OBJ);
+    common::SerializationUtil::write_ui8(bit_width, content);
+    // Group header: (run_count << 1) | 0 = even varint
+    common::SerializationUtil::write_var_uint(run_count << 1, content);
+    // Value: ceil(bit_width / 8) bytes, little-endian
+    int byte_width = (bit_width + 7) / 8;
+    for (int i = 0; i < byte_width; i++) {
+        common::SerializationUtil::write_ui8((value >> (i * 8)) & 0xFF,

Review Comment:
   This extracts bytes from a signed `int32_t` using right-shift. 
Right-shifting negative signed integers is implementation-defined in C++. Even 
though current test values are positive, this helper is generic and could be 
reused; consider casting to `uint32_t` before shifting (or using a `uint32_t u 
= static_cast<uint32_t>(value)` for extraction) to make the encoding 
well-defined.
   ```suggestion
       uint32_t u = static_cast<uint32_t>(value);
       for (int i = 0; i < byte_width; i++) {
           common::SerializationUtil::write_ui8((u >> (i * 8)) & 0xFF,
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix/tree query and rle decoder [tsfile]

Reply via email to