Copilot commented on code in PR #764:
URL: https://github.com/apache/tsfile/pull/764#discussion_r3021291002
##########
cpp/test/reader/tree_view/tsfile_reader_tree_test.cc:
##########
@@ -425,3 +426,99 @@ TEST_F(TsFileTreeReaderTest, ExtendedRowsAndColumnsTest) {
delete measurement;
}
}
+
+// Regression test: query_table_on_tree on a device path with three or more
+// dot-segments (e.g. "root.sensors.TH") previously SEGVed because:
+// 1. StringArrayDeviceID split "root.sensors.TH" into ["root","sensors","TH"]
+// instead of the correct ["root.sensors","TH"], so get_table_name()
returned
+// "root" instead of "root.sensors".
+// 2. load_device_index_entry used operator[] on the table map which inserted a
+// null entry, then asserted on it.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeDeepDevicePath) {
+ TsFileTreeWriter writer(&write_file_);
+ // Device paths with 3 dot-segments: table_name="root.sensors", device="TH"
+ std::string device_id = "root.sensors.TH";
+ std::string m_temp = "temperature";
+ std::string m_humi = "humidity";
+ auto* ms_temp = new MeasurementSchema(m_temp, INT32);
+ auto* ms_humi = new MeasurementSchema(m_humi, INT32);
+ ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_temp));
+ ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_humi));
+ delete ms_temp;
+ delete ms_humi;
+
+ for (int ts = 0; ts < 5; ts++) {
+ TsRecord rec(device_id, ts);
+ rec.add_point(m_temp, static_cast<int32_t>(20 + ts));
+ rec.add_point(m_humi, static_cast<int32_t>(50 + ts));
+ ASSERT_EQ(E_OK, writer.write(rec));
+ }
+ writer.flush();
+ writer.close();
+
+ TsFileReader reader;
+ ASSERT_EQ(E_OK, reader.open(file_name_));
+ ResultSet* result;
+ // query_table_on_tree used to SEGV here due to wrong table-name lookup
+ ASSERT_EQ(E_OK,
+ reader.query_table_on_tree({m_temp, m_humi}, INT64_MIN,
+ INT64_MAX, result));
Review Comment:
For consistency with the other test in this file (and to avoid any
accidental use of an uninitialized pointer if this code is modified later),
initialize `result` to `nullptr` before passing it to `query_table_on_tree`.
##########
cpp/test/reader/tree_view/tsfile_reader_tree_test.cc:
##########
@@ -425,3 +426,99 @@ TEST_F(TsFileTreeReaderTest, ExtendedRowsAndColumnsTest) {
delete measurement;
}
}
+
+// Regression test: query_table_on_tree on a device path with three or more
+// dot-segments (e.g. "root.sensors.TH") previously SEGVed because:
+// 1. StringArrayDeviceID split "root.sensors.TH" into ["root","sensors","TH"]
+// instead of the correct ["root.sensors","TH"], so get_table_name()
returned
+// "root" instead of "root.sensors".
+// 2. load_device_index_entry used operator[] on the table map which inserted a
+// null entry, then asserted on it.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeDeepDevicePath) {
+ TsFileTreeWriter writer(&write_file_);
+ // Device paths with 3 dot-segments: table_name="root.sensors", device="TH"
+ std::string device_id = "root.sensors.TH";
+ std::string m_temp = "temperature";
+ std::string m_humi = "humidity";
+ auto* ms_temp = new MeasurementSchema(m_temp, INT32);
+ auto* ms_humi = new MeasurementSchema(m_humi, INT32);
+ ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_temp));
+ ASSERT_EQ(E_OK, writer.register_timeseries(device_id, ms_humi));
+ delete ms_temp;
+ delete ms_humi;
+
+ for (int ts = 0; ts < 5; ts++) {
+ TsRecord rec(device_id, ts);
+ rec.add_point(m_temp, static_cast<int32_t>(20 + ts));
+ rec.add_point(m_humi, static_cast<int32_t>(50 + ts));
+ ASSERT_EQ(E_OK, writer.write(rec));
+ }
+ writer.flush();
+ writer.close();
+
+ TsFileReader reader;
+ ASSERT_EQ(E_OK, reader.open(file_name_));
+ ResultSet* result;
+ // query_table_on_tree used to SEGV here due to wrong table-name lookup
+ ASSERT_EQ(E_OK,
+ reader.query_table_on_tree({m_temp, m_humi}, INT64_MIN,
+ INT64_MAX, result));
+
+ auto* trs = static_cast<storage::TableResultSet*>(result);
+ bool has_next = false;
+ int row_cnt = 0;
+ while (IS_SUCC(trs->next(has_next)) && has_next) {
+ row_cnt++;
+ }
+ EXPECT_EQ(row_cnt, 5);
+ reader.destroy_query_data_set(result);
+ reader.close();
+}
+
+// Regression test: load_device_index_entry previously used operator[] to look
+// up the table node, which silently inserted a null entry and then asserted.
+// After the fix it uses find() and returns E_DEVICE_NOT_EXIST gracefully.
+// This is triggered when querying a measurement that no device in the file
has.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeMissingMeasurement) {
+ // Use the same multi-device setup as ReadTreeByTable to ensure a valid
file.
+ TsFileTreeWriter writer(&write_file_);
+ std::vector<std::string> device_ids = {"root.db1.t1", "root.db2.t1"};
+ std::string m_temp = "temperature";
+ for (auto dev : device_ids) {
+ auto* ms = new MeasurementSchema(m_temp, INT32);
+ ASSERT_EQ(E_OK, writer.register_timeseries(dev, ms));
+ delete ms;
+ TsRecord rec(dev, 0);
+ rec.add_point(m_temp, static_cast<int32_t>(25));
+ ASSERT_EQ(E_OK, writer.write(rec));
+ }
+ writer.flush();
+ writer.close();
+
+ TsFileReader reader;
+ ASSERT_EQ(E_OK, reader.open(file_name_));
+ ResultSet* result = nullptr;
+ // "nonexistent" is not present in any device. Before the fix,
+ // load_device_index_entry used operator[] which inserted null and crashed.
+ // After the fix it returns E_DEVICE_NOT_EXIST or E_COLUMN_NOT_EXIST.
+ int ret = reader.query_table_on_tree({"nonexistent"}, INT64_MIN,
+ INT64_MAX, result);
+ EXPECT_NE(ret, E_OK); // Must not succeed (measurement not found)
+ if (result != nullptr) {
+ reader.destroy_query_data_set(result);
+ }
+ reader.close();
+}
+
+TEST_F(TsFileTreeReaderTest, simpletest) {
+ TsFileReader reader;
+
reader.open("/Users/colin/Library/Containers/com.tencent.xinWeChat/Data/Documents/xwechat_files/wxid_197w1jpv66ag22_cc63/msg/file/2026-03/1761643915818-1-0-0.tsfile");
+ ResultSet* result;
+ int ret = reader.query_table_on_tree({"t", "h"}, INT64_MIN,
+ INT64_MAX, result);
+ ASSERT_EQ(ret, E_OK);
+
+ auto* table_result_set = (storage::TableResultSet*)result;
+ bool has_next = false;
+ print_table_result_set((table_result_set));
+}
Review Comment:
This unit test is not hermetic: it depends on an absolute, machine-specific
path and will fail in CI/other dev machines. It also doesn’t clean up the
`ResultSet` via `destroy_query_data_set(...)` or `reader.close()` on success.
Please remove this test, or convert it into a disabled/manual test that uses
repo testdata (checked-in fixture) and performs proper cleanup.
```suggestion
```
##########
cpp/src/encoding/int64_rle_decoder.h:
##########
@@ -122,6 +126,34 @@ class Int64RleDecoder : public Decoder {
return result;
}
+ int call_read_rle_run(uint8_t header) {
+ int ret = common::E_OK;
+ int run_length = (int)(header >> 1);
Review Comment:
Because `call_read_rle_run` takes `uint8_t header`, `run_length` is capped
to 127 and cannot represent larger runs even if the encoded header is a
multi-byte varint. This should take the full decoded varint header value (e.g.,
`uint32_t`/`uint64_t header_value`) and compute `run_length = header_value >>
1` to support large run counts.
##########
cpp/src/encoding/int64_rle_decoder.h:
##########
@@ -112,7 +112,11 @@ class Int64RleDecoder : public Decoder {
common::SerializationUtil::read_ui8(header, byte_cache_)))
{
return ret;
}
- call_read_bit_packing_buffer(header);
+ if (header & 1) {
+ call_read_bit_packing_buffer(header);
+ } else {
+ call_read_rle_run(header);
+ }
Review Comment:
The Int64 decoder still reads the run header as a single byte (`read_ui8`).
In the RLE/bit-packing hybrid format, the group header is a *varint*, so run
counts like 64+ require multi-byte headers (same regression you fixed in
`Int32RleDecoder`). This will mis-decode/lose alignment for larger run counts.
Update this to read an unsigned varint header value (e.g., `read_var_uint(...)`
into a wider integer type) and pass that through to both the RLE-run and
bit-packing paths.
##########
cpp/test/encoding/int32_rle_codec_test.cc:
##########
@@ -164,4 +164,131 @@ TEST_F(Int32RleEncoderTest, EncodeFlushWithoutData) {
EXPECT_EQ(stream.total_size(), 0u);
}
+// Helper: write a manually crafted RLE segment (Java/Parquet hybrid RLE
format):
+// [length_varint] [bit_width] [group_header_varint] [value_bytes...]
+// run_count must be the actual count (written as (run_count<<1)|0 varint).
+static void write_rle_segment(common::ByteStream& stream, uint8_t bit_width,
+ uint32_t run_count, int32_t value) {
+ common::ByteStream content(32, common::MOD_ENCODER_OBJ);
+ common::SerializationUtil::write_ui8(bit_width, content);
+ // Group header: (run_count << 1) | 0 = even varint
+ common::SerializationUtil::write_var_uint(run_count << 1, content);
+ // Value: ceil(bit_width / 8) bytes, little-endian
+ int byte_width = (bit_width + 7) / 8;
+ for (int i = 0; i < byte_width; i++) {
+ common::SerializationUtil::write_ui8((value >> (i * 8)) & 0xFF,
Review Comment:
This extracts bytes from a signed `int32_t` using right-shift.
Right-shifting negative signed integers is implementation-defined in C++. Even
though current test values are positive, this helper is generic and could be
reused; consider casting to `uint32_t` before shifting (or using a `uint32_t u
= static_cast<uint32_t>(value)` for extraction) to make the encoding
well-defined.
```suggestion
uint32_t u = static_cast<uint32_t>(value);
for (int i = 0; i < byte_width; i++) {
common::SerializationUtil::write_ui8((u >> (i * 8)) & 0xFF,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]