jt2594838 commented on code in PR #748:
URL: https://github.com/apache/tsfile/pull/748#discussion_r2999946885
##########
cpp/src/common/tablet.cc:
##########
@@ -444,6 +444,57 @@ void Tablet::set_column_categories(
}
}
+void Tablet::reset_string_columns() {
+ size_t schema_count = schema_vec_->size();
+ for (size_t c = 0; c < schema_count; c++) {
+ const MeasurementSchema& schema = schema_vec_->at(c);
+ if (schema.data_type_ == STRING || schema.data_type_ == TEXT ||
+ schema.data_type_ == BLOB) {
+ value_matrix_[c].string_col->reset();
+ }
+ }
+}
+
+std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
+ const uint32_t row_count = get_cur_row_size();
+ if (row_count <= 1) return {};
+
+ // Use uint64_t bitmap instead of vector<bool> for faster set/test/scan.
+ const uint32_t nwords = (row_count + 63) / 64;
+ std::vector<uint64_t> boundary(nwords, 0);
+
+ for (auto col_idx : id_column_indexes_) {
+ const StringColumn& sc = *value_matrix_[col_idx].string_col;
+ const uint32_t* off = sc.offsets;
+ const char* buf = sc.buffer;
+ for (uint32_t i = 1; i < row_count; i++) {
+ if (boundary[i >> 6] & (1ULL << (i & 63))) continue;
+ uint32_t len_a = off[i] - off[i - 1];
+ uint32_t len_b = off[i + 1] - off[i];
+ if (len_a != len_b ||
+ (len_a > 0 &&
+ memcmp(buf + off[i - 1], buf + off[i], len_a) != 0)) {
+ boundary[i >> 6] |= (1ULL << (i & 63));
+ }
+ }
+ }
Review Comment:
Consider traversing the tag columns in reverse order, because we tend to
organize tags from coarse (like country) to fine (like street).
Within the same TsFile or write batch, differences are more likely to show up
in the finer tags, so checking them first marks boundaries sooner and lets the
remaining columns skip those rows.
##########
cpp/src/common/tablet.h:
##########
@@ -46,14 +46,78 @@ class TabletColIterator;
* with their associated metadata such as column names and types.
*/
class Tablet {
+ // Arrow-style string column: offsets + contiguous buffer.
+ // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i]
+ struct StringColumn {
+ uint32_t* offsets; // length: max_rows + 1
+ char* buffer; // contiguous string data
+ uint32_t buf_capacity; // allocated buffer size
+ uint32_t buf_used; // bytes written so far
+
+ StringColumn()
+ : offsets(nullptr), buffer(nullptr), buf_capacity(0), buf_used(0)
{}
+
+ void init(uint32_t max_rows, uint32_t init_buf_capacity) {
+ offsets = (uint32_t*)common::mem_alloc(
+ sizeof(uint32_t) * (max_rows + 1), common::MOD_DEFAULT);
+ offsets[0] = 0;
+ buf_capacity = init_buf_capacity;
+ buffer =
+ (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT);
+ buf_used = 0;
+ }
+
+ void destroy() {
+ if (offsets) common::mem_free(offsets);
+ offsets = nullptr;
+ if (buffer) common::mem_free(buffer);
+ buffer = nullptr;
+ buf_capacity = buf_used = 0;
+ }
+
+ void reset() {
+ buf_used = 0;
+ if (offsets) offsets[0] = 0;
+ }
+
+ void append(uint32_t row, const char* data, uint32_t len) {
+ // Grow buffer if needed
+ if (buf_used + len > buf_capacity) {
+ buf_capacity = buf_capacity * 2 + len;
+ buffer = (char*)common::mem_realloc(buffer, buf_capacity);
+ }
+ memcpy(buffer + buf_used, data, len);
+ offsets[row] = buf_used;
+ offsets[row + 1] = buf_used + len;
+ buf_used += len;
+ }
Review Comment:
If `data` equals the value of the previous row, we could simply reuse the same
offsets and avoid a memory copy.
However, if the comparison runs often but rarely lets us skip the copy, we
should stop comparing.
##########
cpp/src/common/tablet.cc:
##########
@@ -98,10 +98,9 @@ int Tablet::init() {
case BLOB:
case TEXT:
case STRING: {
- value_matrix_[c].string_data =
- static_cast<common::String*>(common::mem_alloc(
- sizeof(String) * max_row_num_, common::MOD_TABLET));
- if (value_matrix_[c].string_data == nullptr) return E_OOM;
+ auto* sc = new StringColumn();
+ sc->init(max_row_num_, max_row_num_ * 32);
+ value_matrix_[c].string_col = sc;
Review Comment:
Should this use `common::mem_alloc` instead of `new`, as the code it replaces did?
##########
cpp/src/common/tablet.cc:
##########
@@ -444,6 +444,57 @@ void Tablet::set_column_categories(
}
}
+void Tablet::reset_string_columns() {
+ size_t schema_count = schema_vec_->size();
+ for (size_t c = 0; c < schema_count; c++) {
+ const MeasurementSchema& schema = schema_vec_->at(c);
+ if (schema.data_type_ == STRING || schema.data_type_ == TEXT ||
+ schema.data_type_ == BLOB) {
+ value_matrix_[c].string_col->reset();
+ }
+ }
+}
+
+std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
+ const uint32_t row_count = get_cur_row_size();
+ if (row_count <= 1) return {};
+
+ // Use uint64_t bitmap instead of vector<bool> for faster set/test/scan.
+ const uint32_t nwords = (row_count + 63) / 64;
+ std::vector<uint64_t> boundary(nwords, 0);
+
+ for (auto col_idx : id_column_indexes_) {
+ const StringColumn& sc = *value_matrix_[col_idx].string_col;
+ const uint32_t* off = sc.offsets;
+ const char* buf = sc.buffer;
+ for (uint32_t i = 1; i < row_count; i++) {
+ if (boundary[i >> 6] & (1ULL << (i & 63))) continue;
+ uint32_t len_a = off[i] - off[i - 1];
+ uint32_t len_b = off[i + 1] - off[i];
+ if (len_a != len_b ||
+ (len_a > 0 &&
+ memcmp(buf + off[i - 1], buf + off[i], len_a) != 0)) {
+ boundary[i >> 6] |= (1ULL << (i & 63));
Review Comment:
If the number of boundaries reaches the number of rows, we can break out of the loop early.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]