ColinLeeo commented on code in PR #748:
URL: https://github.com/apache/tsfile/pull/748#discussion_r3015043042


##########
cpp/src/common/tablet.h:
##########
@@ -46,14 +46,78 @@ class TabletColIterator;
  * with their associated metadata such as column names and types.
  */
 class Tablet {
+    // Arrow-style string column: offsets + contiguous buffer.
+    // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i]
+    struct StringColumn {
+        uint32_t* offsets;      // length: max_rows + 1
+        char* buffer;           // contiguous string data
+        uint32_t buf_capacity;  // allocated buffer size
+        uint32_t buf_used;      // bytes written so far
+
+        StringColumn()
+            : offsets(nullptr), buffer(nullptr), buf_capacity(0), buf_used(0) 
{}
+
+        void init(uint32_t max_rows, uint32_t init_buf_capacity) {
+            offsets = (uint32_t*)common::mem_alloc(
+                sizeof(uint32_t) * (max_rows + 1), common::MOD_DEFAULT);
+            offsets[0] = 0;
+            buf_capacity = init_buf_capacity;
+            buffer =
+                (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT);
+            buf_used = 0;
+        }
+
+        void destroy() {
+            if (offsets) common::mem_free(offsets);
+            offsets = nullptr;
+            if (buffer) common::mem_free(buffer);
+            buffer = nullptr;
+            buf_capacity = buf_used = 0;
+        }
+
+        void reset() {
+            buf_used = 0;
+            if (offsets) offsets[0] = 0;
+        }
+
+        void append(uint32_t row, const char* data, uint32_t len) {
+            // Grow buffer if needed
+            if (buf_used + len > buf_capacity) {
+                buf_capacity = buf_capacity * 2 + len;
+                buffer = (char*)common::mem_realloc(buffer, buf_capacity);
+            }
+            memcpy(buffer + buf_used, data, len);
+            offsets[row] = buf_used;
+            offsets[row + 1] = buf_used + len;
+            buf_used += len;
+        }

Review Comment:
   The Arrow-style offsets buffer requires that each row be appended in order. 
If we allow two rows to share the same segment of data (i.e., offsets[row] == 
offsets[row-1] and offsets[row+1] == offsets[row]), then we must handle this 
kind of “zero-length range” either in the offset-normalization path of 
set_column_string_values or through special handling during writing.
   
   This would introduce additional branching and complexity into all downstream 
logic (including Arrow conversion). 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to