Re: [PR] Remove timestamps cache to optimize init phase [tsfile]

via GitHub Wed, 08 Apr 2026 21:30:51 -0700


jt2594838 commented on code in PR #769:
URL: https://github.com/apache/tsfile/pull/769#discussion_r3055468541



##########
python/tsfile/dataset/reader.py:
##########
@@ -138,51 +146,64 @@ def _cache_metadata_table_model(self):
                 continue
 
             table_id = self._catalog.add_table(table_name, tag_columns, 
tag_types, field_columns)
-            time_arrays = []
-            tag_arrays = {tag_column: [] for tag_column in tag_columns}
-
-            # [Temporary] It will be replaced by new tsfile api, we won't 
query all the data later.
-            query_columns = tag_columns + field_columns
-
-            with self._reader.query_table(table_name, query_columns, 
batch_size=65536) as result_set:
-                while True:
-                    arrow_table = result_set.read_arrow_batch()
-                    if arrow_table is None:
-                        break
-                    batch_rows = arrow_table.num_rows
-                    total_rows += batch_rows
-                    time_arrays.append(arrow_table.column("time").to_numpy())
-                    for tag_column in tag_columns:
-                        
tag_arrays[tag_column].append(arrow_table.column(tag_column).to_numpy())
-
-                    if self.show_progress:
-                        sys.stderr.write(
-                            f"\rReading TsFile metadata: table {table_index + 
1}/{len(table_names)} "
-                            f"[{table_name}] ({total_rows:,} rows)"
-                        )
-                        sys.stderr.flush()
-
-            if not time_arrays:
-                continue
-
-            timestamps = np.concatenate(time_arrays).astype(np.int64)
-            if not tag_columns:
-                self._add_device(table_id, (), timestamps)
-                continue
-
-            for tag_values, device_timestamps in 
self._iter_device_groups(tag_columns, timestamps, tag_arrays):
-                self._add_device(table_id, tag_values, device_timestamps)
-
-        if self.show_progress and total_rows > 0:
+            table_groups = [
+                group
+                for group in metadata_groups.values()
+                if (group.table_name or "").lower() == table_name.lower()
+            ]
+            table_groups.sort(key=lambda group: tuple("" if value is None else 
str(value) for value in group.segments))
+
+            for group in table_groups:
+                stats = self._metadata_device_stats(group)
+                if stats is None:
+                    continue
+                tag_values = self._metadata_tag_values(group, len(tag_columns))
+                if len(tag_values) != len(tag_columns):
+                    continue

Review Comment:
   It is expected that a DeviceId can be a prefix of the full list of tag
columns, so it is unnecessary to skip it.



##########
python/tsfile/dataset/timeseries.py:
##########
@@ -101,32 +103,29 @@ def __len__(self) -> int:
         return self._stats["count"]
 
     def __getitem__(self, key):
-        timestamps = self.timestamps
-        length = len(timestamps)
+        self._ensure_open()
+        length = len(self)
 
         if isinstance(key, int):
             if key < 0:
                 key += length
             if key < 0 or key >= length:
                 raise IndexError(f"Index {key} out of range [0, {length})")
-            ts = int(timestamps[key])
-            _, values = self._query_time_range(ts, ts)
+            _, values = self._read_by_position(key, 1)
             return float(values[0]) if len(values) > 0 else None
 
         if isinstance(key, slice):
-            requested_ts = timestamps[key]
-            if len(requested_ts) == 0:
+            positions = list(range(*key.indices(length)))
+            if not positions:

Review Comment:
   If the step is 1 or not set, isn't it unnecessary to list all positions?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Remove timestamps cache to optimize init phase [tsfile]

Reply via email to