This is an automated email from the ASF dual-hosted git repository.
colinlee pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/tsfile.git
The following commit(s) were added to refs/heads/develop by this push:
new 237b721e Support dataframe to tsfile (#706)
237b721e is described below
commit 237b721eef251add66a6a2bd59c60c13537c86ab
Author: Colin Lee <[email protected]>
AuthorDate: Sun Feb 15 10:48:28 2026 +0800
Support dataframe to tsfile (#706)
---
cpp/src/common/constant/tsfile_constant.h | 4 +-
cpp/src/common/global.cc | 2 +-
cpp/src/common/tsblock/tuple_desc.h | 30 +-
cpp/src/cwrapper/tsfile_cwrapper.cc | 1 -
cpp/src/cwrapper/tsfile_cwrapper.h | 7 +-
.../reader/block/single_device_tsblock_reader.cc | 8 +
cpp/src/reader/column_mapping.h | 11 +-
cpp/src/reader/table_query_executor.cc | 11 +-
cpp/src/utils/db_utils.h | 7 +-
.../reader/table_view/tsfile_reader_table_test.cc | 81 +++++
.../writer/table_view/tsfile_writer_table_test.cc | 2 +-
python/tests/resources/README.md | 52 ++-
.../tests/resources/table_with_time_column.tsfile | Bin 0 -> 644 bytes
python/tests/test_basic.py | 2 +-
python/tests/test_dataframe.py | 320 +++++++++++++++++
python/tests/test_load_tsfile_from_iotdb.py | 37 +-
python/tests/test_to_tsfile.py | 378 +++++++++++++++++++++
python/tests/test_write_and_read.py | 256 +-------------
python/tsfile/__init__.py | 2 +-
python/tsfile/constants.py | 98 +++++-
python/tsfile/exceptions.py | 2 +-
python/tsfile/schema.py | 64 +++-
python/tsfile/tsfile_cpp.pxd | 8 +-
python/tsfile/tsfile_py_cpp.pxd | 1 +
python/tsfile/tsfile_py_cpp.pyx | 195 ++++++++++-
python/tsfile/tsfile_reader.pyx | 16 +-
python/tsfile/tsfile_table_writer.py | 134 +++++++-
python/tsfile/tsfile_writer.pyx | 30 +-
python/tsfile/utils.py | 139 +++++++-
29 files changed, 1580 insertions(+), 318 deletions(-)
diff --git a/cpp/src/common/constant/tsfile_constant.h b/cpp/src/common/constant/tsfile_constant.h
index d3f4dec1..096c645a 100644
--- a/cpp/src/common/constant/tsfile_constant.h
+++ b/cpp/src/common/constant/tsfile_constant.h
@@ -37,15 +37,15 @@ static const std::string BACK_QUOTE_STRING = "`";
static const std::string DOUBLE_BACK_QUOTE_STRING = "``";
static const unsigned char TIME_COLUMN_MASK = 0x80;
+static const std::string TIME_COLUMN_NAME = "time";
static const unsigned char VALUE_COLUMN_MASK = 0x40;
-
-static const std::string TIME_COLUMN_ID = "";
static const int NO_STR_TO_READ = -1;
static const std::regex IDENTIFIER_PATTERN("([a-zA-Z0-9_\\u2E80-\\u9FFF]+)");
static const std::regex NODE_NAME_PATTERN(
"(\\*{0,2}[a-zA-Z0-9_\\u2E80-\\u9FFF]+\\*{0,2})");
static const int DEFAULT_SEGMENT_NUM_FOR_TABLE_NAME = 3;
+
} // namespace storage
#endif
diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc
index 37b8c1bb..fd1d0132 100644
--- a/cpp/src/common/global.cc
+++ b/cpp/src/common/global.cc
@@ -122,7 +122,7 @@ int init_common() {
g_time_column_schema.data_type_ = INT64;
g_time_column_schema.encoding_ = PLAIN;
g_time_column_schema.compression_ = UNCOMPRESSED;
- g_time_column_schema.column_name_ = std::string("time");
+ g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME;
return ret;
}
diff --git a/cpp/src/common/tsblock/tuple_desc.h b/cpp/src/common/tsblock/tuple_desc.h
index 6010d677..fe301755 100644
--- a/cpp/src/common/tsblock/tuple_desc.h
+++ b/cpp/src/common/tsblock/tuple_desc.h
@@ -47,9 +47,16 @@ class TupleDesc {
TupleDesc() {}
virtual ~TupleDesc() {}
- FORCE_INLINE void reset() { column_list_.clear(); }
+ FORCE_INLINE void reset() {
+ column_list_.clear();
+ time_column_index_ = -1;
+ }
FORCE_INLINE void push_back(ColumnSchema schema) {
+ if (schema.column_category_ == ColumnCategory::TIME) {
+ ASSERT(time_column_index_ == -1);
+ time_column_index_ = static_cast<int>(column_list_.size());
+ }
column_list_.push_back(schema);
}
@@ -76,12 +83,28 @@ class TupleDesc {
return column_list_[index].column_category_;
}
+ FORCE_INLINE int get_time_column_index() const {
+ return time_column_index_;
+ }
+
FORCE_INLINE std::string get_column_name(uint32_t index) {
return column_list_[index].column_name_;
}
FORCE_INLINE void remove_column(uint32_t idx) {
+ ASSERT(idx < column_list_.size());
+
+ const bool removing_time =
+ (static_cast<int>(idx) == time_column_index_);
+
column_list_.erase(column_list_.begin() + idx);
+
+ if (removing_time) {
+ time_column_index_ = -1;
+ } else if (time_column_index_ != -1 &&
+ static_cast<int>(idx) < time_column_index_) {
+ --time_column_index_;
+ }
}
// get the single row len, ignore nulls and select-list memory for the
@@ -99,12 +122,16 @@ class TupleDesc {
return false;
}
}
+ if (time_column_index_ != that.time_column_index_) {
+ return false;
+ }
return true;
}
void clone_from(TupleDesc* that) {
ASSERT(column_list_.size() == 0);
column_list_ = that->column_list_; // deep copy
+ time_column_index_ = that->time_column_index_;
}
#ifdef DEBUG
@@ -119,6 +146,7 @@ class TupleDesc {
#endif
private:
std::vector<ColumnSchema> column_list_;
+ int time_column_index_ = -1;
};
} // namespace common
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc
index 7c22ccd5..fbcf4e6f 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.cc
+++ b/cpp/src/cwrapper/tsfile_cwrapper.cc
@@ -116,7 +116,6 @@ TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema,
*err_code = common::E_INVALID_SCHEMA;
return nullptr;
}
-
column_schemas.emplace_back(
cur_schema.column_name,
static_cast<common::TSDataType>(cur_schema.data_type),
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.h b/cpp/src/cwrapper/tsfile_cwrapper.h
index d9fe6bb8..643b4e52 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.h
+++ b/cpp/src/cwrapper/tsfile_cwrapper.h
@@ -71,7 +71,12 @@ typedef enum {
TS_COMPRESSION_INVALID = 255
} CompressionType;
-typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory;
+typedef enum column_category {
+ TAG = 0,
+ FIELD = 1,
+ ATTRIBUTE = 2,
+ TIME = 3
+} ColumnCategory;
typedef struct column_schema {
char* column_name;
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc
index 0e2b350c..836ab695 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.cc
+++ b/cpp/src/reader/block/single_device_tsblock_reader.cc
@@ -164,6 +164,14 @@ int SingleDeviceTsBlockReader::fill_measurements(
}
col_appenders_[time_column_index_]->append((const char*)&next_time_,
sizeof(next_time_));
+ int time_in_query_index = tuple_desc_.get_time_column_index();
+ if (time_in_query_index != -1) {
+ if (!col_appenders_[time_in_query_index]->add_row()) {
+ assert(false);
+ }
+ col_appenders_[time_in_query_index]->append(
+ (const char*)&next_time_, sizeof(next_time_));
+ }
for (auto& column_context : column_contexts) {
column_context->fill_into(col_appenders_);
if (RET_FAIL(advance_column(column_context))) {
diff --git a/cpp/src/reader/column_mapping.h b/cpp/src/reader/column_mapping.h
index abf9eafb..99e15303 100644
--- a/cpp/src/reader/column_mapping.h
+++ b/cpp/src/reader/column_mapping.h
@@ -36,8 +36,10 @@ class ColumnMapping {
if (column_category == common::ColumnCategory::TAG) {
tag_columns_.insert(column_name);
- } else {
+ } else if (column_category == common::ColumnCategory::FIELD) {
field_columns_.insert(column_name);
+ } else if (column_category == common::ColumnCategory::TIME) {
+ time_column_ = column_name;
}
return common::E_OK;
@@ -64,6 +66,10 @@ class ColumnMapping {
return field_columns_.find(column_name) != field_columns_.end();
}
+ bool is_time(const std::string& column_name) const {
+ return time_column_ == column_name;
+ }
+
const std::unordered_set<std::string>& get_id_columns() const {
return tag_columns_;
}
@@ -72,8 +78,11 @@ class ColumnMapping {
return field_columns_;
}
+ const std::string get_time_column() const { return time_column_; }
+
private:
std::unordered_map<std::string, std::vector<int>> column_pos_map;
+ std::string time_column_;
std::unordered_set<std::string> tag_columns_;
std::unordered_set<std::string> field_columns_;
};
diff --git a/cpp/src/reader/table_query_executor.cc b/cpp/src/reader/table_query_executor.cc
index 79b636b5..2a01a6d5 100644
--- a/cpp/src/reader/table_query_executor.cc
+++ b/cpp/src/reader/table_query_executor.cc
@@ -65,9 +65,10 @@ int TableQueryExecutor::query(const std::string& table_name,
}
// column_mapping.add(*measurement_filter);
- auto device_task_iterator = std::unique_ptr<DeviceTaskIterator>(
- new DeviceTaskIterator(columns, table_root, column_mapping,
- meta_data_querier_, id_filter, table_schema));
+ auto device_task_iterator =
+ std::unique_ptr<DeviceTaskIterator>(new DeviceTaskIterator(
+ lower_case_column_names, table_root, column_mapping,
+ meta_data_querier_, id_filter, table_schema));
std::unique_ptr<TsBlockReader> tsblock_reader;
switch (table_query_ordering_) {
@@ -82,8 +83,8 @@ int TableQueryExecutor::query(const std::string& table_name,
ret = common::E_UNSUPPORTED_ORDER;
}
assert(tsblock_reader != nullptr);
- ret_qds =
- new TableResultSet(std::move(tsblock_reader), columns, data_types);
+ ret_qds = new TableResultSet(std::move(tsblock_reader),
+ lower_case_column_names, data_types);
return ret;
}
diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h
index 85d99b1a..d0953afe 100644
--- a/cpp/src/utils/db_utils.h
+++ b/cpp/src/utils/db_utils.h
@@ -37,6 +37,7 @@ namespace common {
extern TSEncoding get_value_encoder(TSDataType data_type);
extern CompressionType get_default_compressor();
+// TODO: remove this.
typedef struct FileID {
int64_t seq_; // timestamp when create
int32_t version_;
@@ -64,6 +65,7 @@ typedef struct FileID {
#endif
} FileID;
+// TODO: remove this.
typedef uint16_t NodeID;
struct TsID {
NodeID db_nid_;
@@ -157,7 +159,7 @@ struct TsID {
* This enumeration class defines the supported categories for columns within a
* table schema, distinguishing between tag and field columns.
*/
-enum class ColumnCategory { TAG = 0, FIELD = 1 };
+enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 };
/**
* @brief Represents the schema information for a single column.
@@ -176,7 +178,8 @@ struct ColumnSchema {
: column_name_(""),
data_type_(INVALID_DATATYPE),
compression_(UNCOMPRESSED),
- encoding_(PLAIN) {}
+ encoding_(PLAIN),
+ column_category_(ColumnCategory::FIELD) {}
/**
* @brief Constructs a ColumnSchema object with the given parameters.
diff --git a/cpp/test/reader/table_view/tsfile_reader_table_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
index c281de41..b9f0eb21 100644
--- a/cpp/test/reader/table_view/tsfile_reader_table_test.cc
+++ b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
@@ -707,3 +707,84 @@ TEST_F(TsFileTableReaderTest, TestNullInTable4) {
ASSERT_EQ(line, max_rows);
});
}
+
+TEST_F(TsFileTableReaderTest, TestTimeColumnReader) {
+ std::vector<common::ColumnSchema> column_schemas;
+ column_schemas.emplace_back("s0", TSDataType::INT64,
+ CompressionType::UNCOMPRESSED,
+ TSEncoding::PLAIN, ColumnCategory::FIELD);
+ column_schemas.emplace_back("S1", TSDataType::DOUBLE,
+ CompressionType::UNCOMPRESSED,
+ TSEncoding::PLAIN, ColumnCategory::FIELD);
+ // No need to manually insert data into the time column.
+ column_schemas.emplace_back("TIME_D", TSDataType::TIMESTAMP,
+ CompressionType::UNCOMPRESSED,
+ TSEncoding::PLAIN, ColumnCategory::TIME);
+
+ TableSchema table_schema("testTableTime", column_schemas);
+ auto tsfile_table_writer_ =
+ std::make_shared<TsFileTableWriter>(&write_file_, &table_schema);
+
+ const int num_rows = 20;
+ const int64_t base_time = 1000;
+ storage::Tablet tablet(table_schema.get_table_name(), {"s0", "s1"},
+ {TSDataType::INT64, TSDataType::DOUBLE},
+ {ColumnCategory::FIELD, ColumnCategory::FIELD},
+ num_rows);
+
+ for (int i = 0; i < num_rows; i++) {
+ int64_t t = base_time + i;
+ tablet.add_timestamp(i, t);
+ tablet.add_value(i, 0, static_cast<int64_t>(i * 10));
+ tablet.add_value(i, 1, static_cast<double>(i * 1.5));
+ }
+
+ ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+ ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+ ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+ storage::TsFileReader reader;
+ int ret = reader.open(file_name_);
+ ASSERT_EQ(ret, common::E_OK);
+
+ ResultSet* tmp_result_set = nullptr;
+ ret = reader.query(table_schema.get_table_name(), {"s0", "s1", "TIME_D"},
0,
+ 1000000000000, tmp_result_set);
+ ASSERT_EQ(ret, common::E_OK);
+ ASSERT_NE(tmp_result_set, nullptr);
+
+ auto* table_result_set = dynamic_cast<TableResultSet*>(tmp_result_set);
+ ASSERT_NE(table_result_set, nullptr);
+
+ auto result_set_metadata = table_result_set->get_metadata();
+ ASSERT_EQ(result_set_metadata->get_column_count(),
+ 4); // time + s0 + s1 + TIME_D
+ ASSERT_EQ(result_set_metadata->get_column_name(1), "time");
+ ASSERT_EQ(result_set_metadata->get_column_type(1), TSDataType::INT64);
+ ASSERT_EQ(result_set_metadata->get_column_name(2), "s0");
+ ASSERT_EQ(result_set_metadata->get_column_type(2), TSDataType::INT64);
+ ASSERT_EQ(result_set_metadata->get_column_name(3), "s1");
+ ASSERT_EQ(result_set_metadata->get_column_type(3), TSDataType::DOUBLE);
+ ASSERT_EQ(result_set_metadata->get_column_name(4), "time_d");
+ ASSERT_EQ(result_set_metadata->get_column_type(4), TSDataType::TIMESTAMP);
+
+ bool has_next = false;
+ int row_count = 0;
+ while (IS_SUCC(table_result_set->next(has_next)) && has_next) {
+ int64_t row_time = base_time + row_count;
+ // Column 1 is built-in time
+ ASSERT_EQ(table_result_set->get_value<int64_t>(1), row_time);
+ // s0, s1
+ ASSERT_EQ(table_result_set->get_value<int64_t>(2), row_count * 10);
+ ASSERT_DOUBLE_EQ(table_result_set->get_value<double>(3),
+ static_cast<double>(row_count * 1.5));
+ // time_d
+ ASSERT_EQ(table_result_set->get_value<int64_t>("TIME_D"), row_time);
+ ASSERT_EQ(table_result_set->get_value<int64_t>(4), row_time);
+ row_count++;
+ }
+ ASSERT_EQ(row_count, num_rows);
+
+ reader.destroy_query_data_set(table_result_set);
+ ASSERT_EQ(reader.close(), common::E_OK);
+}
diff --git a/cpp/test/writer/table_view/tsfile_writer_table_test.cc b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
index d5861ea1..1f8c80ff 100644
--- a/cpp/test/writer/table_view/tsfile_writer_table_test.cc
+++ b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
@@ -447,7 +447,7 @@ TEST_F(TsFileWriterTableTest, WriteAndReadSimple) {
ASSERT_EQ(ret_value, 0);
auto* table_result_set = (TableResultSet*)ret;
auto metadata = ret->get_metadata();
- ASSERT_EQ(metadata->get_column_name(column_names.size() + 1), "VALUE");
+ ASSERT_EQ(metadata->get_column_name(column_names.size() + 1), "value");
bool has_next = false;
int cur_line = 0;
while (IS_SUCC(table_result_set->next(has_next)) && has_next) {
diff --git a/python/tests/resources/README.md b/python/tests/resources/README.md
index ca80bb43..d5ec82b4 100644
--- a/python/tests/resources/README.md
+++ b/python/tests/resources/README.md
@@ -282,4 +282,54 @@ IoTDB:test> select * from test;
|2025-10-10T22:21:19.000+08:00| b|
c|1069|7.9|v69|1970-01-01T08:00:01.069+08:00| 79|16.9|2024-12-18|text69|
+-----------------------------+--+--+----+---+---+-----------------------------+----+----+----------+------+
Total line number = 40
-```
\ No newline at end of file
+```
+
+In `table_with_time_column.tsfile`
+
+```
+IoTDB:mydb> select * from table2;
++-----------------------------+---------+-----------+--------+
+| id|region_id|temperature|humidity|
++-----------------------------+---------+-----------+--------+
+|2026-02-10T21:11:35.888+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:36.807+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:37.233+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:37.471+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:37.695+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:37.910+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:38.148+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:38.385+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:38.599+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:38.853+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:39.086+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:39.327+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:39.558+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:39.794+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:40.017+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:40.262+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:40.492+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:40.729+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:40.976+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:41.243+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:41.494+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:41.734+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:42.040+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:42.333+08:00| loc| 0.1| 0.1|
+|2026-02-10T21:11:43.005+08:00| loc| 0.1| 0.1|
++-----------------------------+---------+-----------+--------+
+Total line number = 25
+It costs 0.042s
+IoTDB:mydb> describe table2
++-----------+---------+--------+
+| ColumnName| DataType|Category|
++-----------+---------+--------+
+| id|TIMESTAMP| TIME|
+| region_id| STRING| TAG|
+|temperature| FLOAT| FIELD|
+| humidity| DOUBLE| FIELD|
++-----------+---------+--------+
+Total line number = 4
+It costs 0.065s
+IoTDB:mydb>
+```
+
diff --git a/python/tests/resources/table_with_time_column.tsfile b/python/tests/resources/table_with_time_column.tsfile
new file mode 100644
index 00000000..66be782a
Binary files /dev/null and b/python/tests/resources/table_with_time_column.tsfile differ
diff --git a/python/tests/test_basic.py b/python/tests/test_basic.py
index 842a8fb4..675ef837 100644
--- a/python/tests/test_basic.py
+++ b/python/tests/test_basic.py
@@ -17,7 +17,7 @@
#
import numpy as np
import pytest
-from tsfile import schema, Field
+from tsfile import Field
from tsfile import Tablet
from tsfile.constants import *
from tsfile.schema import *
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
new file mode 100644
index 00000000..de49bc1c
--- /dev/null
+++ b/python/tests/test_dataframe.py
@@ -0,0 +1,320 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+import os
+from datetime import date
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from tsfile import ColumnSchema, TableSchema, TSDataType, TIME_COLUMN
+from tsfile import TsFileTableWriter, ColumnCategory
+from tsfile import to_dataframe
+from tsfile.exceptions import ColumnNotExistError, TypeMismatchError
+
+
+def convert_to_nullable_types(df):
+ """
+ Convert DataFrame columns to nullable types to match returned DataFrame from to_dataframe.
+ This handles the fact that returned DataFrames use nullable types (Int64, Float64, etc.)
+ to support Null values.
+ """
+ df = df.copy()
+ for col in df.columns:
+ dtype = df[col].dtype
+ if dtype == 'int64':
+ df[col] = df[col].astype('Int64')
+ elif dtype == 'int32':
+ df[col] = df[col].astype('Int32')
+ elif dtype == 'float64':
+ df[col] = df[col].astype('Float64')
+ elif dtype == 'float32':
+ df[col] = df[col].astype('Float32')
+ elif dtype == 'bool':
+ df[col] = df[col].astype('boolean')
+ return df
+
+
+def test_write_dataframe_basic():
+ table = TableSchema("test_table",
+ [ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD),
+ ColumnSchema("value2", TSDataType.INT64,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_basic.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [i for i in range(100)],
+ 'device': [f"device{i}" for i in range(100)],
+ 'value': [i * 1.5 for i in range(100)],
+ 'value2': [i * 10 for i in range(100)]
+ })
+ writer.write_dataframe(df)
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+ assert df_read.shape == (100, 4)
+ assert df_read[TIME_COLUMN].equals(df_sorted["time"])
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ assert df_read["value2"].equals(df_sorted["value2"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_with_index():
+ table = TableSchema("test_table",
+ [ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_index.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'device': [f"device{i}" for i in range(50)],
+ 'value': [i * 2.5 for i in range(50)]
+ })
+ df.index = [i * 10 for i in range(50)] # Set index as timestamps
+ writer.write_dataframe(df)
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted = df.sort_index()
+ df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True))
+ time_series = pd.Series(df.sort_index().index.values, dtype='Int64')
+ assert df_read.shape == (50, 3)
+ assert df_read[TIME_COLUMN].equals(time_series)
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_case_insensitive():
+ table = TableSchema("test_table",
+ [ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_case.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'Time': [i for i in range(30)], # Capital T
+ 'Device': [f"device{i}" for i in range(30)], # Capital D
+ 'VALUE': [i * 3.0 for i in range(30)] # All caps
+ })
+ writer.write_dataframe(df)
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('Time').reset_index(drop=True))
+ assert df_read.shape == (30, 3)
+ assert df_read[TIME_COLUMN].equals(df_sorted["Time"])
+ assert df_read["device"].equals(df_sorted["Device"])
+ assert df_read["value"].equals(df_sorted["VALUE"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_column_not_in_schema():
+ table = TableSchema("test_table",
+ [ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_extra_col.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [i for i in range(10)],
+ 'device': [f"device{i}" for i in range(10)],
+ 'value': [i * 1.0 for i in range(10)],
+ 'extra_column': [i for i in range(10)] # Not in schema
+ })
+ with pytest.raises(ColumnNotExistError):
+ writer.write_dataframe(df)
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_type_mismatch():
+ table = TableSchema("test_table",
+ [ColumnSchema("value", TSDataType.STRING,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_type_mismatch.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [i for i in range(10)],
+ 'value': [i for i in range(10)]
+ })
+ with pytest.raises(TypeMismatchError) as exc_info:
+ writer.write_dataframe(df)
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_all_datatypes():
+ table = TableSchema("test_table",
+ [ColumnSchema("bool_col", TSDataType.BOOLEAN,
ColumnCategory.FIELD),
+ ColumnSchema("int32_col", TSDataType.INT32,
ColumnCategory.FIELD),
+ ColumnSchema("int64_col", TSDataType.INT64,
ColumnCategory.FIELD),
+ ColumnSchema("float_col", TSDataType.FLOAT,
ColumnCategory.FIELD),
+ ColumnSchema("double_col", TSDataType.DOUBLE,
ColumnCategory.FIELD),
+ ColumnSchema("string_col", TSDataType.STRING,
ColumnCategory.FIELD),
+ ColumnSchema("blob_col", TSDataType.BLOB,
ColumnCategory.FIELD),
+ ColumnSchema("text_col", TSDataType.TEXT,
ColumnCategory.FIELD),
+ ColumnSchema("date_col", TSDataType.DATE,
ColumnCategory.FIELD),
+ ColumnSchema("timestamp_col", TSDataType.TIMESTAMP,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_all_types.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [i for i in range(50)],
+ 'bool_col': [i % 2 == 0 for i in range(50)],
+ 'int32_col': pd.Series([i for i in range(50)], dtype='int32'),
+ 'int64_col': [i * 10 for i in range(50)],
+ 'float_col': pd.Series([i * 1.5 for i in range(50)],
dtype='float32'),
+ 'double_col': [i * 2.5 for i in range(50)],
+ 'string_col': [f"str{i}" for i in range(50)],
+ 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)],
+ 'text_col': [f"text{i}" for i in range(50)],
+ 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in
range(50)],
+ 'timestamp_col': [i for i in range(50)]
+ })
+ writer.write_dataframe(df)
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+ assert df_read.shape == (50, 11)
+ assert df_read["bool_col"].equals(df_sorted["bool_col"])
+ assert df_read["int32_col"].equals(df_sorted["int32_col"])
+ assert df_read["int64_col"].equals(df_sorted["int64_col"])
+ assert np.allclose(df_read["float_col"], df_sorted["float_col"])
+ assert np.allclose(df_read["double_col"], df_sorted["double_col"])
+ assert df_read["string_col"].equals(df_sorted["string_col"])
+ assert df_read["blob_col"].equals(df_sorted["blob_col"])
+ assert df_read["text_col"].equals(df_sorted["text_col"])
+ assert df_read["date_col"].equals(df_sorted["date_col"])
+ assert df_read["timestamp_col"].equals(df_sorted["timestamp_col"])
+ for i in range(50):
+ assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i]
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_schema_time_column():
+ table = TableSchema("test_table",
+ [ColumnSchema("time", TSDataType.TIMESTAMP,
ColumnCategory.TIME),
+ ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_schema_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [i * 100 for i in range(50)],
+ 'device': [f"device{i}" for i in range(50)],
+ 'value': [i * 1.5 for i in range(50)]
+ })
+ writer.write_dataframe(df)
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+ assert df_read.shape == (50, 3)
+ assert df_read[TIME_COLUMN].equals(df_sorted[TIME_COLUMN])
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_schema_time_and_dataframe_time():
+ table = TableSchema("test_table",
+ [ColumnSchema("device", TSDataType.STRING,
ColumnCategory.TAG),
+ ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_schema_and_df_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'Time': [i for i in range(30)],
+ 'device': [f"dev{i}" for i in range(30)],
+ 'value': [float(i) for i in range(30)]
+ })
+ writer.write_dataframe(df)
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted = convert_to_nullable_types(
+
df.sort_values('Time').rename(columns=str.lower).reset_index(drop=True)
+ )
+ assert df_read.shape == (30, 3)
+ assert df_read["time"].equals(df_sorted["time"])
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_write_dataframe_empty():
+ table = TableSchema("test_table",
+ [ColumnSchema("value", TSDataType.DOUBLE,
ColumnCategory.FIELD)])
+ tsfile_path = "test_write_dataframe_empty.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ with TsFileTableWriter(tsfile_path, table) as writer:
+ df = pd.DataFrame({
+ 'time': [],
+ 'value': []
+ })
+ with pytest.raises(ValueError):
+ writer.write_dataframe(df)
+
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py
index d865dd35..50ca0baf 100644
--- a/python/tests/test_load_tsfile_from_iotdb.py
+++ b/python/tests/test_load_tsfile_from_iotdb.py
@@ -15,12 +15,13 @@
# specific language governing permissions and limitations
# under the License.
#
-
+import math
import os
import numpy as np
import tsfile as ts
+from tsfile import TIME_COLUMN
def test_load_tsfile_from_iotdb():
@@ -31,8 +32,8 @@ def test_load_tsfile_from_iotdb():
## --------
assert len(df) == 105, "row count mismatch"
- assert df["time"].isna().sum() == 0
- assert int(df["time"].sum()) == 15960
+ assert df[TIME_COLUMN].isna().sum() == 0
+ assert int(df[TIME_COLUMN].sum()) == 15960
assert df["temperature"].isna().sum() == 5
assert df["status"].isna().sum() == 5
assert (df["status"] == True).sum() == 50
@@ -44,8 +45,8 @@ def test_load_tsfile_from_iotdb():
df = ts.to_dataframe(simple_tabl1_path)
## ---------
assert len(df) == 60
- assert df["time"].isna().sum() == 0
- assert df["time"].sum() == (
+ assert df[TIME_COLUMN].isna().sum() == 0
+ assert df[TIME_COLUMN].sum() == (
(1760106020000 + 1760106049000) * 30 // 2 +
(1760106080000 + 1760106109000) * 30 // 2
)
@@ -78,8 +79,8 @@ def test_load_tsfile_from_iotdb():
df = ts.to_dataframe(simple_tabl2_path)
## ---------
assert len(df) == 40
- assert df["time"].isna().sum() == 0
- assert int(df["time"].sum()) == 70404242080000
+ assert df[TIME_COLUMN].isna().sum() == 0
+ assert int(df[TIME_COLUMN].sum()) == 70404242080000
assert df["s0"].isna().sum() == 0
assert df["s1"].isna().sum() == 0
@@ -109,3 +110,25 @@ def test_load_tsfile_from_iotdb():
assert df["s9"].isna().sum() == 5
## ---------
+ table_with_time_column_path = os.path.join(dir_path,
'table_with_time_column.tsfile')
+
+ df = ts.to_dataframe(table_with_time_column_path)
+ assert list(df.columns)[0] == "id"
+ assert len(df) == 25
+ assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9)
+ assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9)
+ assert (df["region_id"] == "loc").sum() == 25
+
+ df = ts.to_dataframe(table_with_time_column_path, table_name="table2",
column_names=["region_id", "temperature", "humidity"])
+ assert list(df.columns)[0] == "id"
+ assert len(df) == 25
+ assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9)
+ assert (df["region_id"] == "loc").sum() == 25
+
+ df = ts.to_dataframe(table_with_time_column_path, table_name="table2",
column_names=["id", "temperature", "humidity"])
+ assert list(df.columns)[0] == "time"
+ assert df["id"].equals(df["time"])
+ assert len(df) == 25
+ assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9)
+ assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9)
+
diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py
new file mode 100644
index 00000000..4e048188
--- /dev/null
+++ b/python/tests/test_to_tsfile.py
@@ -0,0 +1,378 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+import os
+from datetime import date
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from tsfile import to_dataframe, TsFileReader, ColumnCategory, TIME_COLUMN
+from tsfile.utils import dataframe_to_tsfile
+
+
+def convert_to_nullable_types(df):
+    """Return a copy of ``df`` whose plain numeric/bool columns use pandas nullable dtypes.
+
+    Columns whose dtype compares equal to int64/int32/float64/float32/bool are
+    cast to Int64/Int32/Float64/Float32/boolean respectively; every other
+    column is left as it is. The frame passed in is not modified.
+    """
+    # Checked in this order; the first dtype that compares equal wins.
+    nullable_for = (
+        ('int64', 'Int64'),
+        ('int32', 'Int32'),
+        ('float64', 'Float64'),
+        ('float32', 'Float32'),
+        ('bool', 'boolean'),
+    )
+    converted = df.copy()
+    for name in converted.columns:
+        current = converted[name].dtype
+        for plain, nullable in nullable_for:
+            if current == plain:
+                converted[name] = converted[name].astype(nullable)
+                break
+    return converted
+
+
+def test_dataframe_to_tsfile_basic():
+ """Pass a 100-row frame (time, device, value, value2) to dataframe_to_tsfile,
+ read the file back with to_dataframe and compare every column."""
+ tsfile_path = "test_dataframe_to_tsfile_basic.tsfile"
+ try:
+ # Start from a clean slate in case an earlier run left the file behind.
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [i for i in range(100)],
+ 'device': [f"device{i}" for i in range(100)],
+ 'value': [i * 1.5 for i in range(100)],
+ 'value2': [i * 10 for i in range(100)]
+ })
+
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table")
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ # Both sides are sorted by time; the expected frame is additionally cast
+ # to pandas nullable dtypes before the Series.equals comparisons below.
+ df_read = df_read.sort_values('time').reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+
+ assert df_read.shape == (100, 4)
+ assert df_read["time"].equals(df_sorted["time"])
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ assert df_read["value2"].equals(df_sorted["value2"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_default_table_name():
+ """Call dataframe_to_tsfile without table_name and expect the rows to be
+ readable from a table named "default_table"."""
+ tsfile_path = "test_dataframe_to_tsfile_default.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]})
+ dataframe_to_tsfile(df, tsfile_path)
+
+ df_read = to_dataframe(tsfile_path, table_name="default_table")
+ assert len(df_read) == 2
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_with_index():
+ """Write a frame that has no 'time' column and expect the read-back 'time'
+ column to equal the frame's integer index."""
+ tsfile_path = "test_dataframe_to_tsfile_index.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'device': [f"device{i}" for i in range(30)],
+ 'value': [i * 2.0 for i in range(30)]
+ })
+ # Index values 0, 100, 200, ... are what the test later expects as timestamps.
+ df.index = [i * 100 for i in range(30)]
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table")
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values('time').reset_index(drop=True)
+ time_expected = pd.Series(df.index.values, dtype='Int64')
+ # Three columns: the two written ones plus 'time'.
+ assert df_read.shape == (30, 3)
+ assert df_read["time"].equals(time_expected)
+
+ # No tag_column was passed, so 'device' is expected to be stored as a FIELD.
+ with TsFileReader(tsfile_path) as reader:
+ table_schema = reader.get_table_schema("test_table")
+ device_col = table_schema.get_column("device")
+ assert device_col is not None
+ assert device_col.get_category() == ColumnCategory.FIELD
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_custom_time_column():
+ """Write with time_column="timestamp" and expect the read-back frame to
+ expose the same 'timestamp', 'device' and 'value' columns."""
+ tsfile_path = "test_dataframe_to_tsfile_custom_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'timestamp': [i for i in range(30)],
+ 'device': [f"device{i}" for i in range(30)],
+ 'value': [i * 3.0 for i in range(30)]
+ })
+
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table",
time_column="timestamp")
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ # The time data is looked up under its original name, "timestamp".
+ df_read = df_read.sort_values("timestamp").reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True))
+
+ assert df_read.shape == (30, 3)
+ assert df_read["timestamp"].equals(df_sorted["timestamp"])
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["value"].equals(df_sorted["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_case_insensitive_time():
+ """Write a frame whose time column is spelled 'Time' and expect it to be
+ read back under the lower-case name 'time'."""
+ tsfile_path = "test_dataframe_to_tsfile_case_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'Time': [i for i in range(20)],
+ 'value': [i * 2.0 for i in range(20)]
+ })
+
+ # No time_column argument is given; only the capitalised 'Time' column exists.
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table")
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ assert df_read.shape == (20, 2)
+ assert df_read["time"].equals(pd.Series([i for i in range(20)],
dtype='Int64'))
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_with_tag_columns():
+ """Write with tag_column=["device", "location"] and check that the tag and
+ value columns read back unchanged once both frames are sorted by time."""
+ tsfile_path = "test_dataframe_to_tsfile_tags.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [i for i in range(20)],
+ 'device': [f"device{i}" for i in range(20)],
+ 'location': [f"loc{i % 5}" for i in range(20)],
+ 'value': [i * 1.5 for i in range(20)]
+ })
+
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table",
tag_column=["device", "location"])
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ # Sort both sides by time so the comparison does not depend on read order.
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+
+ assert df_read.shape == (20, 4)
+ assert df_read["device"].equals(df_sorted["device"])
+ assert df_read["location"].equals(df_sorted["location"])
+ assert df_read["value"].equals(df_sorted["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_tag_time_unsorted():
+ """Write rows whose timestamps are out of order across two devices and
+ expect the read-back rows ordered by device, then time."""
+ tsfile_path = "test_dataframe_to_tsfile_tag_time_unsorted.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [30, 10, 20, 50, 40, 15, 25, 35, 5, 45],
+ 'device': ['device1', 'device1', 'device1', 'device2', 'device2',
'device1', 'device1', 'device2',
+ 'device1', 'device2'],
+ 'value': [i * 1.5 for i in range(10)]
+ })
+
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table",
tag_column=["device"])
+
+ # The read-back frame is deliberately NOT re-sorted: its row order itself
+ # is compared against the (device, time)-sorted expectation.
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_expected = df.sort_values(by=['device',
'time']).reset_index(drop=True)
+ df_expected = convert_to_nullable_types(df_expected)
+
+ assert df_read.shape == (10, 3)
+ assert df_read["device"].equals(df_expected["device"])
+ assert df_read[TIME_COLUMN].equals(df_expected["time"])
+ assert df_read["value"].equals(df_expected["value"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_all_datatypes():
+ """Round-trip one column per input kind (bool, int32, int64, float32,
+ float64, str, bytes, datetime.date, plain int) and compare each column."""
+ tsfile_path = "test_dataframe_to_tsfile_all_types.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [i for i in range(50)],
+ 'bool_col': [i % 2 == 0 for i in range(50)],
+ 'int32_col': pd.Series([i for i in range(50)], dtype='int32'),
+ 'int64_col': [i * 10 for i in range(50)],
+ 'float_col': pd.Series([i * 1.5 for i in range(50)],
dtype='float32'),
+ 'double_col': [i * 2.5 for i in range(50)],
+ 'string_col': [f"str{i}" for i in range(50)],
+ 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)],
+ 'text_col': [f"text{i}" for i in range(50)],
+ 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in
range(50)],
+ 'timestamp_col': [i for i in range(50)]
+ })
+
+ dataframe_to_tsfile(df, tsfile_path, table_name="test_table")
+
+ df_read = to_dataframe(tsfile_path, table_name="test_table")
+ df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True)
+ df_sorted =
convert_to_nullable_types(df.sort_values('time').reset_index(drop=True))
+
+ assert df_read.shape == (50, 11)
+ assert df_read["bool_col"].equals(df_sorted["bool_col"])
+ assert df_read["int32_col"].equals(df_sorted["int32_col"])
+ assert df_read["int64_col"].equals(df_sorted["int64_col"])
+ # Floating-point columns are compared with a tolerance rather than Series.equals.
+ assert np.allclose(df_read["float_col"], df_sorted["float_col"])
+ assert np.allclose(df_read["double_col"], df_sorted["double_col"])
+ assert df_read["string_col"].equals(df_sorted["string_col"])
+ assert df_read["text_col"].equals(df_sorted["text_col"])
+ assert df_read["date_col"].equals(df_sorted["date_col"])
+ assert df_read["timestamp_col"].equals(df_sorted["timestamp_col"])
+ # Blob values are checked one element at a time.
+ for i in range(50):
+ assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i]
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_empty_dataframe():
+ """An empty DataFrame (no rows, no columns) must be rejected with ValueError."""
+ tsfile_path = "test_dataframe_to_tsfile_empty.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame()
+
+ with pytest.raises(ValueError, match="DataFrame cannot be None or
empty"):
+ dataframe_to_tsfile(df, tsfile_path)
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_no_data_columns():
+ """A frame that contains only a 'time' column must be rejected with ValueError."""
+ tsfile_path = "test_dataframe_to_tsfile_no_data.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [i for i in range(10)]
+ })
+
+ with pytest.raises(ValueError, match="DataFrame must have at least one
data column"):
+ dataframe_to_tsfile(df, tsfile_path)
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_time_column_not_found():
+ """Naming a time_column ("timestamp") that the frame does not contain must
+ raise ValueError, even though the frame has a 'time' column."""
+ tsfile_path = "test_dataframe_to_tsfile_time_err.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]})
+ with pytest.raises(ValueError, match="Time column 'timestamp' not
found"):
+ dataframe_to_tsfile(df, tsfile_path, time_column="timestamp")
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_invalid_time_column():
+ """Explicitly passing time_column="time" for a frame that only has a
+ 'timestamp' column must raise ValueError."""
+ tsfile_path = "test_dataframe_to_tsfile_invalid_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'timestamp': [i for i in range(10)],
+ 'value': [i * 1.0 for i in range(10)]
+ })
+
+ with pytest.raises(ValueError, match="Time column 'time' not found"):
+ dataframe_to_tsfile(df, tsfile_path, time_column="time")
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_non_integer_time_column():
+ """A 'time' column holding strings must be rejected with TypeError."""
+ tsfile_path = "test_dataframe_to_tsfile_non_int_time.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [f"time{i}" for i in range(10)],
+ 'value': [i * 1.0 for i in range(10)]
+ })
+
+ # Unlike the other validation tests, this one expects TypeError, not ValueError.
+ with pytest.raises(TypeError, match="must be integer type"):
+ dataframe_to_tsfile(df, tsfile_path)
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_tag_column_not_found():
+ """Passing a tag_column name that is not in the frame must raise ValueError."""
+ tsfile_path = "test_dataframe_to_tsfile_tag_err.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({'time': [0, 1], 'device': ['a', 'b'], 'value':
[1.0, 2.0]})
+ with pytest.raises(ValueError, match="Tag column 'invalid' not found"):
+ dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+
+def test_dataframe_to_tsfile_invalid_tag_column():
+ """Same expectation as the tag-column-not-found test, but on a frame that
+ has no string column at all: an unknown tag_column name raises ValueError."""
+ tsfile_path = "test_dataframe_to_tsfile_invalid_tag.tsfile"
+ try:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
+
+ df = pd.DataFrame({
+ 'time': [i for i in range(10)],
+ 'value': [i * 1.0 for i in range(10)]
+ })
+
+ with pytest.raises(ValueError, match="Tag column 'invalid' not found"):
+ dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"])
+ finally:
+ if os.path.exists(tsfile_path):
+ os.remove(tsfile_path)
diff --git a/python/tests/test_write_and_read.py
b/python/tests/test_write_and_read.py
index b327e2d3..57294a84 100644
--- a/python/tests/test_write_and_read.py
+++ b/python/tests/test_write_and_read.py
@@ -16,14 +16,16 @@
# under the License.
#
+import os
from datetime import date
import numpy as np
import pandas as pd
import pytest
+from pandas import Float64Dtype
from pandas.core.dtypes.common import is_integer_dtype
-from tsfile import ColumnSchema, TableSchema, TSEncoding
+from tsfile import ColumnSchema, TableSchema, TSEncoding, TIME_COLUMN
from tsfile import Compressor
from tsfile import TSDataType
from tsfile import Tablet, RowRecord, Field
@@ -31,7 +33,7 @@ from tsfile import TimeseriesSchema
from tsfile import TsFileTableWriter
from tsfile import TsFileWriter, TsFileReader, ColumnCategory
from tsfile import to_dataframe
-from tsfile.exceptions import TableNotExistError, ColumnNotExistError,
NotSupportedError
+from tsfile.exceptions import TableNotExistError, ColumnNotExistError,
NotSupportedError, TypeMismatchError
def test_row_record_write_and_read():
@@ -82,7 +84,7 @@ def test_row_record_write_and_read():
assert result.get_value_by_index(4) == row_num * 2
assert result.get_value_by_index(5) == f"string_value_{row_num}"
assert result.get_value_by_index(6) == f"text_value_{row_num}"
- assert result.get_value_by_index(7) == f"blob_data_{row_num}"
+ assert result.get_value_by_index(7) ==
f"blob_data_{row_num}".encode('utf-8')
assert result.get_value_by_index(8) == date(2025, 1, row_num % 20
+ 1)
assert result.get_value_by_index(9) == row_num
@@ -168,7 +170,7 @@ def test_tree_query_to_dataframe_variants():
assert df_all.shape[0] == total_rows
for measurement in all_measurements:
assert measurement in df_all.columns
- assert "time" in df_all.columns
+ assert TIME_COLUMN in df_all.columns
path_columns = sorted(
[col for col in df_all.columns if col.startswith("col_")],
key=lambda name: int(name.split("_")[1]),
@@ -177,7 +179,7 @@ def test_tree_query_to_dataframe_variants():
for _, row in df_all.iterrows():
device = _extract_device(row, path_columns)
- timestamp = int(row["time"])
+ timestamp = int(row[TIME_COLUMN])
assert (device, timestamp) in expected_values
expected_row = expected_values[(device, timestamp)]
for measurement in all_measurements:
@@ -199,7 +201,7 @@ def test_tree_query_to_dataframe_variants():
assert measurement not in df_subset.columns
for _, row in df_subset.iterrows():
device = _extract_device(row, path_columns)
- timestamp = int(row["time"])
+ timestamp = int(row[TIME_COLUMN])
expected_row = expected_values[(device, timestamp)]
for measurement in requested_columns:
value = row.get(measurement)
@@ -225,7 +227,7 @@ def test_tree_query_to_dataframe_variants():
iter_rows = 0
for batch in iterator:
assert isinstance(batch, pd.DataFrame)
- assert set(batch.columns).issuperset({"time", "level"})
+ assert set(batch.columns).issuperset({TIME_COLUMN, "level"})
iter_rows += len(batch)
assert iter_rows == 18
@@ -240,7 +242,7 @@ def test_tree_query_to_dataframe_variants():
iter_rows = 0
for batch in iterator:
assert isinstance(batch, pd.DataFrame)
- assert set(batch.columns).issuperset({"time", "level"})
+ assert set(batch.columns).issuperset({TIME_COLUMN, "level"})
iter_rows += len(batch)
assert iter_rows == 9
@@ -382,7 +384,7 @@ def test_table_writer_and_reader():
0, 10) as result:
cur_line = 0
while result.next():
- cur_time = result.get_value_by_name("time")
+ cur_time = result.get_value_by_name(TIME_COLUMN)
assert result.get_value_by_name("device") == "device" +
str(cur_time)
assert result.is_null_by_name("device") == False
assert result.is_null_by_name("value") == False
@@ -543,8 +545,8 @@ def test_tsfile_to_df():
df1 = to_dataframe("table_write_to_df.tsfile")
assert df1.shape == (4097, 4)
assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096
- assert is_integer_dtype(df1["time"])
- assert df1["value"].dtype == np.float64
+ assert is_integer_dtype(df1[TIME_COLUMN])
+ assert df1["value"].dtype == Float64Dtype()
assert is_integer_dtype(df1["value2"])
df2 = to_dataframe("table_write_to_df.tsfile", column_names=["device",
"value2"])
assert df2.shape == (4097, 3)
@@ -755,237 +757,9 @@ def test_tree_all_datatype_query_to_dataframe_variants():
pass
finally:
- if os.path.exists(tsfile_path):
- os.remove(tsfile_path)
-
-
-def test_table_all_datatype_query_to_dataframe_variants():
- tsfile_path = "test_table.tsfile"
- table = TableSchema(
- "test_table",
- [
- ColumnSchema("Device1", TSDataType.STRING, ColumnCategory.TAG),
- ColumnSchema("Device2", TSDataType.STRING, ColumnCategory.TAG),
- ColumnSchema("Value1", TSDataType.BOOLEAN, ColumnCategory.FIELD),
- ColumnSchema("Value2", TSDataType.INT32, ColumnCategory.FIELD),
- ColumnSchema("Value3", TSDataType.INT64, ColumnCategory.FIELD),
- ColumnSchema("Value4", TSDataType.FLOAT, ColumnCategory.FIELD),
- ColumnSchema("Value5", TSDataType.DOUBLE, ColumnCategory.FIELD),
- ColumnSchema("Value6", TSDataType.TEXT, ColumnCategory.FIELD),
- ColumnSchema("Value7", TSDataType.STRING, ColumnCategory.FIELD),
- ColumnSchema("Value8", TSDataType.BLOB, ColumnCategory.FIELD),
- ColumnSchema("Value9", TSDataType.TIMESTAMP, ColumnCategory.FIELD),
- ColumnSchema("Value10", TSDataType.DATE, ColumnCategory.FIELD),
- ],
- )
- dateSet = set()
- try:
- if os.path.exists(tsfile_path):
- os.remove(tsfile_path)
- max_row_num = 100
- with TsFileTableWriter(tsfile_path, table) as writer:
- tablet = Tablet(
- [
- "Device1",
- "Device2",
- "Value1",
- "Value2",
- "Value3",
- "Value4",
- "Value5",
- "Value6",
- "Value7",
- "Value8",
- "Value9",
- "Value10",
- ],
- [
- TSDataType.STRING,
- TSDataType.STRING,
- TSDataType.BOOLEAN,
- TSDataType.INT32,
- TSDataType.INT64,
- TSDataType.FLOAT,
- TSDataType.DOUBLE,
- TSDataType.TEXT,
- TSDataType.STRING,
- TSDataType.BLOB,
- TSDataType.TIMESTAMP,
- TSDataType.DATE,
- ],
- max_row_num,
- )
- for i in range(max_row_num):
- tablet.add_timestamp(i, i)
- tablet.add_value_by_name("Device1", i, "d1_" + str(i))
- tablet.add_value_by_name("Device2", i, "d2_" + str(i))
- tablet.add_value_by_name("Value1", i, i % 2 == 0)
- tablet.add_value_by_name("Value2", i, i * 3)
- tablet.add_value_by_name("Value3", i, i * 4)
- tablet.add_value_by_name("Value4", i, i * 5.5)
- tablet.add_value_by_name("Value5", i, i * 6.6)
- tablet.add_value_by_name("Value6", i, f"string_value_{i}")
- tablet.add_value_by_name("Value7", i, f"text_value_{i}")
- tablet.add_value_by_name("Value8", i,
f"blob_data_{i}".encode('utf-8'))
- tablet.add_value_by_name("Value9", i, i * 9)
- tablet.add_value_by_name("Value10", i, date(2025, 1, i % 20 +
1))
- dateSet.add(date(2025, 1, i % 20 + 1))
- writer.write_table(tablet)
-
- df1_1 = to_dataframe(tsfile_path)
- assert df1_1.shape[0] == max_row_num
- for i in range(max_row_num):
- assert df1_1.iloc[i, 1] == "d1_" + str(df1_1.iloc[i, 0])
- assert df1_1.iloc[i, 2] == "d2_" + str(df1_1.iloc[i, 0])
-
- df2_1 = to_dataframe(tsfile_path, column_names=["Value1"])
- for i in range(max_row_num):
- assert df2_1.iloc[i, 1] == np.bool_(df2_1.iloc[i, 0] % 2 == 0)
- df2_2 = to_dataframe(tsfile_path, column_names=["Value2"])
- for i in range(max_row_num):
- assert df2_2.iloc[i, 1] == np.int32(df2_2.iloc[i, 0] * 3)
- df2_3 = to_dataframe(tsfile_path, column_names=["Value3"])
- for i in range(max_row_num):
- assert df2_3.iloc[i, 1] == np.int64(df2_3.iloc[i, 0] * 4)
- df2_4 = to_dataframe(tsfile_path, column_names=["Value4"])
- for i in range(max_row_num):
- assert df2_4.iloc[i, 1] == np.float32(df2_4.iloc[i, 0] * 5.5)
- df2_5 = to_dataframe(tsfile_path, column_names=["Value5"])
- for i in range(max_row_num):
- assert df2_5.iloc[i, 1] == np.float64(df2_5.iloc[i, 0] * 6.6)
- df2_6 = to_dataframe(tsfile_path, column_names=["Value6"])
- for i in range(max_row_num):
- assert df2_6.iloc[i, 1] == f"string_value_{df2_6.iloc[i, 0]}"
- df2_7 = to_dataframe(tsfile_path, column_names=["Value7"])
- for i in range(max_row_num):
- assert df2_7.iloc[i, 1] == f"text_value_{df2_7.iloc[i, 0]}"
- df2_8 = to_dataframe(tsfile_path, column_names=["Value8"])
- for i in range(max_row_num):
- assert df2_8.iloc[i, 1] == f"blob_data_{df2_8.iloc[i,
0]}".encode('utf-8')
- df2_9 = to_dataframe(tsfile_path, column_names=["Value9"])
- for i in range(max_row_num):
- assert df2_9.iloc[i, 1] == np.int64(df2_9.iloc[i, 0] * 9)
- df2_10 = to_dataframe(tsfile_path, column_names=["Value10"])
- for i in range(max_row_num):
- assert df2_10.iloc[i, 1] in dateSet
- df2_11 = to_dataframe(tsfile_path, column_names=["Device1", "Value1"])
- for i in range(max_row_num):
- assert df2_11.iloc[i, 1] == "d1_" + str(df2_11.iloc[i, 0])
- assert df2_11.iloc[i, 2] == np.bool_(df2_11.iloc[i, 0] % 2 == 0)
- df2_12 = to_dataframe(
- tsfile_path,
- column_names=[
- "Device1",
- "Device2",
- "Value1",
- "Value2",
- "Value3",
- "Value4",
- "Value5",
- "Value6",
- "Value7",
- "Value8",
- "Value9",
- "Value10",
- ],
- )
- for i in range(max_row_num):
- assert df2_12.iloc[i, 1] == "d1_" + str(df2_12.iloc[i, 0])
- assert df2_12.iloc[i, 2] == "d2_" + str(df2_12.iloc[i, 0])
- assert df2_12.iloc[i, 3] == np.bool_(df2_12.iloc[i, 0] % 2 == 0)
- assert df2_12.iloc[i, 4] == np.int32(df2_12.iloc[i, 0] * 3)
- assert df2_12.iloc[i, 5] == np.int64(df2_12.iloc[i, 0] * 4)
- assert df2_12.iloc[i, 6] == np.float32(df2_12.iloc[i, 0] * 5.5)
- assert df2_12.iloc[i, 7] == np.float64(df2_12.iloc[i, 0] * 6.6)
- assert df2_12.iloc[i, 8] == f"string_value_{df2_12.iloc[i, 0]}"
- assert df2_12.iloc[i, 9] == f"text_value_{df2_12.iloc[i, 0]}"
- assert df2_12.iloc[i, 10] == f"blob_data_{df2_12.iloc[i,
0]}".encode(
- "utf-8"
- )
- assert df2_12.iloc[i, 11] == np.int64(df2_12.iloc[i, 0] * 9)
- assert df2_12.iloc[i, 12] == date(2025, 1, df2_12.iloc[i, 0] % 20
+ 1)
- df2_13 = to_dataframe(
- tsfile_path, column_names=["Device1", "Device2", "Value1"]
- )
- for i in range(max_row_num):
- assert df2_13.iloc[i, 1] == "d1_" + str(df2_13.iloc[i, 0])
- assert df2_13.iloc[i, 2] == "d2_" + str(df2_13.iloc[i, 0])
- assert df2_13.iloc[i, 3] == np.bool_(df2_13.iloc[i, 0] % 2 == 0)
-
- df3_1 = to_dataframe(tsfile_path, table_name="test_table")
- assert df3_1.shape[0] == max_row_num
- assert df3_1.iloc[0, 0] == 0
- df3_2 = to_dataframe(tsfile_path, table_name="TEST_TABLE")
- assert df3_2.shape[0] == max_row_num
- assert df3_2.iloc[0, 0] == 0
-
- df4_1 = to_dataframe(tsfile_path, start_time=10)
- assert df4_1.shape[0] == 90
- df4_2 = to_dataframe(tsfile_path, start_time=-10)
- assert df4_2.shape[0] == max_row_num
- df4_3 = to_dataframe(tsfile_path, end_time=5)
- assert df4_3.shape[0] == 6
- df4_4 = to_dataframe(tsfile_path, end_time=-5)
- assert df4_4.shape[0] == 0
- df4_5 = to_dataframe(tsfile_path, start_time=5, end_time=5)
- assert df4_5.shape[0] == 1
- df4_6 = to_dataframe(tsfile_path, start_time=-5, end_time=-5)
- assert df4_6.shape[0] == 0
- df4_7 = to_dataframe(tsfile_path, start_time=10, end_time=-10)
- assert df4_7.shape[0] == 0
- df4_8 = to_dataframe(tsfile_path, start_time=-10, end_time=10)
- assert df4_8.shape[0] == 11
- df4_8 = to_dataframe(tsfile_path, start_time=-50, end_time=50)
- assert df4_8.shape[0] == 51
-
- df5_1 = to_dataframe(tsfile_path, max_row_num=1)
- assert df5_1.shape[0] == 1
- df5_2 = to_dataframe(tsfile_path, max_row_num=50)
- assert df5_2.shape[0] == 50
- df5_3 = to_dataframe(tsfile_path, max_row_num=100)
- assert df5_3.shape[0] == 100
- df5_4 = to_dataframe(tsfile_path, max_row_num=1000)
- assert df5_4.shape[0] == 100
- df5_5 = to_dataframe(tsfile_path, max_row_num=0)
- assert df5_5.shape[0] == 0
- df5_6 = to_dataframe(tsfile_path, max_row_num=-10)
- assert df5_6.shape[0] == 0
-
- for df6_1 in to_dataframe(tsfile_path, max_row_num=20,
as_iterator=True):
- assert df6_1.shape[0] == 20
- for df6_2 in to_dataframe(tsfile_path, max_row_num=1000,
as_iterator=True):
- assert df6_2.shape[0] == 100
-
- for df7_1 in to_dataframe(
- tsfile_path,
- table_name="test_table",
- column_names=["Device1", "Value1"],
- start_time=21,
- end_time=50,
- max_row_num=10,
- as_iterator=True,
- ):
- assert df7_1.shape[0] == 10
- for i in range(30):
- assert df2_11.iloc[i, 1] == "d1_" + str(df2_11.iloc[i, 0])
- assert df2_11.iloc[i, 2] == np.bool_(df2_11.iloc[i, 0] % 2 ==
0)
-
- try:
- to_dataframe(tsfile_path, table_name="non_existent_table")
- except TableNotExistError as e:
- assert e.args[0] == "[non_existent_table] Requested table does not
exist"
-
- try:
- to_dataframe(tsfile_path, column_names=["non_existent_column"])
- except ColumnNotExistError as e:
- assert e.args[0] == "[non_existent_column] Column does not exist"
-
- finally:
- if os.path.exists(tsfile_path):
- os.remove(tsfile_path)
-
+ if os.path.exists("tablet_write_and_read.tsfile"):
+ os.remove("tablet_write_and_read.tsfile")
-import os
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)))
diff --git a/python/tsfile/__init__.py b/python/tsfile/__init__.py
index bf755fce..a9237257 100644
--- a/python/tsfile/__init__.py
+++ b/python/tsfile/__init__.py
@@ -34,4 +34,4 @@ from .tsfile_reader import TsFileReaderPy as TsFileReader,
ResultSetPy as Result
from .tsfile_writer import TsFileWriterPy as TsFileWriter
from .tsfile_py_cpp import get_tsfile_config, set_tsfile_config
from .tsfile_table_writer import TsFileTableWriter
-from .utils import to_dataframe
\ No newline at end of file
+from .utils import to_dataframe, dataframe_to_tsfile
\ No newline at end of file
diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py
index 7d1f5ff5..18da3aef 100644
--- a/python/tsfile/constants.py
+++ b/python/tsfile/constants.py
@@ -17,6 +17,9 @@
#
from enum import unique, IntEnum
+import numpy as np
+
+TIME_COLUMN = "time"
@unique
class TSDataType(IntEnum):
@@ -31,6 +34,11 @@ class TSDataType(IntEnum):
BLOB = 10
STRING = 11
+    def is_compatible_with(self, other: 'TSDataType') -> bool:
+        """Tell whether ``other`` is this type or one of its accepted source types.
+
+        Identical types are always compatible; otherwise ``other`` must be
+        listed for ``self`` in ``_TSDATATYPE_COMPATIBLE_SOURCES``.
+        """
+        accepted_sources = _TSDATATYPE_COMPATIBLE_SOURCES.get(self, ())
+        return True if self == other else other in accepted_sources
+
def to_py_type(self):
if self == TSDataType.BOOLEAN:
return bool
@@ -62,20 +70,100 @@ class TSDataType(IntEnum):
elif self == TSDataType.INT64:
return "Int64"
elif self == TSDataType.FLOAT:
- return "float32"
+ return "Float32"
elif self == TSDataType.DOUBLE:
- return "float64"
+ return "Float64"
elif self == TSDataType.TEXT or self == TSDataType.STRING:
return "object"
elif self == TSDataType.TIMESTAMP:
- return "int64"
+ return "Int64"
elif self == TSDataType.DATE:
return "object"
elif self == TSDataType.BLOB:
- return "bytes"
+ return "object"
else:
raise ValueError(f"Unknown data type: {self}")
+    @classmethod
+    def from_pandas_datatype(cls, dtype):
+        """Map a pandas/NumPy dtype (or a dtype name) to a ``TSDataType`` member.
+
+        Resolution order:
+          1. ``dtype`` is one of the NumPy scalar classes ``np.bool_``,
+             ``np.int32``, ``np.int64``, ``np.float32``, ``np.float64``,
+             ``np.object_`` (identity comparison).
+          2. ``dtype`` is a ``pd.StringDtype`` instance.
+          3. ``dtype.type`` (when the attribute exists) is one of the classes
+             from step 1.
+          4. The textual name, e.g. ``'Int64'``, ``'float32'``, ``'bytes'``,
+             ``'datetime64[ns]'``.
+
+        Args:
+            dtype: a dtype object, a NumPy scalar class, or a dtype name.
+
+        Returns:
+            The matching member of ``cls``; ``cls.STRING`` when nothing matches.
+        """
+        numpy_scalars = (
+            (np.bool_, cls.BOOLEAN),
+            (np.int32, cls.INT32),
+            (np.int64, cls.INT64),
+            (np.float32, cls.FLOAT),
+            (np.float64, cls.DOUBLE),
+            (np.object_, cls.STRING),
+        )
+
+        def match_numpy_scalar(candidate):
+            # Identity, not equality: only the exact scalar classes qualify.
+            for scalar, ts_type in numpy_scalars:
+                if candidate is scalar:
+                    return ts_type
+            return None
+
+        # Compare against None explicitly: enum members may have the value 0.
+        matched = match_numpy_scalar(dtype)
+        if matched is not None:
+            return matched
+
+        try:
+            import pandas as pd
+            if hasattr(pd, 'StringDtype') and isinstance(dtype, pd.StringDtype):
+                return cls.STRING
+        except (ImportError, AttributeError):
+            pass
+
+        # Keep the name of the dtype exactly as it was passed in. Below, `dtype`
+        # is replaced by `dtype.type`, whose str() is "<class 'numpy.datetime64'>"
+        # for np.dtype('datetime64[ns]'); checking only that string made the
+        # datetime64 -> TIMESTAMP rule unreachable for real dtype objects.
+        original_name = str(dtype)
+
+        if hasattr(dtype, 'type'):
+            dtype = dtype.type
+            matched = match_numpy_scalar(dtype)
+            if matched is not None:
+                return matched
+
+        dtype_str = str(dtype)
+
+        if 'stringdtype' in dtype_str.lower() or dtype_str.startswith('string'):
+            return cls.STRING
+
+        dtype_map = {
+            'bool': cls.BOOLEAN,
+            'boolean': cls.BOOLEAN,
+            'int32': cls.INT32,
+            'Int32': cls.INT32,
+            'int64': cls.INT64,
+            'Int64': cls.INT64,
+            'float32': cls.FLOAT,
+            'float64': cls.DOUBLE,
+            'bytes': cls.BLOB,
+            'object': cls.STRING,
+            'string': cls.STRING,
+        }
+
+        if dtype_str in dtype_map:
+            return dtype_map[dtype_str]
+
+        # Second chance for capitalised names such as 'Float32' / 'Float64'.
+        dtype_lower = dtype_str.lower()
+        if dtype_lower in dtype_map:
+            return dtype_map[dtype_lower]
+
+        if 'object_' in dtype_lower or dtype_str == "<class 'numpy.object_'>":
+            return cls.STRING
+
+        if dtype_str.startswith('datetime64') or original_name.startswith('datetime64'):
+            return cls.TIMESTAMP
+
+        return cls.STRING
+
+
+# Lookup table read by TSDataType.is_compatible_with(): each key maps to the
+# tuple of *other* types for which key.is_compatible_with(other) is True.
+# Identical types are handled before this table is consulted, and a type that
+# is missing as a key is compatible only with itself.
+_TSDATATYPE_COMPATIBLE_SOURCES = {
+ TSDataType.INT64: (TSDataType.INT32, TSDataType.TIMESTAMP),
+ TSDataType.STRING: (TSDataType.TEXT,),
+ TSDataType.TEXT: (TSDataType.STRING,),
+ TSDataType.DOUBLE: (TSDataType.FLOAT,),
+ TSDataType.TIMESTAMP: (TSDataType.INT64, TSDataType.INT32)
+}
+
@unique
class TSEncoding(IntEnum):
@@ -112,3 +200,5 @@ class Compressor(IntEnum):
class ColumnCategory(IntEnum):
TAG = 0
FIELD = 1
+ ATTRIBUTE = 2
+ TIME = 3
diff --git a/python/tsfile/exceptions.py b/python/tsfile/exceptions.py
index 2a3df283..a02f392c 100644
--- a/python/tsfile/exceptions.py
+++ b/python/tsfile/exceptions.py
@@ -23,7 +23,7 @@ class LibraryError(Exception):
def __init__(self, code=None, context=None):
self.code = code if code is not None else self._default_code
self.message = context if context is not None else
self._default_message
- super().__init__(f"[{code}] {self.message}")
+ super().__init__(f"[{self.code}] {self.message}")
def __str__(self):
return f"{self.code}: {self.message}"
diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py
index 3aa1313c..c89649bf 100644
--- a/python/tsfile/schema.py
+++ b/python/tsfile/schema.py
@@ -17,6 +17,7 @@
#
from typing import List
+from .exceptions import TypeMismatchError
from .constants import TSDataType, ColumnCategory, TSEncoding, Compressor
@@ -53,7 +54,6 @@ class TimeseriesSchema:
return f"TimeseriesSchema({self.timeseries_name},
{self.data_type.name}, {self.encoding_type.name}, {self.compression_type.name})"
-
class DeviceSchema:
"""Represents a device entity containing multiple time series."""
@@ -73,6 +73,7 @@ class DeviceSchema:
def __repr__(self):
return f"DeviceSchema({self.device_name}, {self.timeseries_list})"
+
class ColumnSchema:
"""Defines schema for a table column (name, datatype, category)."""
@@ -85,6 +86,11 @@ class ColumnSchema:
self.column_name = column_name.lower()
if data_type is None:
raise ValueError("Data type cannot be None")
+ if category == ColumnCategory.TIME and data_type not in
[TSDataType.INT64, TSDataType.TIMESTAMP]:
+ raise TypeError(f"Time Column should have type : INT64/Timestamp,"
+ f" but got {data_type}")
+ elif category == ColumnCategory.TAG and data_type not in
[TSDataType.STRING, TSDataType.TEXT]:
+ raise TypeMismatchError(context="Tag column should be string or
text")
self.data_type = data_type
self.category = category
@@ -105,6 +111,7 @@ class TableSchema:
"""Schema definition for a table structure."""
table_name = None
columns = None
+ time_column = None
def __init__(self, table_name: str, columns: List[ColumnSchema]):
if table_name is None or len(table_name) == 0:
@@ -113,6 +120,14 @@ class TableSchema:
if len(columns) == 0:
raise ValueError("Columns cannot be empty")
self.columns = columns
+ for column in columns:
+ if column.get_category() == ColumnCategory.TIME:
+ if self.time_column is not None:
+ raise ValueError(
+ f"Table '{self.table_name}' cannot have multiple time
columns: "
+ f"'{self.time_column.get_column_name()}' and
'{column.get_column_name()}'"
+ )
+ self.time_column = column
def get_table_name(self):
return self.table_name
@@ -120,9 +135,49 @@ class TableSchema:
def get_columns(self):
return self.columns
+ def get_column(self, column_name: str):
+ name_lower = column_name.lower()
+ for col in self.columns:
+ if col.get_column_name() == name_lower:
+ return col
+ return None
+
+ def get_time_column(self):
+ """Return the schema's TIME column (``self.time_column``), or None if none has been set."""
+ return self.time_column
+
def get_column_names(self):
return [name.get_column_name() for name in self.columns]
+ def get_field_columns(self):
+ return [
+ column
+ for column in self.columns
+ if column.get_category() == ColumnCategory.FIELD
+ ]
+
+ def get_tag_columns(self):
+ return [
+ column
+ for column in self.columns
+ if column.get_category() == ColumnCategory.TAG
+ ]
+
+ def add_column(self, column: ColumnSchema):
+ if column.get_category() == ColumnCategory.TIME:
+ if self.time_column is not None:
+ raise ValueError(
+ f"Table '{self.table_name}' cannot have multiple time
columns: "
+ f"'{self.time_column.name}' and '{column.name}'"
+ )
+ self.time_column = column
+ else:
+ for col in self.columns:
+ if col.get_column_name() == column.get_column_name():
+ raise ValueError(
+ f"Duplicate column name {col.get_column_name()}"
+ )
+ self.columns.append(column)
+
def __repr__(self) -> str:
return f"TableSchema({self.table_name}, {self.columns})"
@@ -140,6 +195,13 @@ class ResultSetMetaData:
def set_table_name(self, table_name: str):
self.table_name = table_name
+ def add_column_at(self, index: int, column_name: str, data_type:
TSDataType):
+ """Insert a column and its data type at the given position (0-based
index)."""
+ if index < 0 or index > len(self.column_list):
+ raise IndexError(f"column index {index} out of range (0 to
{len(self.column_list)})")
+ self.column_list.insert(index, column_name)
+ self.data_types.insert(index, data_type)
+
def get_data_type(self, column_index: int) -> TSDataType:
if column_index < 1 or column_index > len(self.column_list):
raise OverflowError
diff --git a/python/tsfile/tsfile_cpp.pxd b/python/tsfile/tsfile_cpp.pxd
index 40bff4eb..9c65fb26 100644
--- a/python/tsfile/tsfile_cpp.pxd
+++ b/python/tsfile/tsfile_cpp.pxd
@@ -76,7 +76,10 @@ cdef extern from "./tsfile_cwrapper.h":
ctypedef enum ColumnCategory:
TAG = 0,
- FIELD = 1
+ FIELD = 1,
+ ATTRIBUTE = 2,
+ TIME = 3
+
# struct types
ctypedef struct ColumnSchema:
@@ -137,7 +140,8 @@ cdef extern from "./tsfile_cwrapper.h":
TSDataType * data_types,
int column_num, int max_rows);
- Tablet tablet_new(const char** column_names, TSDataType * data_types, int
column_num);
+ Tablet tablet_new(char** column_name_list, TSDataType* data_types,
+ uint32_t column_num, uint32_t max_rows);
ErrorCode tablet_add_timestamp(Tablet tablet, uint32_t row_index, int64_t
timestamp);
ErrorCode tablet_add_value_by_index_int64_t(Tablet tablet, uint32_t
row_index, uint32_t column_index,
diff --git a/python/tsfile/tsfile_py_cpp.pxd b/python/tsfile/tsfile_py_cpp.pxd
index e44bb588..2389aa9a 100644
--- a/python/tsfile/tsfile_py_cpp.pxd
+++ b/python/tsfile/tsfile_py_cpp.pxd
@@ -33,6 +33,7 @@ cdef public api DeviceSchema* to_c_device_schema(object
py_schema)
cdef public api ColumnSchema* to_c_column_schema(object py_schema)
cdef public api TableSchema* to_c_table_schema(object py_schema)
cdef public api Tablet to_c_tablet(object tablet)
+cdef public api Tablet dataframe_to_c_tablet(object target_name, object
dataframe, object table_schema)
cdef public api TsRecord to_c_record(object row_record)
cdef public api void free_c_table_schema(TableSchema* c_schema)
cdef public api void free_c_column_schema(ColumnSchema* c_schema)
diff --git a/python/tsfile/tsfile_py_cpp.pyx b/python/tsfile/tsfile_py_cpp.pyx
index d9924d7a..3ca79a2a 100644
--- a/python/tsfile/tsfile_py_cpp.pyx
+++ b/python/tsfile/tsfile_py_cpp.pyx
@@ -16,9 +16,13 @@
# under the License.
#
#cython: language_level=3
+from datetime import date as date_type
from .date_utils import parse_date_to_int
from .tsfile_cpp cimport *
+import pandas as pd
+import numpy as np
+
from libc.stdlib cimport free
from libc.stdlib cimport malloc
from libc.string cimport strdup
@@ -26,7 +30,7 @@ from cpython.exc cimport PyErr_SetObject
from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_AsUTF8,
PyUnicode_AsUTF8AndSize
from cpython.bytes cimport PyBytes_AsString, PyBytes_AsStringAndSize
-from tsfile.exceptions import ERROR_MAPPING
+from tsfile.exceptions import ERROR_MAPPING, TypeMismatchError
from tsfile.schema import ResultSetMetaData as ResultSetMetaDataPy
from tsfile.schema import TSDataType as TSDataTypePy, TSEncoding as
TSEncodingPy
from tsfile.schema import Compressor as CompressorPy, ColumnCategory as
CategoryPy
@@ -130,7 +134,9 @@ cdef dict COMPRESSION_TYPE_MAP = {
cdef dict CATEGORY_MAP = {
CategoryPy.TAG: ColumnCategory.TAG,
- CategoryPy.FIELD: ColumnCategory.FIELD
+ CategoryPy.FIELD: ColumnCategory.FIELD,
+ CategoryPy.ATTRIBUTE: ColumnCategory.ATTRIBUTE,
+ CategoryPy.TIME: ColumnCategory.TIME
}
cdef TSDataType to_c_data_type(object data_type):
@@ -220,7 +226,7 @@ cdef Tablet to_c_tablet(object tablet):
cdef TSDataType * column_types
cdef bytes row_bytes
cdef char *raw_str
- cdef const char* str_ptr
+ cdef const char * str_ptr
cdef Py_ssize_t raw_len
if tablet.get_target_name() is not None:
@@ -293,7 +299,7 @@ cdef Tablet to_c_tablet(object tablet):
for row in range(max_row_num):
if value[row] is not None:
py_value = value[row]
- str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len)
+ str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len)
tablet_add_value_by_index_string_with_len(ctablet, row,
col, str_ptr, raw_len)
elif data_type == TS_DATATYPE_BLOB:
@@ -304,13 +310,172 @@ cdef Tablet to_c_tablet(object tablet):
return ctablet
+cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe, object table_schema):
+    # Build a C Tablet for table ``target_name`` from a pandas DataFrame.
+    #
+    # Timestamps are read from the schema's TIME column when
+    # table_schema.get_time_column() returns one, otherwise from the DataFrame index.
+    # Every other DataFrame column becomes a tablet column typed by
+    # table_schema.get_column(name).get_data_type(). Cells for which pd.isna() is true
+    # are skipped and never written into the tablet.
+    #
+    # Raises ValueError when there is no data column or a DataFrame column is missing
+    # from the schema, MemoryError when the temporary name/type arrays cannot be
+    # allocated, and TypeMismatchError when the first non-null value of a
+    # DATE/STRING/TEXT/BLOB column has an unexpected Python type.
+    #
+    # The returned tablet must be released by the caller
+    # (TsFileWriterPy.write_dataframe does so with free_c_tablet).
+    cdef Tablet ctablet
+    cdef int max_row_num
+    cdef TSDataType data_type
+    cdef int64_t timestamp
+    cdef const char * device_id_c = NULL
+    cdef char** columns_names
+    cdef TSDataType * columns_types
+    cdef char *raw_str
+    cdef const char * str_ptr
+    cdef Py_ssize_t raw_len
+    cdef int column_num
+    cdef int i, row
+    cdef object value
+    cdef object py_value
+    cdef object value_bytes
+
+    # device_id_bytes must stay referenced while device_id_c points into it.
+    device_id_bytes = PyUnicode_AsUTF8String(target_name.lower())
+    device_id_c = device_id_bytes
+    df_columns = list(dataframe.columns)
+
+    time_column = table_schema.get_time_column()
+    use_id_as_time = time_column is None
+    time_column_name = None if time_column is None else time_column.get_column_name()
+
+    data_columns = [col for col in df_columns if col != time_column_name]
+    column_num = len(data_columns)
+
+    if column_num == 0:
+        raise ValueError("DataFrame must have at least one data column besides 'time'")
+
+    max_row_num = len(dataframe)
+
+    column_types_list = []
+    for column in data_columns:
+        schema_column = table_schema.get_column(column)
+        if schema_column is None:
+            # Fail with a clear message instead of AttributeError on None.
+            raise ValueError(f"Column '{column}' is not defined in table schema")
+        data_type = schema_column.get_data_type()
+        column_types_list.append(data_type)
+
+    # Per-column validation for object types (check first non-null value only).
+    # This runs before any C memory is allocated so that a TypeMismatchError cannot
+    # leak a half-filled tablet.
+    for col in range(column_num):
+        col_name = data_columns[col]
+        data_type = column_types_list[col]
+        if data_type in (TS_DATATYPE_DATE, TS_DATATYPE_STRING, TS_DATATYPE_TEXT, TS_DATATYPE_BLOB):
+            col_series = dataframe[col_name]
+            first_valid_idx = col_series.first_valid_index()
+            if first_valid_idx is not None:
+                value = col_series[first_valid_idx]
+                if data_type == TS_DATATYPE_DATE:
+                    if not isinstance(value, date_type):
+                        raise TypeMismatchError(context=
+                            f"Column '{col_name}': expected DATE (datetime.date), "
+                            f"got {type(value).__name__}: {value!r}"
+                        )
+                elif data_type in (TS_DATATYPE_STRING, TS_DATATYPE_TEXT):
+                    if not isinstance(value, str):
+                        raise TypeMismatchError(context=
+                            f"Column '{col_name}': expected STRING/TEXT, "
+                            f"got {type(value).__name__}: {value!r}"
+                        )
+                elif data_type == TS_DATATYPE_BLOB:
+                    # bytearray is accepted as well: the error text advertises it and
+                    # the fill loop below converts non-bytes values with bytes(value).
+                    if not isinstance(value, (bytes, bytearray)):
+                        raise TypeMismatchError(context=
+                            f"Column '{col_name}': expected BLOB (bytes or bytearray), "
+                            f"got {type(value).__name__}: {value!r}"
+                        )
+
+    columns_names = <char**> malloc(sizeof(char *) * column_num)
+    columns_types = <TSDataType *> malloc(sizeof(TSDataType) * column_num)
+    if columns_names == NULL or columns_types == NULL:
+        # free(NULL) is a no-op, so both pointers can be released unconditionally.
+        free(columns_names)
+        free(columns_types)
+        raise MemoryError()
+
+    for i in range(column_num):
+        columns_names[i] = strdup(data_columns[i].lower().encode('utf-8'))
+        columns_types[i] = column_types_list[i]
+
+    ctablet = _tablet_new_with_target_name(device_id_c, columns_names, columns_types, column_num,
+                                           max_row_num)
+
+    # The temporary name/type arrays are released as soon as the tablet exists.
+    free(columns_types)
+    for i in range(column_num):
+        free(columns_names[i])
+    free(columns_names)
+
+    if use_id_as_time:
+        for row in range(max_row_num):
+            timestamp_py = dataframe.index[row]
+            if pd.isna(timestamp_py):
+                continue
+            timestamp = <int64_t> timestamp_py
+            tablet_add_timestamp(ctablet, row, timestamp)
+    else:
+        time_values = dataframe[time_column.get_column_name()].values
+        for row in range(max_row_num):
+            timestamp_py = time_values[row]
+            if pd.isna(timestamp_py):
+                continue
+            timestamp = <int64_t> timestamp_py
+            tablet_add_timestamp(ctablet, row, timestamp)
+
+    for col in range(column_num):
+        col_name = data_columns[col]
+        data_type = column_types_list[col]
+        column_values = dataframe[col_name].values
+
+        # BOOLEAN
+        if data_type == TS_DATATYPE_BOOLEAN:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_bool(ctablet, row, col, <bint> value)
+        # INT32
+        elif data_type == TS_DATATYPE_INT32:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_int32_t(ctablet, row, col, <int32_t> value)
+        # INT64
+        elif data_type == TS_DATATYPE_INT64 or data_type == TS_DATATYPE_TIMESTAMP:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_int64_t(ctablet, row, col, <int64_t> value)
+        # FLOAT
+        elif data_type == TS_DATATYPE_FLOAT:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_float(ctablet, row, col, <float> value)
+        # DOUBLE
+        elif data_type == TS_DATATYPE_DOUBLE:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_double(ctablet, row, col, <double> value)
+        # DATE (validated per-column above)
+        elif data_type == TS_DATATYPE_DATE:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    tablet_add_value_by_index_int32_t(ctablet, row, col, parse_date_to_int(value))
+        # STRING or TEXT (validated per-column above)
+        elif data_type == TS_DATATYPE_STRING or data_type == TS_DATATYPE_TEXT:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    py_value = str(value)
+                    str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len)
+                    tablet_add_value_by_index_string_with_len(ctablet, row, col, str_ptr, raw_len)
+        # BLOB (validated per-column above)
+        elif data_type == TS_DATATYPE_BLOB:
+            for row in range(max_row_num):
+                value = column_values[row]
+                if not pd.isna(value):
+                    if isinstance(value, bytes):
+                        PyBytes_AsStringAndSize(value, &raw_str, &raw_len)
+                        tablet_add_value_by_index_string_with_len(ctablet, row, col, raw_str, raw_len)
+                    else:
+                        # PyBytes_AsStringAndSize needs a real bytes object.
+                        value_bytes = bytes(value)
+                        PyBytes_AsStringAndSize(value_bytes, &raw_str, &raw_len)
+                        tablet_add_value_by_index_string_with_len(ctablet, row, col, raw_str, raw_len)
+
+    return ctablet
+
cdef TsRecord to_c_record(object row_record):
cdef int field_num = row_record.get_fields_num()
cdef int64_t timestamp = <int64_t> row_record.get_timestamp()
cdef bytes device_id_bytes =
PyUnicode_AsUTF8String(row_record.get_device_id())
- cdef const char* device_id = device_id_bytes
- cdef const char* str_ptr
- cdef char* blob_ptr
+ cdef const char * device_id = device_id_bytes
+ cdef const char * str_ptr
+ cdef char * blob_ptr
cdef Py_ssize_t str_len
cdef TsRecord record
cdef int i
@@ -320,9 +485,11 @@ cdef TsRecord to_c_record(object row_record):
field = row_record.get_fields()[i]
data_type = to_c_data_type(field.get_data_type())
if data_type == TS_DATATYPE_BOOLEAN:
- _insert_data_into_ts_record_by_name_bool(record,
PyUnicode_AsUTF8(field.get_field_name()), field.get_bool_value())
+ _insert_data_into_ts_record_by_name_bool(record,
PyUnicode_AsUTF8(field.get_field_name()),
+ field.get_bool_value())
elif data_type == TS_DATATYPE_INT32 or data_type == TS_DATATYPE_DATE:
- _insert_data_into_ts_record_by_name_int32_t(record,
PyUnicode_AsUTF8(field.get_field_name()), field.get_int_value())
+ _insert_data_into_ts_record_by_name_int32_t(record,
PyUnicode_AsUTF8(field.get_field_name()),
+ field.get_int_value())
elif data_type == TS_DATATYPE_INT64:
_insert_data_into_ts_record_by_name_int64_t(record,
PyUnicode_AsUTF8(field.get_field_name()),
field.get_long_value())
@@ -333,15 +500,17 @@ cdef TsRecord to_c_record(object row_record):
_insert_data_into_ts_record_by_name_double(record,
PyUnicode_AsUTF8(field.get_field_name()),
field.get_double_value())
elif data_type == TS_DATATYPE_FLOAT:
- _insert_data_into_ts_record_by_name_float(record,
PyUnicode_AsUTF8(field.get_field_name()), field.get_float_value())
+ _insert_data_into_ts_record_by_name_float(record,
PyUnicode_AsUTF8(field.get_field_name()),
+ field.get_float_value())
elif data_type == TS_DATATYPE_TEXT or data_type == TS_DATATYPE_STRING:
- str_ptr = PyUnicode_AsUTF8AndSize(field.get_string_value(),
&str_len)
- _insert_data_into_ts_record_by_name_string_with_len(record,
PyUnicode_AsUTF8(field.get_field_name()), str_ptr, str_len)
+ str_ptr = PyUnicode_AsUTF8AndSize(field.get_string_value(),
&str_len)
+ _insert_data_into_ts_record_by_name_string_with_len(record,
PyUnicode_AsUTF8(field.get_field_name()),
+ str_ptr,
str_len)
elif data_type == TS_DATATYPE_BLOB:
if PyBytes_AsStringAndSize(field.get_string_value(), &blob_ptr,
&str_len) < 0:
raise ValueError("blob not legal")
_insert_data_into_ts_record_by_name_string_with_len(record,
PyUnicode_AsUTF8(field.get_field_name()),
- <const char*>
blob_ptr, <uint32_t>str_len)
+ <const char *>
blob_ptr, <uint32_t> str_len)
return record
# Free c structs' space
diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx
index 359492d6..4476d24d 100644
--- a/python/tsfile/tsfile_reader.pyx
+++ b/python/tsfile/tsfile_reader.pyx
@@ -19,11 +19,12 @@
#cython: language_level=3
import weakref
-from email.contentmanager import raw_data_manager
from typing import List
import pandas as pd
from libc.stdint cimport INT64_MIN, INT64_MAX
+from libc.string cimport strlen
+from cpython.bytes cimport PyBytes_FromStringAndSize
from tsfile.schema import TSDataType as TSDataTypePy
from .date_utils import parse_int_to_date
@@ -152,7 +153,6 @@ cdef class ResultSetPy:
# Well when we check is null, id from 0, so there index -1.
if tsfile_result_set_is_null_by_index(self.result, index):
return None
- # data type in metadata is an array, id from 0.
data_type = self.metadata.get_data_type(index)
if data_type == TSDataTypePy.INT32:
return tsfile_result_set_get_value_by_index_int32_t(self.result,
index)
@@ -166,7 +166,7 @@ cdef class ResultSetPy:
return tsfile_result_set_get_value_by_index_double(self.result,
index)
elif data_type == TSDataTypePy.BOOLEAN:
return tsfile_result_set_get_value_by_index_bool(self.result,
index)
- elif data_type == TSDataTypePy.STRING or data_type ==
TSDataTypePy.TEXT or data_type == TSDataTypePy.BLOB:
+ elif data_type == TSDataTypePy.STRING or data_type ==
TSDataTypePy.TEXT:
try:
string =
tsfile_result_set_get_value_by_index_string(self.result, index)
if string == NULL:
@@ -174,6 +174,14 @@ cdef class ResultSetPy:
return string.decode('utf-8')
finally:
pass
+ elif data_type == TSDataTypePy.BLOB:
+ try:
+ string =
tsfile_result_set_get_value_by_index_string(self.result, index)
+ if string == NULL:
+ return None
+ return PyBytes_FromStringAndSize(string, strlen(string))
+ finally:
+ pass
def get_value_by_name(self, column_name : str):
"""
@@ -287,7 +295,7 @@ cdef class TsFileReaderPy:
return pyresult
def query_table_on_tree(self, column_names : List[str],
- start_time : int = INT64_MIN, end_time : int = INT64_MAX)
-> ResultSetPy:
+ start_time : int = INT64_MIN, end_time : int =
INT64_MAX) -> ResultSetPy:
"""
Execute a time range query on specified columns on tree structure.
:return: query result handler.
diff --git a/python/tsfile/tsfile_table_writer.py
b/python/tsfile/tsfile_table_writer.py
index 28193360..0346fd52 100644
--- a/python/tsfile/tsfile_table_writer.py
+++ b/python/tsfile/tsfile_table_writer.py
@@ -15,9 +15,63 @@
# specific language governing permissions and limitations
# under the License.
#
+from datetime import date, datetime
-from tsfile import TableSchema, Tablet, TableNotExistError
-from tsfile import TsFileWriter
+import pandas as pd
+
+from tsfile import TableSchema, Tablet, TableNotExistError, ColumnCategory
+from tsfile import TsFileWriter, ColumnSchema
+from tsfile.constants import TSDataType
+from tsfile.exceptions import TypeMismatchError, ColumnNotExistError
+
+
+def validate_dataframe_for_tsfile(df: pd.DataFrame) -> None:
+ if df is None or df.empty:
+ raise ValueError("DataFrame cannot be None or empty")
+
+ columns = list(df.columns)
+
+ seen = set()
+ duplicates = []
+ for c in columns:
+ lower = c.lower()
+ if lower in seen:
+ duplicates.append(c)
+ seen.add(lower)
+ if duplicates:
+ raise ValueError(
+ f"Column names must be unique (case-insensitive). Duplicate
columns: {duplicates}"
+ )
+
+ unsupported = []
+ for col in columns:
+ dtype = df[col].dtype
+ try:
+ TSDataType.from_pandas_datatype(dtype)
+ except (ValueError, TypeError) as e:
+ unsupported.append((col, str(dtype), str(e)))
+
+ if unsupported:
+ msg_parts = [f" - {col}: dtype={dtype}" for col, dtype in unsupported]
+ raise ValueError(
+ "Data types not supported by tsfile:\n" + "\n".join(msg_parts)
+ )
+
+
+def infer_object_column_type(column_series: pd.Series) -> TSDataType:
+ first_valid_idx = column_series.first_valid_index()
+ if first_valid_idx is None:
+ return TSDataType.STRING
+ value = column_series[first_valid_idx]
+ if isinstance(value, (bytes, bytearray)):
+ return TSDataType.BLOB
+ if isinstance(value, (date, datetime)):
+ return TSDataType.DATE
+ if isinstance(value, str):
+ return TSDataType.STRING
+ raise TypeError(
+ f"Cannot infer type from object column: expected str/bytes/date, got
{type(value).__name__}: {value!r}"
+ )
class TsFileTableWriter:
@@ -39,7 +93,7 @@ class TsFileTableWriter:
"""
self.writer = TsFileWriter(path, memory_threshold)
self.writer.register_table(table_schema)
- self.exclusive_table_name_ = table_schema.get_table_name()
+ self.tableSchema = table_schema
def write_table(self, tablet: Tablet):
"""
@@ -49,11 +103,81 @@ class TsFileTableWriter:
:raise: TableNotExistError if table does not exist or tablet's
table_name does not match tableschema.
"""
if tablet.get_target_name() is None:
- tablet.set_table_name(self.exclusive_table_name_)
- elif self.exclusive_table_name_ is not None and
tablet.get_target_name() != self.exclusive_table_name_:
+ tablet.set_table_name(self.tableSchema.get_table_name())
+ elif (self.tableSchema.get_table_name() is not None
+ and tablet.get_target_name() !=
self.tableSchema.get_table_name()):
raise TableNotExistError
self.writer.write_table(tablet)
+ def write_dataframe(self, dataframe: pd.DataFrame):
+ """
+ Write a pandas DataFrame into table in tsfile.
+ :param dataframe: pandas DataFrame with 'time' column and data columns
matching schema.
+ :return: no return value.
+ :raise: ValueError if dataframe is None or is empty.
+ :raise: ColumnNotExistError if DataFrame columns don't match schema.
+ :raise: TypeMismatchError if DataFrame column types are incompatible
with schema.
+ """
+
+ validate_dataframe_for_tsfile(dataframe)
+
+ # rename columns to lowercase
+ dataframe = dataframe.rename(columns=str.lower)
+ time_column = self.tableSchema.get_time_column()
+ # tag columns used for sorting
+ tag_columns = self.tableSchema.get_tag_columns()
+ if time_column is None:
+ if 'time' in dataframe.columns:
+ dtype =
TSDataType.from_pandas_datatype(dataframe['time'].dtype)
+ if not TSDataType.TIMESTAMP.is_compatible_with(dtype):
+ raise TypeMismatchError(
+ code=27,
+ context=f"time column require INT/Timestamp"
+ )
+
+ self.tableSchema.add_column(ColumnSchema("time",
+ TSDataType.TIMESTAMP,
+ ColumnCategory.TIME))
+ time_column = self.tableSchema.get_time_column()
+
+ type_mismatches = []
+ for col_name in dataframe.columns:
+ if time_column is not None and col_name ==
time_column.get_column_name():
+ continue
+ schema_col = self.tableSchema.get_column(col_name)
+ if schema_col is None:
+ raise ColumnNotExistError(context=f"{col_name} is not define
in table schema")
+ # Object dtype can represent STRING, DATE, TEXT, BLOB; validation
will be performed during insert, skip here
+ if schema_col.get_data_type() in [TSDataType.INT64,
TSDataType.INT32, TSDataType.DOUBLE, TSDataType.FLOAT,
+ TSDataType.BOOLEAN,
TSDataType.TIMESTAMP]:
+ df_dtype = dataframe[col_name].dtype
+ df_ts_type = TSDataType.from_pandas_datatype(df_dtype)
+ expected_ts_type = schema_col.get_data_type()
+
+ if not expected_ts_type.is_compatible_with(df_ts_type):
+ type_mismatches.append(
+ f"Column '{col_name}': expected
{expected_ts_type.name}, got {df_ts_type.name}"
+ )
+
+ if type_mismatches:
+ raise TypeMismatchError(
+ code=27,
+ context=f"Type mismatches: {'; '.join(type_mismatches)}"
+ )
+
+ if time_column:
+ time_column_name = time_column.get_column_name()
+ time_series = dataframe[time_column_name]
+ if time_series.isna().any():
+ raise ValueError(
+ f"Time column '{time_column}' must not contain null/NaN
values"
+ )
+ sort_by = [column.get_column_name() for column in tag_columns]
+ sort_by.append(time_column_name)
+ dataframe = dataframe.sort_values(by=sort_by)
+
+ self.writer.write_dataframe(self.tableSchema.get_table_name(),
dataframe, self.tableSchema)
+
def close(self):
"""
Close TsFileTableWriter and will flush data automatically.
diff --git a/python/tsfile/tsfile_writer.pyx b/python/tsfile/tsfile_writer.pyx
index 20199195..4826ef72 100644
--- a/python/tsfile/tsfile_writer.pyx
+++ b/python/tsfile/tsfile_writer.pyx
@@ -15,21 +15,21 @@
# specific language governing permissions and limitations
# under the License.
#
-
-#cython: language_level=3
-
-from .tsfile_cpp cimport *
-from .tsfile_py_cpp cimport *
+import pandas
from tsfile.row_record import RowRecord
-from tsfile.schema import TimeseriesSchema as TimeseriesSchemaPy, DeviceSchema
as DeviceSchemaPy
from tsfile.schema import TableSchema as TableSchemaPy
+from tsfile.schema import TimeseriesSchema as TimeseriesSchemaPy, DeviceSchema
as DeviceSchemaPy
from tsfile.tablet import Tablet as TabletPy
+from .tsfile_cpp cimport *
+from .tsfile_py_cpp cimport *
+
+#cython: language_level=3
cdef class TsFileWriterPy:
cdef TsFileWriter writer
- def __init__(self, pathname:str, memory_threshold:int = 128 * 1024 * 1024):
+ def __init__(self, pathname: str, memory_threshold: int = 128 * 1024 *
1024):
self.writer = tsfile_writer_new_c(pathname, memory_threshold)
def register_timeseries(self, device_name : str, timeseries_schema :
TimeseriesSchemaPy):
@@ -38,7 +38,7 @@ cdef class TsFileWriterPy:
device_name: device name of the timeseries
timeseries_schema: measurement's name/datatype/encoding/compressor
"""
- cdef TimeseriesSchema* c_schema =
to_c_timeseries_schema(timeseries_schema)
+ cdef TimeseriesSchema * c_schema =
to_c_timeseries_schema(timeseries_schema)
cdef ErrorCode errno
try:
errno = tsfile_writer_register_timeseries_py_cpp(self.writer,
device_name, c_schema)
@@ -51,7 +51,7 @@ cdef class TsFileWriterPy:
Register a device with tsfile writer.
device_schema: the device definition, including device_name, some
measurements' schema.
"""
- cdef DeviceSchema* device_schema_c = to_c_device_schema(device_schema)
+ cdef DeviceSchema * device_schema_c = to_c_device_schema(device_schema)
cdef ErrorCode errno
try:
errno = tsfile_writer_register_device_py_cpp(self.writer,
device_schema_c)
@@ -64,7 +64,7 @@ cdef class TsFileWriterPy:
Register a table with tsfile writer.
table_schema: the table definition, include table_name, columns'
schema.
"""
- cdef TableSchema* c_schema = to_c_table_schema(table_schema)
+ cdef TableSchema * c_schema = to_c_table_schema(table_schema)
cdef ErrorCode errno
try:
errno = tsfile_writer_register_table_py_cpp(self.writer, c_schema)
@@ -86,6 +86,15 @@ cdef class TsFileWriterPy:
finally:
free_c_tablet(ctablet)
+ def write_dataframe(self, target_table: str, dataframe: pandas.DataFrame,
tableschema: TableSchemaPy):
+ """
+ Write a pandas DataFrame into a table with tsfile writer.
+ target_table: name of the table passed to dataframe_to_c_tablet.
+ dataframe: the pandas DataFrame converted into a C tablet.
+ tableschema: table schema passed to dataframe_to_c_tablet for the conversion.
+ The C tablet is freed in the finally block whether or not the write succeeds;
+ the returned error code is passed to check_error.
+ """
+ cdef Tablet ctablet = dataframe_to_c_tablet(target_table, dataframe,
tableschema)
+ cdef ErrorCode errno
+ try:
+ errno = _tsfile_writer_write_table(self.writer, ctablet)
+ check_error(errno)
+ finally:
+ free_c_tablet(ctablet)
+
def write_row_record(self, record : RowRecord):
"""
Write a record into tsfile with tsfile writer.
@@ -143,4 +152,3 @@ cdef class TsFileWriterPy:
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
-
diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py
index d27a0fae..6044ddbb 100644
--- a/python/tsfile/utils.py
+++ b/python/tsfile/utils.py
@@ -20,9 +20,12 @@ from typing import Optional
import numpy as np
import pandas as pd
+from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype
+from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType
from tsfile.exceptions import TableNotExistError, ColumnNotExistError
from tsfile.tsfile_reader import TsFileReaderPy
+from tsfile.tsfile_table_writer import TsFileTableWriter,
infer_object_column_type, validate_dataframe_for_tsfile
def to_dataframe(file_path: str,
@@ -96,29 +99,45 @@ def to_dataframe(file_path: str,
_start_time = start_time if start_time is not None else
np.iinfo(np.int64).min
_end_time = end_time if end_time is not None else
np.iinfo(np.int64).max
+ ## Time column handling (table model):
+ ## 1. Request has no column list (query all):
+ ## 1.1 TsFile has a time column in schema: query only non-time
columns; then rename
+ ## the first column of the returned DataFrame to the schema
time column name.
+ ## 1.2 TsFile has no time column in schema: query as-is; first
column is "time".
+ ## 2. Request has a column list but no time column:
+ ## 2.1 TsFile has a time column in schema: query with requested
columns; rename the
+ ## first column to the schema time column name.
+ ## 2.2 TsFile has no time column in schema: first column stays
"time"; no rename.
+ ## 3. Request has a column list including the time column:
+ ## 3.1 Query with requested columns (including time); do not rename
the first column.
with TsFileReaderPy(file_path) as reader:
total_rows = 0
table_schema = reader.get_all_table_schemas()
is_tree_model = len(table_schema) == 0
-
+ time_column = None
if is_tree_model:
if _column_names is None:
print("columns name is None, return all columns")
else:
if _table_name is None:
- _table_name, columns = next(iter(table_schema.items()))
+ _table_name, table_schema =
next(iter(table_schema.items()))
else:
_table_name = _table_name.lower()
if _table_name.lower() not in table_schema:
raise TableNotExistError(_table_name)
- columns = table_schema[_table_name]
+ table_schema = table_schema[_table_name]
- column_names_in_file = columns.get_column_names()
+ column_names_in_file = []
+ for column in table_schema.get_columns():
+ if column.get_category() == ColumnCategory.TIME:
+ time_column = column.get_column_name()
+ else:
+ column_names_in_file.append(column.get_column_name())
if _column_names is not None:
for column in _column_names:
- if column.lower() not in column_names_in_file:
+ if column.lower() not in column_names_in_file and
column.lower() != time_column :
raise ColumnNotExistError(column)
else:
_column_names = column_names_in_file
@@ -133,18 +152,21 @@ def to_dataframe(file_path: str,
with query_result as result:
while result.next():
if max_row_num is None:
- df = result.read_data_frame()
+ dataframe = result.read_data_frame()
elif is_iterator:
- df = result.read_data_frame(max_row_num)
+ dataframe = result.read_data_frame(max_row_num)
else:
remaining_rows = max_row_num - total_rows
if remaining_rows <= 0:
break
- df = result.read_data_frame(remaining_rows)
- if df is None or df.empty:
+ dataframe = result.read_data_frame(remaining_rows)
+ if dataframe is None or dataframe.empty:
continue
- total_rows += len(df)
- yield df
+ total_rows += len(dataframe)
+ if time_column is not None:
+ if _column_names is None or time_column.lower() not in
[c.lower() for c in _column_names]:
+ dataframe =
dataframe.rename(columns={dataframe.columns[0]: time_column})
+ yield dataframe
if (not is_iterator) and max_row_num is not None and
total_rows >= max_row_num:
break
@@ -159,3 +181,98 @@ def to_dataframe(file_path: str,
return df
else:
return pd.DataFrame()
+
+
+def dataframe_to_tsfile(dataframe: pd.DataFrame,
+                        file_path: str,
+                        table_name: Optional[str] = None,
+                        time_column: Optional[str] = None,
+                        tag_column: Optional[list[str]] = None,
+                        ):
+    """
+    Write a pandas DataFrame to a TsFile by inferring the table schema from the DataFrame.
+
+    Column names are lower-cased first; the table schema is then inferred from the
+    column names and pandas dtypes, and the data is written through TsFileTableWriter.
+
+    Parameters
+    ----------
+    dataframe : pd.DataFrame
+        The pandas DataFrame to write to TsFile.
+        - If a 'time' column (case-insensitive) exists, it will be used as the time column.
+        - Otherwise no TIME column is declared and the DataFrame index is expected to supply the timestamps.
+        - All other columns will be treated as data columns.
+
+    file_path : str
+        Path to the TsFile to write. Will be created if it doesn't exist.
+
+    table_name : Optional[str], default None
+        Name of the table. If None or empty, defaults to "default_table".
+
+    time_column : Optional[str], default None
+        Name of the time column (matched case-insensitively). If None, a column named 'time' is used
+        when present; otherwise no TIME column is declared (see `dataframe`).
+
+    tag_column : Optional[list[str]], default None
+        List of column names to be treated as TAG columns. All other columns will be FIELD columns.
+        If None, all columns are treated as FIELD columns.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    ValueError
+        If `time_column` or an entry of `tag_column` is not a column of the DataFrame, or if
+        there is no data column besides the time column. validate_dataframe_for_tsfile may
+        raise as well.
+    TypeError
+        If the time column does not have an integer dtype.
+    """
+    validate_dataframe_for_tsfile(dataframe)
+    # Lower-case the column names so every lookup below is case-insensitive.
+    df = dataframe.rename(columns=str.lower)
+
+    if not table_name:
+        table_name = "default_table"
+
+    # Resolve the time column: an explicit argument wins, then a column named 'time'.
+    time_col_name = None
+    if time_column is not None:
+        time_col_name = time_column.lower()
+        if time_col_name not in df.columns:
+            raise ValueError(f"Time column '{time_column}' not found in DataFrame")
+    elif 'time' in df.columns:
+        time_col_name = 'time'
+
+    for tag_col in tag_column or []:
+        if tag_col.lower() not in df.columns:
+            raise ValueError(f"Tag column '{tag_col}' not found in DataFrame")
+    tag_columns_lower = {t.lower() for t in (tag_column or [])}
+
+    if time_col_name is not None and not is_integer_dtype(df[time_col_name].dtype):
+        raise TypeError(
+            f"Time column '{time_col_name}' must be integer type (int64 or int), got {df[time_col_name].dtype}")
+
+    # Count only non-time columns: a lone time column must not satisfy this check.
+    data_columns = [col for col in df.columns if col != time_col_name]
+    if not data_columns:
+        raise ValueError("DataFrame must have at least one data column besides the time column")
+
+    column_schemas = []
+    if time_col_name is not None:
+        column_schemas.append(ColumnSchema(time_col_name, TSDataType.TIMESTAMP, ColumnCategory.TIME))
+    for col in data_columns:
+        col_dtype = df[col].dtype
+        if is_object_dtype(col_dtype):
+            ts_data_type = infer_object_column_type(df[col])
+        else:
+            ts_data_type = TSDataType.from_pandas_datatype(col_dtype)
+        category = ColumnCategory.TAG if col in tag_columns_lower else ColumnCategory.FIELD
+        column_schemas.append(ColumnSchema(col, ts_data_type, category))
+
+    table_schema = TableSchema(table_name, column_schemas)
+
+    with TsFileTableWriter(file_path, table_schema) as writer:
+        writer.write_dataframe(df)