lzyy2024 commented on code in PR #47307:
URL: https://github.com/apache/doris/pull/47307#discussion_r1924960735
##########
be/src/vec/functions/function_compress.cpp:
##########
@@ -0,0 +1,299 @@
+#include <glog/logging.h>
+
+#include <cctype>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "common/status.h"
+#include "util/block_compression.h"
+#include "util/faststring.h"
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/columns/columns_number.h"
+#include "vec/common/assert_cast.h"
+#include "vec/core/block.h"
+#include "vec/core/column_numbers.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/function.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris {
+class FunctionContext;
+} // namespace doris
+
+namespace doris::vectorized {
+
+class FunctionCompress : public IFunction {
+public:
+ static constexpr auto name = "compress";
+ static FunctionPtr create() { return std::make_shared<FunctionCompress>();
}
+
+ String get_name() const override { return name; }
+
+ size_t get_number_of_arguments() const override { return 1; }
+
+ DataTypePtr get_return_type_impl(const DataTypes& arguments) const
override {
+ return make_nullable(std::make_shared<DataTypeString>());
+ }
+
+ Status execute_impl(FunctionContext* context, Block& block, const
ColumnNumbers& arguments,
+ uint32_t result, size_t input_rows_count) const
override {
+ // LOG(INFO) << "Executing FunctionCompress with " << input_rows_count
+ // << " rows."; // Log the number of rows being processed
+
+ // Get the compression algorithm object
+ BlockCompressionCodec* compression_codec;
+
RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
+ &compression_codec));
+
+ const auto& arg_column =
+ assert_cast<const
ColumnString&>(*block.get_by_position(arguments[0]).column);
+ auto result_column = ColumnString::create();
+
+ auto& col_data = result_column->get_chars();
+ auto& col_offset = result_column->get_offsets();
+ col_offset.resize(input_rows_count);
+
+ auto null_column = ColumnUInt8::create(input_rows_count);
+ auto& null_map = null_column->get_data();
+
+ faststring compressed_str;
+ Slice data;
+ for (int row = 0; row < input_rows_count; row++) {
+ null_map[row] = false;
+ const auto& str = arg_column.get_data_at(row);
+ data = Slice(str.data, str.size);
+
+ // Print the original string (before compression)
+ // LOG(INFO) << "Original string at row " << row << ": "
+ // << std::string(str.data, str.size);
+
+ auto st = compression_codec->compress(data, &compressed_str);
+
+ if (!st.ok()) {
+ // LOG(INFO) << "Compression failed at row " << row
+ // << ", skipping this row."; // Log failure
+ col_offset[row] = col_offset[row - 1];
+ null_map[row] = true;
+ continue;
+ }
+
+ size_t idx = col_data.size();
+ if (!str.size) { // null -> 0x
+ col_data.resize(col_data.size() + 2);
+ col_data[idx] = '0', col_data[idx + 1] = 'x';
+ col_offset[row] = col_offset[row - 1] + 2;
+ continue;
+ }
+
+ // first ten digits represent the length of the uncompressed string
+ int value = (int)str.size;
+ col_data.resize(col_data.size() + 10);
+ col_data[idx] = '0', col_data[idx + 1] = 'x';
+ for (int i = 0; i < 4; i++) {
+ unsigned char byte = (value >> (i * 8)) & 0xFF;
+ col_data[idx + 2 + i * 2] = "0123456789ABCDEF"[byte >> 4];
// 高4位
+ col_data[idx + 3 + i * 2] = "0123456789ABCDEF"[byte & 0x0F];
// 低4位
+ }
+ idx += 10;
+
+ col_data.resize(col_data.size() + 2 * compressed_str.size());
+ // memcpy(col_data.data() + col_data.size(),
compressed_str.data(), compressed_str.size());
+
+ unsigned char* src = compressed_str.data();
+ {
+ auto transform = [](char ch) -> unsigned char {
+ char x;
+ if (ch < 10) {
+ x = ch + '0';
+ } else {
+ x = ch - 10 + 'A';
+ }
+ // LOG(INFO) << "transform" << (int)x << "->" << x;
+ return x;
+ };
+ for (int i = 0; i < compressed_str.size(); i++) {
+ col_data[idx] = transform(((*src) >> 4) & 0x0F);
+ col_data[idx + 1] = transform(*src & 0x0F);
+ LOG(INFO) << (unsigned int)(*src) << " -> " << (unsigned
int)col_data[idx]
+ << " and " << (unsigned int)col_data[idx + 1];
+ idx += 2;
+ src++;
+ }
+
+ // Print the compressed string (after compression)
+ // LOG(INFO) << "Compressed string at row " << row << ": "
+ // << std::string(reinterpret_cast<const
char*>(col_data.data()));
+ col_offset[row] = col_offset[row - 1] + 10 +
compressed_str.size() * 2;
Review Comment:
The first ten characters of the compressed value are "0x" followed by eight
hexadecimal digits that encode the length of the uncompressed string; after
that, each byte of the compressed data is written as two hexadecimal digits.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]