lzyy2024 commented on code in PR #47307: URL: https://github.com/apache/doris/pull/47307#discussion_r1924960735
########## be/src/vec/functions/function_compress.cpp: ########## @@ -0,0 +1,299 @@ +#include <glog/logging.h> + +#include <cctype> +#include <cstddef> +#include <cstring> +#include <memory> +#include <string> +#include <utility> + +#include "common/status.h" +#include "util/block_compression.h" +#include "util/faststring.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/functions/function.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris { +class FunctionContext; +} // namespace doris + +namespace doris::vectorized { + +class FunctionCompress : public IFunction { +public: + static constexpr auto name = "compress"; + static FunctionPtr create() { return std::make_shared<FunctionCompress>(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 1; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeString>()); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + uint32_t result, size_t input_rows_count) const override { + // LOG(INFO) << "Executing FunctionCompress with " << input_rows_count + // << " rows."; // Log the number of rows being processed + + // Get the compression algorithm object + BlockCompressionCodec* compression_codec; + 
RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB, + &compression_codec)); + + const auto& arg_column = + assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column); + auto result_column = ColumnString::create(); + + auto& col_data = result_column->get_chars(); + auto& col_offset = result_column->get_offsets(); + col_offset.resize(input_rows_count); + + auto null_column = ColumnUInt8::create(input_rows_count); + auto& null_map = null_column->get_data(); + + faststring compressed_str; + Slice data; + for (int row = 0; row < input_rows_count; row++) { + null_map[row] = false; + const auto& str = arg_column.get_data_at(row); + data = Slice(str.data, str.size); + + // Print the original string (before compression) + // LOG(INFO) << "Original string at row " << row << ": " + // << std::string(str.data, str.size); + + auto st = compression_codec->compress(data, &compressed_str); + + if (!st.ok()) { + // LOG(INFO) << "Compression failed at row " << row + // << ", skipping this row."; // Log failure + col_offset[row] = col_offset[row - 1]; + null_map[row] = true; + continue; + } + + size_t idx = col_data.size(); + if (!str.size) { // null -> 0x + col_data.resize(col_data.size() + 2); + col_data[idx] = '0', col_data[idx + 1] = 'x'; + col_offset[row] = col_offset[row - 1] + 2; + continue; + } + + // first ten digits represent the length of the uncompressed string + int value = (int)str.size; + col_data.resize(col_data.size() + 10); + col_data[idx] = '0', col_data[idx + 1] = 'x'; + for (int i = 0; i < 4; i++) { + unsigned char byte = (value >> (i * 8)) & 0xFF; + col_data[idx + 2 + i * 2] = "0123456789ABCDEF"[byte >> 4]; // 高4位 + col_data[idx + 3 + i * 2] = "0123456789ABCDEF"[byte & 0x0F]; // 低4位 + } + idx += 10; + + col_data.resize(col_data.size() + 2 * compressed_str.size()); + // memcpy(col_data.data() + col_data.size(), compressed_str.data(), compressed_str.size()); + + unsigned char* src = compressed_str.data(); + { + auto 
transform = [](char ch) -> unsigned char { + char x; + if (ch < 10) { + x = ch + '0'; + } else { + x = ch - 10 + 'A'; + } + // LOG(INFO) << "transform" << (int)x << "->" << x; + return x; + }; + for (int i = 0; i < compressed_str.size(); i++) { + col_data[idx] = transform(((*src) >> 4) & 0x0F); + col_data[idx + 1] = transform(*src & 0x0F); + LOG(INFO) << (unsigned int)(*src) << " -> " << (unsigned int)col_data[idx] + << " and " << (unsigned int)col_data[idx + 1]; + idx += 2; + src++; + } + + // Print the compressed string (after compression) + // LOG(INFO) << "Compressed string at row " << row << ": " + // << std::string(reinterpret_cast<const char*>(col_data.data())); + col_offset[row] = col_offset[row - 1] + 10 + compressed_str.size() * 2; Review Comment: The first ten characters of the compressed value are "0x" followed by eight hexadecimal digits that encode the length of the uncompressed string; after that, each byte of the compressed data is written as two hexadecimal digits. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org