This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 05a8c3e4edd branch-4.1: [fix](hive) Preserve empty text records #64671 (#64837)
05a8c3e4edd is described below
commit 05a8c3e4edd6ed9c7d189bfd5e6a6809dad761ce
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Jun 25 18:23:02 2026 +0800
branch-4.1: [fix](hive) Preserve empty text records #64671 (#64837)
Cherry-picked from #64671
Co-authored-by: Gabriel <[email protected]>
---
be/src/format/csv/csv_reader.cpp | 18 ++++++++++++----
be/src/format/csv/csv_reader.h | 1 +
be/src/format/text/text_reader.cpp | 6 ++++++
be/src/format/text/text_reader.h | 1 +
.../hive/scripts/data/regression/crdmm_data/run.sh | 7 +++---
.../hive/scripts/data/regression/serde_prop/run.sh | 25 ++++++++++++++++++++++
.../regression/serde_prop/some_serde_table.hql | 25 +++++++++++++++++++++-
.../hive/test_hive_serde_prop.out | 24 +++++++++++++++++++++
.../hive/test_hive_serde_prop.groovy | 20 +++++++++++++++++
9 files changed, 118 insertions(+), 9 deletions(-)
diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp
index e0738efe954..7bc340e7f21 100644
--- a/be/src/format/csv/csv_reader.cpp
+++ b/be/src/format/csv/csv_reader.cpp
@@ -362,8 +362,10 @@ Status CsvReader::get_next_block(Block* block, size_t*
read_rows, bool* eof) {
continue;
}
if (size == 0) {
- if (!_line_reader_eof &&
_state->is_read_csv_empty_line_as_null()) {
- ++rows;
+ if (!_line_reader_eof) {
+ if (_empty_line_as_record() ||
_state->is_read_csv_empty_line_as_null()) {
+ ++rows;
+ }
}
// Read empty line, continue
continue;
@@ -400,8 +402,16 @@ Status CsvReader::get_next_block(Block* block, size_t*
read_rows, bool* eof) {
continue;
}
if (size == 0) {
- if (!_line_reader_eof &&
_state->is_read_csv_empty_line_as_null()) {
- RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+ if (!_line_reader_eof) {
+ if (_empty_line_as_record()) {
+ Slice empty_line("", 0);
+ RETURN_IF_ERROR(_validate_line(empty_line, &success));
+ if (success) {
+ RETURN_IF_ERROR(_fill_dest_columns(empty_line,
columns, &rows));
+ }
+ } else if (_state->is_read_csv_empty_line_as_null()) {
+ RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+ }
}
// Read empty line, continue
continue;
diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h
index 25cbaba31a1..3e5579d71de 100644
--- a/be/src/format/csv/csv_reader.h
+++ b/be/src/format/csv/csv_reader.h
@@ -201,6 +201,7 @@ protected:
virtual Status _create_line_reader();
virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn&
column, Slice& slice);
virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice);
+ virtual bool _empty_line_as_record() const { return false; }
// check the utf8 encoding of a line.
// return error status to stop processing.
// If return Status::OK but "success" is false, which means this is load request
diff --git a/be/src/format/text/text_reader.cpp
b/be/src/format/text/text_reader.cpp
index e52da7f3249..388cd56a0dd 100644
--- a/be/src/format/text/text_reader.cpp
+++ b/be/src/format/text/text_reader.cpp
@@ -164,6 +164,12 @@ Status TextReader::_validate_line(const Slice& line, bool*
success) {
return Status::OK();
}
+bool TextReader::_empty_line_as_record() const {
+ // Hive TEXTFILE treats an empty physical line as a record. The splitter maps it
+ // to one empty field and missing trailing fields are filled with null_format.
+ return true;
+}
+
Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice)
{
auto& null_column = assert_cast<ColumnNullable&>(column);
if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h
index 22073c130a8..60b0fb2f8b5 100644
--- a/be/src/format/text/text_reader.h
+++ b/be/src/format/text/text_reader.h
@@ -67,6 +67,7 @@ private:
Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column,
Slice& slice) override;
Status _validate_line(const Slice& line, bool* success) override;
Status _deserialize_nullable_string(IColumn& column, Slice& slice)
override;
+ bool _empty_line_as_record() const override;
};
#include "common/compile_check_end.h"
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
index f650ead89d7..5197e8b9276 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
@@ -4,10 +4,9 @@ set -x
CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-hadoop fs -mkdir -p /user/doris/suites/regression/
-hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/
-
# create table
hive -f "${CUR_DIR}"/create_table.hql
-
+hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true
+hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data
+hadoop fs -put "${CUR_DIR}"/data/crdmm_data/*
/user/doris/suites/regression/crdmm_data/
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
index ef6538563d5..c4f8e7c5d96 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
@@ -3,6 +3,31 @@ set -x
CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)"
+DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)"
+trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT
+cat > "${SINGLE_COL_DATA_FILE}" <<'EOF'
+null_value
+null_value
+non-null
+
+\N
+EOF
+
+{
+ printf 'a\tb\n'
+ printf '\n'
+ printf '\\N\t\\N\n'
+} > "${DEFAULT_MULTI_COL_DATA_FILE}"
+
+hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true
+hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text
+hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000
+
+hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true
+hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
+hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000
+
# create table
hive -f "${CUR_DIR}"/some_serde_table.hql
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index 81bdf03da8e..02393bd5d3b 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -199,4 +199,27 @@ STORED AS TEXTFILE;
INSERT INTO TABLE test_empty_null_defined_text VALUES
(1, 'Alice'),
(2, NULL),
- (3, '');
\ No newline at end of file
+ (3, '');
+
+drop table if exists test_single_col_null_format_text;
+
+create external table test_single_col_null_format_text (
+ name STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+WITH SERDEPROPERTIES (
+ "serialization.null.format"="null_value"
+)
+STORED AS TEXTFILE
+LOCATION
'/user/doris/suites/regression/serde_prop/test_single_col_null_format_text';
+
+drop table if exists test_default_null_format_multi_col_text;
+
+create external table test_default_null_format_multi_col_text (
+ c1 STRING,
+ c2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION
'/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text';
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index cda92c0519a..36866613260 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -73,6 +73,18 @@ b 2.2
-- !test_empty_null_defined_text3 --
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5 3 2 1 1 1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3 2 1 1 1 2 0 1
+
-- !1 --
a 1.1
b 2.2
@@ -147,3 +159,15 @@ b 2.2
-- !test_empty_null_defined_text3 --
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5 3 2 1 1 1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3 2 1 1 1 2 0 1
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index d4bb051214d..9ca9ad3b6b4 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -63,6 +63,26 @@ suite("test_hive_serde_prop",
"external_docker,hive,external_docker_hive,p0,exte
qt_test_empty_null_defined_text """select * from
${catalog_name}.regression.test_empty_null_defined_text order by id;"""
qt_test_empty_null_defined_text2 """select * from
${catalog_name}.regression.test_empty_null_defined_text where name is null
order by id;"""
qt_test_empty_null_defined_text3 """select * from
${catalog_name}.regression.test_empty_null_defined_text where name = '' order
by id;"""
+
+ qt_test_single_col_null_format_text_count """select count(*) from
${catalog_name}.regression.test_single_col_null_format_text;"""
+ qt_test_single_col_null_format_text_values """
+ select count(*), count(name), count(case when name is null then 1
end),
+ count(case when name = '' then 1 end),
+ count(case when name = 'non-null' then 1 end),
+ count(case when name is not null and name not in ('',
'non-null') then 1 end)
+ from ${catalog_name}.regression.test_single_col_null_format_text;
+ """
+
+ qt_test_default_null_format_multi_col_text_count """select count(*)
from ${catalog_name}.regression.test_default_null_format_multi_col_text;"""
+ qt_test_default_null_format_multi_col_text_values """
+ select count(*), count(c1), count(c2),
+ count(case when c1 is null then 1 end),
+ count(case when c1 = '' then 1 end),
+ count(case when c2 is null then 1 end),
+ count(case when c2 = '' then 1 end),
+ count(case when c1 = 'a' and c2 = 'b' then 1 end)
+ from
${catalog_name}.regression.test_default_null_format_multi_col_text;
+ """
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]