This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 05a8c3e4edd branch-4.1: [fix](hive) Preserve empty text records #64671 (#64837)
05a8c3e4edd is described below

commit 05a8c3e4edd6ed9c7d189bfd5e6a6809dad761ce
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Jun 25 18:23:02 2026 +0800

    branch-4.1: [fix](hive) Preserve empty text records #64671 (#64837)
    
    Cherry-picked from #64671
    
    Co-authored-by: Gabriel <[email protected]>
---
 be/src/format/csv/csv_reader.cpp                   | 18 ++++++++++++----
 be/src/format/csv/csv_reader.h                     |  1 +
 be/src/format/text/text_reader.cpp                 |  6 ++++++
 be/src/format/text/text_reader.h                   |  1 +
 .../hive/scripts/data/regression/crdmm_data/run.sh |  7 +++---
 .../hive/scripts/data/regression/serde_prop/run.sh | 25 ++++++++++++++++++++++
 .../regression/serde_prop/some_serde_table.hql     | 25 +++++++++++++++++++++-
 .../hive/test_hive_serde_prop.out                  | 24 +++++++++++++++++++++
 .../hive/test_hive_serde_prop.groovy               | 20 +++++++++++++++++
 9 files changed, 118 insertions(+), 9 deletions(-)

diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp
index e0738efe954..7bc340e7f21 100644
--- a/be/src/format/csv/csv_reader.cpp
+++ b/be/src/format/csv/csv_reader.cpp
@@ -362,8 +362,10 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
                 continue;
             }
             if (size == 0) {
-                if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) {
-                    ++rows;
+                if (!_line_reader_eof) {
+                    if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) {
+                        ++rows;
+                    }
                 }
                 // Read empty line, continue
                 continue;
@@ -400,8 +402,16 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
                 continue;
             }
             if (size == 0) {
-                if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) {
-                    RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+                if (!_line_reader_eof) {
+                    if (_empty_line_as_record()) {
+                        Slice empty_line("", 0);
+                        RETURN_IF_ERROR(_validate_line(empty_line, &success));
+                        if (success) {
+                            RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows));
+                        }
+                    } else if (_state->is_read_csv_empty_line_as_null()) {
+                        RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+                    }
                 }
                 // Read empty line, continue
                 continue;
diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h
index 25cbaba31a1..3e5579d71de 100644
--- a/be/src/format/csv/csv_reader.h
+++ b/be/src/format/csv/csv_reader.h
@@ -201,6 +201,7 @@ protected:
     virtual Status _create_line_reader();
     virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice);
     virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice);
+    virtual bool _empty_line_as_record() const { return false; }
     // check the utf8 encoding of a line.
     // return error status to stop processing.
     // If return Status::OK but "success" is false, which means this is load request
diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp
index e52da7f3249..388cd56a0dd 100644
--- a/be/src/format/text/text_reader.cpp
+++ b/be/src/format/text/text_reader.cpp
@@ -164,6 +164,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) {
     return Status::OK();
 }
 
+bool TextReader::_empty_line_as_record() const {
+    // Hive TEXTFILE treats an empty physical line as a record. The splitter maps it
+    // to one empty field and missing trailing fields are filled with null_format.
+    return true;
+}
+
 Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) {
     auto& null_column = assert_cast<ColumnNullable&>(column);
     if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h
index 22073c130a8..60b0fb2f8b5 100644
--- a/be/src/format/text/text_reader.h
+++ b/be/src/format/text/text_reader.h
@@ -67,6 +67,7 @@ private:
     Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override;
     Status _validate_line(const Slice& line, bool* success) override;
     Status _deserialize_nullable_string(IColumn& column, Slice& slice) override;
+    bool _empty_line_as_record() const override;
 };
 
 #include "common/compile_check_end.h"
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
index f650ead89d7..5197e8b9276 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
@@ -4,10 +4,9 @@ set -x
 CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 
 
-hadoop fs -mkdir -p /user/doris/suites/regression/
-hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/
-
 # create table
 hive -f "${CUR_DIR}"/create_table.hql
 
-
+hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true
+hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data
+hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
index ef6538563d5..c4f8e7c5d96 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
@@ -3,6 +3,31 @@ set -x
 
 CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 
+SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)"
+DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)"
+trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT
+cat > "${SINGLE_COL_DATA_FILE}" <<'EOF'
+null_value
+null_value
+non-null
+
+\N
+EOF
+
+{
+    printf 'a\tb\n'
+    printf '\n'
+    printf '\\N\t\\N\n'
+} > "${DEFAULT_MULTI_COL_DATA_FILE}"
+
+hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true
+hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text
+hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000
+
+hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true
+hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
+hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000
+
 # create table
 hive -f "${CUR_DIR}"/some_serde_table.hql
 
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index 81bdf03da8e..02393bd5d3b 100644
--- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -199,4 +199,27 @@ STORED AS TEXTFILE;
 INSERT INTO TABLE test_empty_null_defined_text VALUES
   (1, 'Alice'),
   (2, NULL),
-  (3, '');
\ No newline at end of file
+  (3, '');
+
+drop table if exists test_single_col_null_format_text;
+
+create external table test_single_col_null_format_text (
+  name STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+WITH SERDEPROPERTIES (
+  "serialization.null.format"="null_value"
+)
+STORED AS TEXTFILE
+LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text';
+
+drop table if exists test_default_null_format_multi_col_text;
+
+create external table test_default_null_format_multi_col_text (
+  c1 STRING,
+  c2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text';
diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index cda92c0519a..36866613260 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -73,6 +73,18 @@ b    2.2
 
 -- !test_empty_null_defined_text3 --
 
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5      3       2       1       1       1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3      2       1       1       1       2       0       1
+
 -- !1 --
 a      1.1
 b      2.2
@@ -147,3 +159,15 @@ b  2.2
 
 -- !test_empty_null_defined_text3 --
 
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5      3       2       1       1       1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3      2       1       1       1       2       0       1
+
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index d4bb051214d..9ca9ad3b6b4 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
         qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;"""
         qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;"""
         qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;"""
+
+        qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;"""
+        qt_test_single_col_null_format_text_values """
+            select count(*), count(name), count(case when name is null then 1 end),
+                   count(case when name = '' then 1 end),
+                   count(case when name = 'non-null' then 1 end),
+                   count(case when name is not null and name not in ('', 'non-null') then 1 end)
+            from ${catalog_name}.regression.test_single_col_null_format_text;
+        """
+
+        qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;"""
+        qt_test_default_null_format_multi_col_text_values """
+            select count(*), count(c1), count(c2),
+                   count(case when c1 is null then 1 end),
+                   count(case when c1 = '' then 1 end),
+                   count(case when c2 is null then 1 end),
+                   count(case when c2 = '' then 1 end),
+                   count(case when c1 = 'a' and c2 = 'b' then 1 end)
+            from ${catalog_name}.regression.test_default_null_format_multi_col_text;
+        """
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to