This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new cdd32d9582d [enhance](hive) support reading hive table with 
OpenCSVSerde #42257 (#42940)
cdd32d9582d is described below

commit cdd32d9582d80e8ce84ff88feeb5c689cce6619a
Author: Mingyu Chen (Rayner) <morning...@163.com>
AuthorDate: Thu Oct 31 11:12:07 2024 +0800

    [enhance](hive) support reading hive table with OpenCSVSerde #42257 (#42940)
    
    cherry pick from #42257
    
    Co-authored-by: Socrates <suxiaogang...@icloud.com>
---
 .../regression/serde_prop/some_serde_table.hql     | 64 ++++++++++++++++++++++
 .../doris/datasource/hive/HiveProperties.java      | 50 +++++++++--------
 .../doris/datasource/hive/source/HiveScanNode.java | 44 ++++++++++-----
 .../hive/test_hive_serde_prop.out                  | 23 ++++++++
 .../hive/test_hive_serde_prop.groovy               |  4 ++
 5 files changed, 148 insertions(+), 37 deletions(-)

diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index 4de85bc19f0..0368547f8be 100644
--- 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
 insert into serde_test5 values(1, "abc"),(2, "def");
 insert into serde_test6 values(1, "abc"),(2, "def");
 insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");
+
+CREATE TABLE test_open_csv_default_prop (
+    id INT,
+    name STRING,
+    age INT,
+    salary DOUBLE,
+    is_active BOOLEAN,
+    hire_date DATE,
+    last_login TIMESTAMP,
+    rating FLOAT,
+    description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+STORED AS TEXTFILE;
+
+CREATE TABLE test_open_csv_standard_prop (
+    id INT,
+    name STRING,
+    age INT,
+    salary DOUBLE,
+    is_active BOOLEAN,
+    hire_date DATE,
+    last_login TIMESTAMP,
+    rating FLOAT,
+    description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+    "separatorChar" = ",",
+    "quoteChar"     = "\"",
+    "escapeChar"    = "\\"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE test_open_csv_custom_prop (
+    id INT,
+    name STRING,
+    age INT,
+    salary DOUBLE,
+    is_active BOOLEAN,
+    hire_date DATE,
+    last_login TIMESTAMP,
+    rating FLOAT,
+    description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+    "separatorChar" = "\t",
+    "quoteChar"     = "\'",
+    "escapeChar"    = "|"
+)
+STORED AS TEXTFILE;
+
+INSERT INTO TABLE test_open_csv_default_prop VALUES 
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project 
Manager\"');
+
+INSERT INTO TABLE test_open_csv_standard_prop VALUES 
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project 
Manager\"');
+
+INSERT INTO TABLE test_open_csv_custom_prop VALUES 
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project 
Manager\"');
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index 5ded87e0d23..74f3dcc1a9d 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -19,6 +19,7 @@ package org.apache.doris.datasource.hive;
 
 import com.google.common.collect.ImmutableSet;
 import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.serde2.OpenCSVSerde;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -27,15 +28,12 @@ import java.util.Set;
 
 public class HiveProperties {
     public static final String PROP_FIELD_DELIMITER = "field.delim";
-    public static final String PROP_SEPARATOR_CHAR = "separatorChar";
     public static final String PROP_SERIALIZATION_FORMAT = 
"serialization.format";
     public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
 
     public static final String PROP_LINE_DELIMITER = "line.delim";
     public static final String DEFAULT_LINE_DELIMITER = "\n";
 
-    public static final String PROP_QUOTE_CHAR = "quoteChar";
-
     public static final String PROP_COLLECTION_DELIMITER_HIVE2 = 
"colelction.delim";
     public static final String PROP_COLLECTION_DELIMITER_HIVE3 = 
"collection.delim";
     public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
@@ -49,6 +47,14 @@ public class HiveProperties {
     public static final String PROP_NULL_FORMAT = "serialization.null.format";
     public static final String DEFAULT_NULL_FORMAT = "\\N";
 
+    // The following properties are used for OpenCsvSerde.
+    public static final String PROP_SEPARATOR_CHAR = 
OpenCSVSerde.SEPARATORCHAR;
+    public static final String DEFAULT_SEPARATOR_CHAR = ",";
+    public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
+    public static final String DEFAULT_QUOTE_CHAR = "\"";
+    public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
+    public static final String DEFAULT_ESCAPE_CHAR = "\\";
+
     public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
             PROP_FIELD_DELIMITER,
             PROP_COLLECTION_DELIMITER_HIVE2,
@@ -59,37 +65,33 @@ public class HiveProperties {
             PROP_QUOTE_CHAR,
             PROP_MAP_KV_DELIMITER,
             PROP_ESCAPE_DELIMITER,
-            PROP_NULL_FORMAT
-    );
+            PROP_ESCAPE_CHAR,
+            PROP_NULL_FORMAT);
 
     public static String getFieldDelimiter(Table table) {
         // This method is used for text format.
-        // If you need compatibility with csv format, please use 
`getColumnSeparator`.
         Optional<String> fieldDelim = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
         Optional<String> serFormat = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
         return 
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
                 DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
     }
 
-    public static String getColumnSeparator(Table table) {
-        Optional<String> fieldDelim = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
-        Optional<String> columnSeparator = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
-        Optional<String> serFormat = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
-        return 
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
-            DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
+    public static String getSeparatorChar(Table table) {
+        Optional<String> separatorChar = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
+        return HiveMetaStoreClientHelper.firstPresentOrDefault(
+                DEFAULT_SEPARATOR_CHAR, separatorChar);
     }
 
-
     public static String getLineDelimiter(Table table) {
         Optional<String> lineDelim = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
         return 
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
-            DEFAULT_LINE_DELIMITER, lineDelim));
+                DEFAULT_LINE_DELIMITER, lineDelim));
     }
 
     public static String getMapKvDelimiter(Table table) {
         Optional<String> mapkvDelim = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
         return 
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
-            DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
+                DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
     }
 
     public static String getCollectionDelimiter(Table table) {
@@ -101,14 +103,6 @@ public class HiveProperties {
                 DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, 
collectionDelimHive3));
     }
 
-    public static Optional<String> getQuoteChar(Table table) {
-        Map<String, String> serdeParams = 
table.getSd().getSerdeInfo().getParameters();
-        if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
-            return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
-        }
-        return Optional.empty();
-    }
-
     public static Optional<String> getEscapeDelimiter(Table table) {
         Optional<String> escapeDelim = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
         if (escapeDelim.isPresent()) {
@@ -127,6 +121,16 @@ public class HiveProperties {
         return 
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, 
nullFormat);
     }
 
+    public static String getQuoteChar(Table table) {
+        Optional<String> quoteChar = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR);
+        return 
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_QUOTE_CHAR, quoteChar);
+    }
+
+    public static String getEscapeChar(Table table) {
+        Optional<String> escapeChar = 
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR);
+        return 
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, 
escapeChar);
+    }
+
     // Set properties to table
     public static void setTableProperties(Table table, Map<String, String> 
properties) {
         HashMap<String, String> serdeProps = new HashMap<>();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index f17de4bfe0a..dbf1ea9cd9a 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -385,20 +385,36 @@ public class HiveScanNode extends FileQueryScanNode {
     protected TFileAttributes getFileAttributes() throws UserException {
         TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
         Table table = hmsTable.getRemoteTable();
-        // 1. set column separator
-        
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
-        // 2. set line delimiter
-        textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
-        // 3. set mapkv delimiter
-        textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
-        // 4. set collection delimiter
-        
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
-        // 5. set quote char
-        HiveProperties.getQuoteChar(table).ifPresent(d -> 
textParams.setEnclose(d.getBytes()[0]));
-        // 6. set escape delimiter
-        HiveProperties.getEscapeDelimiter(table).ifPresent(d -> 
textParams.setEscape(d.getBytes()[0]));
-        // 7. set null format
-        textParams.setNullFormat(HiveProperties.getNullFormat(table));
+        // TODO: separate hive text table and OpenCsv table
+        String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
+        if 
(serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+            // set properties of LazySimpleSerDe
+            // 1. set column separator
+            
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
+            // 2. set line delimiter
+            
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
+            // 3. set mapkv delimiter
+            
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
+            // 4. set collection delimiter
+            
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
+            // 5. set escape delimiter
+            HiveProperties.getEscapeDelimiter(table).ifPresent(d -> 
textParams.setEscape(d.getBytes()[0]));
+            // 6. set null format
+            textParams.setNullFormat(HiveProperties.getNullFormat(table));
+        } else if 
(serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
+            // set set properties of OpenCSVSerde
+            // 1. set column separator
+            
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
+            // 2. set line delimiter
+            
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
+            // 3. set enclose char
+            
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
+            // 4. set escape char
+            
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
+        } else {
+            throw new UserException(
+                    "unsupported hive table serde: " + serDeLib);
+        }
 
         TFileAttributes fileAttributes = new TFileAttributes();
         fileAttributes.setTextParams(textParams);
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index a527c7b687d..3d45e525ecf 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -39,6 +39,18 @@ b    2.2
 3      aaa
 4      "null"
 
+-- !test_open_csv_default_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
""Project Manager""
+
+-- !test_open_csv_standard_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
"Project Manager"
+
+-- !test_open_csv_custom_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
"Project Manager"
+
 -- !1 --
 a      1.1
 b      2.2
@@ -79,3 +91,14 @@ b    2.2
 3      aaa
 4      "null"
 
+-- !test_open_csv_default_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
""Project Manager""
+
+-- !test_open_csv_standard_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
"Project Manager"
+
+-- !test_open_csv_custom_prop --
+1      John Doe        28      50000.75        TRUE    2022-01-15      
2023-10-21 14:30:00     4.5     Senior Developer
+2      Jane,Smith                      FALSE   2020-05-20                      
"Project Manager"
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index d0c191f7c67..52cdd25eb07 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -51,6 +51,10 @@ suite("test_hive_serde_prop", 
"external_docker,hive,external_docker_hive,p0,exte
         hive_docker """truncate table regression.serde_test8;"""
         sql """insert into ${catalog_name}.regression.serde_test8 select * 
from ${catalog_name}.regression.serde_test7;"""
         qt_9 """select * from ${catalog_name}.regression.serde_test8 order by 
id;"""
+
+        qt_test_open_csv_default_prop """select * from 
${catalog_name}.regression.test_open_csv_default_prop order by id;"""
+        qt_test_open_csv_standard_prop """select * from 
${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
+        qt_test_open_csv_custom_prop """select * from 
${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to