This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new cdd32d9582d [enhance](hive) support reading hive table with OpenCSVSerde #42257 (#42940) cdd32d9582d is described below commit cdd32d9582d80e8ce84ff88feeb5c689cce6619a Author: Mingyu Chen (Rayner) <morning...@163.com> AuthorDate: Thu Oct 31 11:12:07 2024 +0800 [enhance](hive) support reading hive table with OpenCSVSerde #42257 (#42940) cherry pick from #42257 Co-authored-by: Socrates <suxiaogang...@icloud.com> --- .../regression/serde_prop/some_serde_table.hql | 64 ++++++++++++++++++++++ .../doris/datasource/hive/HiveProperties.java | 50 +++++++++-------- .../doris/datasource/hive/source/HiveScanNode.java | 44 ++++++++++----- .../hive/test_hive_serde_prop.out | 23 ++++++++ .../hive/test_hive_serde_prop.groovy | 4 ++ 5 files changed, 148 insertions(+), 37 deletions(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index 4de85bc19f0..0368547f8be 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def"); insert into serde_test5 values(1, "abc"),(2, "def"); insert into serde_test6 values(1, "abc"),(2, "def"); insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\""); + +CREATE TABLE test_open_csv_default_prop ( + id INT, + name STRING, + age INT, + salary DOUBLE, + is_active BOOLEAN, + hire_date DATE, + last_login TIMESTAMP, + rating FLOAT, + description STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +STORED AS TEXTFILE; + +CREATE TABLE test_open_csv_standard_prop ( + id INT, + name STRING, + age INT, + salary DOUBLE, + is_active BOOLEAN, + hire_date DATE, + last_login TIMESTAMP, + rating FLOAT, + description STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ( + "separatorChar" = ",", + "quoteChar" = "\"", + "escapeChar" = "\\" +) +STORED AS TEXTFILE; + +CREATE TABLE test_open_csv_custom_prop ( + id INT, + name STRING, + age INT, + salary DOUBLE, + is_active BOOLEAN, + hire_date DATE, + last_login TIMESTAMP, + rating FLOAT, + description STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ( + "separatorChar" = "\t", + "quoteChar" = "\'", + "escapeChar" = "|" +) +STORED AS TEXTFILE; + +INSERT INTO TABLE test_open_csv_default_prop VALUES +(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'), +(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"'); + +INSERT INTO TABLE test_open_csv_standard_prop VALUES +(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'), +(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"'); + +INSERT INTO TABLE test_open_csv_custom_prop VALUES +(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'), +(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"'); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java index 5ded87e0d23..74f3dcc1a9d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java @@ -19,6 +19,7 @@ package org.apache.doris.datasource.hive; import com.google.common.collect.ImmutableSet; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.serde2.OpenCSVSerde; import java.util.HashMap; import java.util.Map; @@ -27,15 +28,12 @@ import java.util.Set; public class HiveProperties { public static final String PROP_FIELD_DELIMITER = "field.delim"; - public static final String PROP_SEPARATOR_CHAR = "separatorChar"; public static final String PROP_SERIALIZATION_FORMAT = "serialization.format"; public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01" public static final String PROP_LINE_DELIMITER = "line.delim"; public static final String DEFAULT_LINE_DELIMITER = "\n"; - public static final String PROP_QUOTE_CHAR = "quoteChar"; - public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim"; public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim"; public static final String DEFAULT_COLLECTION_DELIMITER = "\2"; @@ -49,6 +47,14 @@ public class HiveProperties { public static final String PROP_NULL_FORMAT = "serialization.null.format"; public static final String DEFAULT_NULL_FORMAT = "\\N"; + // The following properties are used for OpenCsvSerde. + public static final String PROP_SEPARATOR_CHAR = OpenCSVSerde.SEPARATORCHAR; + public static final String DEFAULT_SEPARATOR_CHAR = ","; + public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR; + public static final String DEFAULT_QUOTE_CHAR = "\""; + public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR; + public static final String DEFAULT_ESCAPE_CHAR = "\\"; + public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of( PROP_FIELD_DELIMITER, PROP_COLLECTION_DELIMITER_HIVE2, @@ -59,37 +65,33 @@ public class HiveProperties { PROP_QUOTE_CHAR, PROP_MAP_KV_DELIMITER, PROP_ESCAPE_DELIMITER, - PROP_NULL_FORMAT - ); + PROP_ESCAPE_CHAR, + PROP_NULL_FORMAT); public static String getFieldDelimiter(Table table) { // This method is used for text format. - // If you need compatibility with csv format, please use `getColumnSeparator`. Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER); Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat)); } - public static String getColumnSeparator(Table table) { - Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER); - Optional<String> columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR); - Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT); - return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat)); + public static String getSeparatorChar(Table table) { + Optional<String> separatorChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR); + return HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_SEPARATOR_CHAR, separatorChar); } - public static String getLineDelimiter(Table table) { Optional<String> lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_LINE_DELIMITER, lineDelim)); + DEFAULT_LINE_DELIMITER, lineDelim)); } public static String getMapKvDelimiter(Table table) { Optional<String> mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_MAP_KV_DELIMITER, mapkvDelim)); + DEFAULT_MAP_KV_DELIMITER, mapkvDelim)); } public static String getCollectionDelimiter(Table table) { @@ -101,14 +103,6 @@ public class HiveProperties { DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3)); } - public static Optional<String> getQuoteChar(Table table) { - Map<String, String> serdeParams = table.getSd().getSerdeInfo().getParameters(); - if (serdeParams.containsKey(PROP_QUOTE_CHAR)) { - return Optional.of(serdeParams.get(PROP_QUOTE_CHAR)); - } - return Optional.empty(); - } - public static Optional<String> getEscapeDelimiter(Table table) { Optional<String> escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER); if (escapeDelim.isPresent()) { @@ -127,6 +121,16 @@ public class HiveProperties { return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, nullFormat); } + public static String getQuoteChar(Table table) { + Optional<String> quoteChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR); + return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_QUOTE_CHAR, quoteChar); + } + + public static String getEscapeChar(Table table) { + Optional<String> escapeChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR); + return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, escapeChar); + } + // Set properties to table public static void setTableProperties(Table table, Map<String, String> properties) { HashMap<String, String> serdeProps = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index f17de4bfe0a..dbf1ea9cd9a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -385,20 +385,36 @@ public class HiveScanNode extends FileQueryScanNode { protected TFileAttributes getFileAttributes() throws UserException { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); Table table = hmsTable.getRemoteTable(); - // 1. set column separator - textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table)); - // 2. set line delimiter - textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table)); - // 3. set mapkv delimiter - textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table)); - // 4. set collection delimiter - textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table)); - // 5. set quote char - HiveProperties.getQuoteChar(table).ifPresent(d -> textParams.setEnclose(d.getBytes()[0])); - // 6. set escape delimiter - HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0])); - // 7. set null format - textParams.setNullFormat(HiveProperties.getNullFormat(table)); + // TODO: separate hive text table and OpenCsv table + String serDeLib = table.getSd().getSerdeInfo().getSerializationLib(); + if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) { + // set properties of LazySimpleSerDe + // 1. set column separator + textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table)); + // 2. set line delimiter + textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table)); + // 3. set mapkv delimiter + textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table)); + // 4. set collection delimiter + textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table)); + // 5. set escape delimiter + HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0])); + // 6. set null format + textParams.setNullFormat(HiveProperties.getNullFormat(table)); + } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) { + // set set properties of OpenCSVSerde + // 1. set column separator + textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table)); + // 2. set line delimiter + textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table)); + // 3. set enclose char + textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]); + // 4. set escape char + textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]); + } else { + throw new UserException( + "unsupported hive table serde: " + serDeLib); + } TFileAttributes fileAttributes = new TFileAttributes(); fileAttributes.setTextParams(textParams); diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index a527c7b687d..3d45e525ecf 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -39,6 +39,18 @@ b 2.2 3 aaa 4 "null" +-- !test_open_csv_default_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 ""Project Manager"" + +-- !test_open_csv_standard_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 "Project Manager" + +-- !test_open_csv_custom_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 "Project Manager" + -- !1 -- a 1.1 b 2.2 @@ -79,3 +91,14 @@ b 2.2 3 aaa 4 "null" +-- !test_open_csv_default_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 ""Project Manager"" + +-- !test_open_csv_standard_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 "Project Manager" + +-- !test_open_csv_custom_prop -- +1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer +2 Jane,Smith FALSE 2020-05-20 "Project Manager" diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index d0c191f7c67..52cdd25eb07 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -51,6 +51,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte hive_docker """truncate table regression.serde_test8;""" sql """insert into ${catalog_name}.regression.serde_test8 select * from ${catalog_name}.regression.serde_test7;""" qt_9 """select * from ${catalog_name}.regression.serde_test8 order by id;""" + + qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;""" + qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;""" + qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;""" } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org