This is an automated email from the ASF dual-hosted git repository.
liugddx pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/seatunnel.git
The following commit(s) were added to refs/heads/dev by this push:
new 749b2fe364 [Fix][Connector-V2] Fix When reading Excel data, string and
date type conversion errors (#7796)
749b2fe364 is described below
commit 749b2fe364bcca33ce4cf4e69c4863c0853c278b
Author: CosmosNi <[email protected]>
AuthorDate: Thu Oct 10 07:53:01 2024 +0800
[Fix][Connector-V2] Fix When reading Excel data, string and date type
conversion errors (#7796)
---
.../file/source/reader/ExcelReadStrategy.java | 17 ++-
.../file/writer/ExcelReadStrategyTest.java | 162 +++++++++++++++++++++
.../src/test/resources/excel/test_read_excel.conf | 39 +++++
.../src/test/resources/excel/test_read_excel.xlsx | Bin 0 -> 10390 bytes
.../excel/test_read_excel_date_string.xlsx | Bin 0 -> 10387 bytes
5 files changed, 213 insertions(+), 5 deletions(-)
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
index b794879ee7..c90b6d6659 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
@@ -36,7 +36,6 @@ import
org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorExc
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
-import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
@@ -151,7 +150,7 @@ public class ExcelReadStrategy extends AbstractReadStrategy
{
|| isNullOrEmpty(seaTunnelRowType.getFieldTypes())) {
throw new FileConnectorException(
CommonErrorCodeDeprecated.UNSUPPORTED_OPERATION,
- "Schmea information is not set or incorrect schmea
settings");
+ "Schema information is not set or incorrect Schema
settings");
}
SeaTunnelRowType userDefinedRowTypeWithPartition =
mergePartitionTypes(fileNames.get(0), seaTunnelRowType);
@@ -190,8 +189,7 @@ public class ExcelReadStrategy extends AbstractReadStrategy
{
return cell.getBooleanCellValue();
case NUMERIC:
if (DateUtil.isCellDateFormatted(cell)) {
- DataFormatter formatter = new DataFormatter();
- return formatter.formatCellValue(cell);
+ return cell.getLocalDateTimeCellValue();
}
return cell.getNumericCellValue();
case ERROR:
@@ -215,7 +213,7 @@ public class ExcelReadStrategy extends AbstractReadStrategy
{
case ARRAY:
return objectMapper.readValue((String) field,
fieldType.getTypeClass());
case STRING:
- return field;
+ return String.valueOf(field);
case DOUBLE:
return Double.parseDouble(field.toString());
case BOOLEAN:
@@ -233,12 +231,21 @@ public class ExcelReadStrategy extends
AbstractReadStrategy {
case DECIMAL:
return
BigDecimal.valueOf(Double.parseDouble(field.toString()));
case DATE:
+ if (field instanceof LocalDateTime) {
+ return ((LocalDateTime) field).toLocalDate();
+ }
return LocalDate.parse(
(String) field,
DateTimeFormatter.ofPattern(dateFormat.getValue()));
case TIME:
+ if (field instanceof LocalDateTime) {
+ return ((LocalDateTime) field).toLocalTime();
+ }
return LocalTime.parse(
(String) field,
DateTimeFormatter.ofPattern(timeFormat.getValue()));
case TIMESTAMP:
+ if (field instanceof LocalDateTime) {
+ return field;
+ }
return LocalDateTime.parse(
(String) field,
DateTimeFormatter.ofPattern(datetimeFormat.getValue()));
case NULL:
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/writer/ExcelReadStrategyTest.java
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/writer/ExcelReadStrategyTest.java
new file mode 100644
index 0000000000..8aa43a03bd
--- /dev/null
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/writer/ExcelReadStrategyTest.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.file.writer;
+
+import org.apache.seatunnel.shade.com.typesafe.config.Config;
+import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory;
+
+import org.apache.seatunnel.api.source.Collector;
+import org.apache.seatunnel.api.table.catalog.CatalogTableUtil;
+import org.apache.seatunnel.api.table.type.SeaTunnelRow;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+import org.apache.seatunnel.common.utils.DateTimeUtils;
+import org.apache.seatunnel.common.utils.DateUtils;
+import org.apache.seatunnel.common.utils.TimeUtils;
+import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;
+import
org.apache.seatunnel.connectors.seatunnel.file.source.reader.ExcelReadStrategy;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import lombok.Getter;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.LocalTime;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import static
org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_DEFAULT;
+
+public class ExcelReadStrategyTest {
+
+ @Test
+ public void testExcelRead() throws IOException, URISyntaxException {
+ testExcelRead("/excel/test_read_excel.xlsx");
+ testExcelRead("/excel/test_read_excel_date_string.xlsx");
+ }
+
+ private void testExcelRead(String filePath) throws IOException,
URISyntaxException {
+ URL excelFile = ExcelReadStrategyTest.class.getResource(filePath);
+ URL conf =
ExcelReadStrategyTest.class.getResource("/excel/test_read_excel.conf");
+ Assertions.assertNotNull(excelFile);
+ Assertions.assertNotNull(conf);
+ String excelFilePath = Paths.get(excelFile.toURI()).toString();
+ String confPath = Paths.get(conf.toURI()).toString();
+ Config pluginConfig = ConfigFactory.parseFile(new File(confPath));
+ ExcelReadStrategy excelReadStrategy = new ExcelReadStrategy();
+ LocalConf localConf = new LocalConf(FS_DEFAULT_NAME_DEFAULT);
+ excelReadStrategy.setPluginConfig(pluginConfig);
+ excelReadStrategy.init(localConf);
+
+ List<String> fileNamesByPath =
excelReadStrategy.getFileNamesByPath(excelFilePath);
+ SeaTunnelRowType userDefinedSchema =
+
CatalogTableUtil.buildWithConfig(pluginConfig).getSeaTunnelRowType();
+ excelReadStrategy.setSeaTunnelRowTypeInfo(userDefinedSchema);
+ TestCollector testCollector = new TestCollector();
+ excelReadStrategy.read(fileNamesByPath.get(0), "", testCollector);
+ for (SeaTunnelRow seaTunnelRow : testCollector.getRows()) {
+ Assertions.assertEquals(seaTunnelRow.getArity(), 14);
+ Assertions.assertEquals(seaTunnelRow.getField(0).getClass(),
Byte.class);
+ Assertions.assertEquals(seaTunnelRow.getField(1).getClass(),
Short.class);
+ Assertions.assertEquals(seaTunnelRow.getField(2).getClass(),
Integer.class);
+ Assertions.assertEquals(seaTunnelRow.getField(3).getClass(),
Long.class);
+ Assertions.assertEquals(seaTunnelRow.getField(4).getClass(),
String.class);
+ Assertions.assertEquals(seaTunnelRow.getField(5).getClass(),
Double.class);
+ Assertions.assertEquals(seaTunnelRow.getField(6).getClass(),
Float.class);
+ Assertions.assertEquals(seaTunnelRow.getField(7).getClass(),
BigDecimal.class);
+ Assertions.assertEquals(seaTunnelRow.getField(8).getClass(),
Boolean.class);
+ Assertions.assertEquals(seaTunnelRow.getField(9).getClass(),
LinkedHashMap.class);
+ Assertions.assertEquals(seaTunnelRow.getField(10).getClass(),
String[].class);
+ Assertions.assertEquals(seaTunnelRow.getField(11).getClass(),
LocalDate.class);
+ Assertions.assertEquals(seaTunnelRow.getField(12).getClass(),
LocalDateTime.class);
+ Assertions.assertEquals(seaTunnelRow.getField(13).getClass(),
LocalTime.class);
+
+ Assertions.assertEquals(seaTunnelRow.getField(0), (byte) 1);
+ Assertions.assertEquals(seaTunnelRow.getField(1), (short) 22);
+ Assertions.assertEquals(seaTunnelRow.getField(2), 333);
+ Assertions.assertEquals(seaTunnelRow.getField(3), 4444L);
+ Assertions.assertEquals(seaTunnelRow.getField(4), "Cosmos");
+ Assertions.assertEquals(seaTunnelRow.getField(5), 5.555);
+ Assertions.assertEquals(seaTunnelRow.getField(6), (float) 6.666);
+ Assertions.assertEquals(seaTunnelRow.getField(7), new
BigDecimal("7.78"));
+ Assertions.assertEquals(seaTunnelRow.getField(8), Boolean.FALSE);
+ Assertions.assertEquals(
+ seaTunnelRow.getField(9),
+ new LinkedHashMap<String, String>() {
+ {
+ put("name", "Ivan");
+ put("age", "26");
+ }
+ });
+ Assertions.assertArrayEquals(
+ (String[]) seaTunnelRow.getField(10), new String[]
{"Ivan", "Dusayi"});
+ Assertions.assertEquals(
+ seaTunnelRow.getField(11),
+ DateUtils.parse("2024-01-31",
DateUtils.Formatter.YYYY_MM_DD));
+ Assertions.assertEquals(
+ seaTunnelRow.getField(12),
+ DateTimeUtils.parse(
+ "2024-01-31 16:00:48",
DateTimeUtils.Formatter.YYYY_MM_DD_HH_MM_SS));
+ Assertions.assertEquals(
+ seaTunnelRow.getField(13),
+ TimeUtils.parse("16:00:48", TimeUtils.Formatter.HH_MM_SS));
+ }
+ }
+
+ @Getter
+ public static class TestCollector implements Collector<SeaTunnelRow> {
+ private final List<SeaTunnelRow> rows = new ArrayList<>();
+
+ @Override
+ public void collect(SeaTunnelRow record) {
+ rows.add(record);
+ }
+
+ @Override
+ public Object getCheckpointLock() {
+ return null;
+ }
+ }
+
+ public static class LocalConf extends HadoopConf {
+ private static final String HDFS_IMPL =
"org.apache.hadoop.fs.LocalFileSystem";
+ private static final String SCHEMA = "file";
+
+ public LocalConf(String hdfsNameKey) {
+ super(hdfsNameKey);
+ }
+
+ @Override
+ public String getFsHdfsImpl() {
+ return HDFS_IMPL;
+ }
+
+ @Override
+ public String getSchema() {
+ return SCHEMA;
+ }
+ }
+}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.conf
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.conf
new file mode 100644
index 0000000000..295640031c
--- /dev/null
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.conf
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+{
+ sheet_name = "Sheet1"
+ skip_header_row_number = 1
+ schema = {
+ fields {
+ c_bytes = "tinyint"
+ c_short = "smallint"
+ c_int = "int"
+ c_bigint = "bigint"
+ c_string = "string"
+ c_double = "double"
+ c_float = "float"
+ c_decimal = "decimal(10, 2)"
+ c_boolean = "boolean"
+ c_map = "map<string, string>"
+ c_array = "array<string>"
+ c_date = "date"
+ c_datetime = "timestamp"
+ c_time = "time"
+ }
+ }
+}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.xlsx
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.xlsx
new file mode 100644
index 0000000000..c277b4d7fd
Binary files /dev/null and
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel.xlsx
differ
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel_date_string.xlsx
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel_date_string.xlsx
new file mode 100644
index 0000000000..4314f1587b
Binary files /dev/null and
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/excel/test_read_excel_date_string.xlsx
differ