This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 184b95aaea [format] csv: optimize parsing of short, byte, int and long
(#6856)
184b95aaea is described below
commit 184b95aaeaeaa36472996ef5bb42df82406df8e1
Author: jerry <[email protected]>
AuthorDate: Wed Dec 24 21:57:59 2025 +0800
[format] csv: optimize parsing of short, byte, int and long (#6856)
---
.../org/apache/paimon/format/csv/CsvParser.java | 163 ++++++++++++++++++---
.../paimon/format/csv/CsvFileFormatTest.java | 36 +++++
2 files changed, 176 insertions(+), 23 deletions(-)
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
b/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
index a74908f73c..8ea5e60904 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
@@ -18,6 +18,7 @@
package org.apache.paimon.format.csv;
+import org.apache.paimon.annotation.VisibleForTesting;
import org.apache.paimon.casting.CastExecutor;
import org.apache.paimon.casting.CastExecutors;
import org.apache.paimon.data.BinaryString;
@@ -27,6 +28,7 @@ import org.apache.paimon.types.DataType;
import org.apache.paimon.types.DataTypeRoot;
import org.apache.paimon.types.DataTypes;
import org.apache.paimon.types.RowType;
+import org.apache.paimon.utils.Pair;
import javax.annotation.Nullable;
@@ -35,6 +37,8 @@ import java.util.Base64;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import static org.apache.paimon.format.csv.CsvOptions.Mode.DROPMALFORMED;
+import static org.apache.paimon.format.csv.CsvOptions.Mode.PERMISSIVE;
import static org.apache.paimon.utils.Preconditions.checkArgument;
import static org.apache.paimon.utils.StringUtils.isNullOrWhitespaceOnly;
@@ -44,6 +48,7 @@ public class CsvParser {
private static final Base64.Decoder BASE64_DECODER = Base64.getDecoder();
private static final Map<String, CastExecutor<?, ?>> CAST_EXECUTOR_CACHE =
new ConcurrentHashMap<>();
+ private static final int NUMBER_PARSE_RADIX = 10;
private final RowType dataSchemaRowType;
private final int[] projectMapping;
@@ -157,20 +162,28 @@ public class CsvParser {
for (int i = 0; i < projectMapping.length; i++) {
int ordinal = projectMapping[i];
DataType type = dataSchemaRowType.getTypeAt(ordinal);
- Object field = null;
+ Pair<Boolean, Object> parseResult = null; // stays null when parseField throws
+ Exception exception = null; // remembered so the mode handling below can decide what to do
+ String parseValue = rowValues[ordinal];
try {
- field = parseField(rowValues[ordinal], type);
+ parseResult = parseField(parseValue, type);
} catch (Exception e) {
- switch (mode) {
- case PERMISSIVE:
- break;
- case DROPMALFORMED:
- return null;
- case FAILFAST:
- throw e;
- }
+ exception = e;
+ }
+ if (parseResult != null && parseResult.getLeft()) {
+ row.setField(i, parseResult.getValue()); // successful parse (value may legitimately be null)
+ } else if (mode == PERMISSIVE
+ && (parseResult == null || !parseResult.getLeft() ||
exception != null)) {
+ row.setField(i, null); // PERMISSIVE: null out only this field and keep parsing the rest; a bare break here would leave the for loop and skip every remaining column
+ } else if (mode == DROPMALFORMED
+ && (parseResult == null || !parseResult.getLeft() ||
exception != null)) {
+ return null; // DROPMALFORMED: discard the whole row
+ } else if (exception != null) {
+ throw new RuntimeException(exception); // remaining mode (FAILFAST in the removed switch): surface the parse exception as the cause
+ } else if (parseResult == null
+ || !parseResult.getLeft() && parseResult.getValue() ==
null) {
+ throw new NumberFormatException("For input string: \"" +
parseValue + "\""); // integral parse reported failure without throwing
}
- row.setField(i, field);
}
return row;
}
@@ -188,35 +201,46 @@ public class CsvParser {
return true;
}
- private Object parseField(String field, DataType dataType) {
+ @VisibleForTesting
+ public Pair<Boolean, Object> parseField(String field, DataType dataType) { // left = whether parsing succeeded, right = parsed value (null for a null field or a failed parse)
if (field == null || field.equals(nullLiteral)) {
- return null;
+ return Pair.of(true, null); // a null field or one equal to nullLiteral is a successful parse of null
}
DataTypeRoot typeRoot = dataType.getTypeRoot();
switch (typeRoot) {
case TINYINT:
- return Byte.parseByte(field);
+ Integer intVal = parseInt(field); // parse as int first, then range-check against the byte bounds
+ if (intVal == null || intVal > Byte.MAX_VALUE || intVal <
Byte.MIN_VALUE) {
+ return Pair.of(false, null);
+ }
+ return Pair.of(true, intVal.byteValue());
case SMALLINT:
- return Short.parseShort(field);
+ intVal = parseInt(field); // same approach as TINYINT, with the short bounds
+ if (intVal == null || intVal > Short.MAX_VALUE || intVal <
Short.MIN_VALUE) {
+ return Pair.of(false, null);
+ }
+ return Pair.of(true, intVal.shortValue());
case INTEGER:
- return Integer.parseInt(field);
+ intVal = parseInt(field);
+ return Pair.of(intVal != null, intVal); // parseInt yields null for malformed or out-of-range input
case BIGINT:
- return Long.parseLong(field);
+ Long longVal = parseLong(field);
+ return Pair.of(longVal != null, longVal); // parseLong yields null for malformed or out-of-range input
case FLOAT:
- return Float.parseFloat(field);
+ return Pair.of(true, Float.parseFloat(field)); // malformed input still throws here; only the integral cases above report failure through the flag
case DOUBLE:
- return Double.parseDouble(field);
+ return Pair.of(true, Double.parseDouble(field));
case BOOLEAN:
- return Boolean.parseBoolean(field);
+ return Pair.of(true, Boolean.parseBoolean(field));
case CHAR:
case VARCHAR:
- return BinaryString.fromString(field);
+ return Pair.of(true, BinaryString.fromString(field));
case BINARY:
case VARBINARY:
- return BASE64_DECODER.decode(field);
+ return Pair.of(true, BASE64_DECODER.decode(field));
default:
- return parseByCastExecutor(field, dataType);
+ return Pair.of(true, parseByCastExecutor(field, dataType)); // every other type goes through parseByCastExecutor
}
}
@@ -233,4 +257,97 @@ public class CsvParser {
}
return BinaryString.fromString(field);
}
+
+ private static Integer parseInt(String s) { // parses a base-10 int; returns null instead of throwing when s is null, empty, malformed or out of int range
+ if (s == null || s.isEmpty()) {
+ return null;
+ }
+ int len = s.length();
+ int i = 0;
+ char firstChar = s.charAt(0);
+ boolean negative = false;
+ int limit = -Integer.MAX_VALUE; // most negative accumulator value allowed for a non-negative input
+
+ if (firstChar < '0') { // possible leading sign: '+' and '-' both sort below '0'
+ if (firstChar == '-') {
+ negative = true;
+ limit = Integer.MIN_VALUE; // a negative input may go all the way down to Integer.MIN_VALUE
+ } else if (firstChar != '+') {
+ return null;
+ }
+
+ if (len == 1) { // a lone "+" or "-" is not a number
+ return null;
+ }
+ i++;
+ }
+
+ int multmin = limit / NUMBER_PARSE_RADIX; // an accumulator below this would pass limit once multiplied by the radix
+ int result = 0; // accumulated as a negative number so that Integer.MIN_VALUE is representable
+ int digit;
+
+ while (i < len) {
+ digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
+ if (digit < 0) { // not a decimal digit
+ return null;
+ }
+ if (result < multmin) { // the multiplication below would overflow
+ return null;
+ }
+ result *= NUMBER_PARSE_RADIX;
+ if (result < limit + digit) { // subtracting the digit would pass limit
+ return null;
+ }
+ result -= digit;
+ }
+
+ return negative ? result : -result; // flip the sign back for non-negative input
+ }
+
+ private static Long parseLong(String s) { // parses a base-10 long; returns null instead of throwing when s is null, empty, malformed or out of long range
+ if (s == null || s.isEmpty()) {
+ return null;
+ }
+ int len = s.length();
+ int i = 0;
+ char firstChar = s.charAt(0);
+ boolean negative = false;
+ long limit = -Long.MAX_VALUE; // most negative accumulator value allowed for a non-negative input
+
+ if (firstChar < '0') { // possible leading sign: '+' and '-' both sort below '0'
+ if (firstChar == '-') {
+ negative = true;
+ limit = Long.MIN_VALUE; // a negative input may go all the way down to Long.MIN_VALUE
+ } else if (firstChar != '+') {
+ return null;
+ }
+
+ if (len == 1) { // a lone "+" or "-" is not a number
+ return null;
+ }
+ i++;
+ }
+
+ long multmin = limit / NUMBER_PARSE_RADIX; // an accumulator below this would pass limit once multiplied by the radix
+ long result = 0; // accumulated as a negative number so that Long.MIN_VALUE is representable
+ int digit;
+
+ while (i < len) {
+ digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
+ if (digit < 0) { // not a decimal digit
+ return null;
+ }
+
+ if (result < multmin) { // the multiplication below would overflow
+ return null;
+ }
+ result *= NUMBER_PARSE_RADIX;
+ if (result < limit + digit) { // subtracting the digit would pass limit
+ return null;
+ }
+ result -= digit;
+ }
+
+ return negative ? result : -result; // flip the sign back for non-negative input
+ }
}
diff --git
a/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
b/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
index a325bdb415..8e3b223a82 100644
---
a/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
+++
b/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
@@ -534,6 +534,7 @@ public class CsvFileFormatTest extends FormatReadWriteTest {
rowType,
testFile);
})
+ .cause()
.isInstanceOf(IllegalArgumentException.class);
}
@@ -569,6 +570,41 @@ public class CsvFileFormatTest extends FormatReadWriteTest
{
assertThat(permissiveResult.get(3).getDouble(2)).isEqualTo(400.81);
}
+ @Test
+ public void testCsvParserParseField() { // exercises parseField directly for the four integral types
+ RowType rowType =
+ DataTypes.ROW(
+ DataTypes.TINYINT(),
+ DataTypes.SMALLINT(),
+ DataTypes.INT(),
+ DataTypes.BIGINT());
+ int[] projection = {0, 1, 2, 3};
+ CsvParser parser = new CsvParser(rowType, projection, new
CsvOptions(new Options()));
+
+ // Test normal cases
+ assertThat(parser.parseField("123",
DataTypes.INT()).getValue()).isEqualTo(123);
+ assertThat(parser.parseField("-0",
DataTypes.INT()).getValue()).isEqualTo(0); // a signed zero parses to plain 0
+ assertThat(parser.parseField("0",
DataTypes.INT()).getValue()).isEqualTo(0);
+ assertThat(parser.parseField("123",
DataTypes.BIGINT()).getValue()).isEqualTo(123L);
+ assertThat(parser.parseField("-0",
DataTypes.BIGINT()).getValue()).isEqualTo(0L);
+ assertThat(parser.parseField("0",
DataTypes.BIGINT()).getValue()).isEqualTo(0L);
+ assertThat(parser.parseField("123",
DataTypes.TINYINT()).getValue()).isEqualTo((byte) 123);
+ assertThat(parser.parseField("12345", DataTypes.SMALLINT()).getValue())
+ .isEqualTo((short) 12345);
+
+ // Test invalid format
+ assertThat(parser.parseField("abc",
DataTypes.INT()).getValue()).isNull();
+ assertThat(parser.parseField("12.3",
DataTypes.INT()).getValue()).isNull();
+
+ // Test overflow
+ assertThat(parser.parseField("2147483648",
DataTypes.INT()).getValue()).isNull(); // Integer.MAX_VALUE + 1
+ assertThat(parser.parseField("-2147483649",
DataTypes.INT()).getValue()).isNull(); // Integer.MIN_VALUE - 1
+ assertThat(parser.parseField("9223372036854775808",
DataTypes.BIGINT()).getValue())
+ .isNull(); // Long.MAX_VALUE + 1
+ assertThat(parser.parseField("128",
DataTypes.TINYINT()).getValue()).isNull(); // Byte.MAX_VALUE + 1
+ assertThat(parser.parseField("32768",
DataTypes.SMALLINT()).getValue()).isNull(); // Short.MAX_VALUE + 1
+ }
+
private List<InternalRow> read(
FileFormat format, RowType fullRowType, RowType readRowType, Path
testFile)
throws IOException {