This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new c704497d02 [fix](csv_reader)Fixed bug when parsing multi-character
delimiters. (#24572)
c704497d02 is described below
commit c704497d021016d4c8e6087287b90c1dc646f8be
Author: daidai <[email protected]>
AuthorDate: Wed Sep 20 12:41:35 2023 +0800
[fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572)
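Fixed a bug when parsing multi-character delimiters: overlapping matches of
the column separator were each counted as a delimiter, so with separator "xx"
the string "xxxx" was parsed as three delimiters instead of two. A match that
overlaps the previously consumed delimiter is now skipped.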
---
be/src/vec/exec/format/csv/csv_reader.cpp | 22 ++++++++++--
.../load_p0/stream_load/test_csv_split_line.out | 42 ++++++++++++++++++++++
.../load_p0/stream_load/test_csv_split_line2.csv | 5 ++-
.../load_p0/stream_load/test_csv_split_line3.csv | 3 ++
.../load_p0/stream_load/test_csv_split_line4.csv | 16 +++++++++
.../load_p0/stream_load/test_csv_split_line.groovy | 41 +++++++++++++++++++++
6 files changed, 126 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index eeb3aac416..93769a97c9 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -139,9 +139,27 @@ void
PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line,
if (j == value_sep_len - 1) {
curpos = i - value_sep_len + 1;
- process_value_func(line.data, start, curpos - start,
trimming_char, splitted_values);
+ /*
+ * column_separator : "xx"
+ * data.csv : data1xxxxdata2
+ *
+ * Incorrect parse (overlapping matches are all counted):
+ * data1[xx]xxdata2
+ * data1x[xx]xdata2
+ * data1xx[xx]data2
+ * The string "xxxx" is parsed into three "xx" delimiters.
+ *
+ * Correct parse (a match overlapping the previous delimiter is skipped):
+ * data1[xx]xxdata2
+ * data1xx[xx]data2
+ */
+
+ if (curpos >= start) {
+ process_value_func(line.data, start, curpos - start,
trimming_char,
+ splitted_values);
+ start = i + 1;
+ }
- start = i + 1;
j = next[j];
}
}
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out
b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
index 7a97bc9314..0b16a8f480 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line.out
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
@@ -4,11 +4,53 @@
-- !sql --
1000 worldhell 10000000 ello
+1111 22131 123123 0000000
2000 wohellhell 200000 ellohell
+2222 \N \N \N
3000 worellohell 30000000 elloab
4000 hellwohellhell \N abcdeeelhllo
+7777 \N
10001 helloword 114466 0000011445
+33333 00 11111 00000
+44444 00 11111
55555 \N 14455 7711445777
66666 \N \N 113355
77777 0011455 8888 114545
+99999 \N
+
+-- !sql --
+1 USER 13456 430,431,6418,419,31,341,420,421,7,428,429 0
2023-09-13T09:55:32
+10 \N 1 \N
+11 \N \N 2023-09-13T09:57:32
+12 abc 21 1 \N
+13 \N 22 1 \N
+14 \N \N \N \N \N
+15 112 \N 1231 \N \N
+16 1 \N 1231 \N \N
+2 USER 642836 68,260,257,334,30,218,308,309,31,75 0
2023-09-13T09:57:32
+3 CLASS 366 0 2023-09-13T09:57:32
+4 CLASS 10207 0 2023-09-13T09:57:32
+5 CLASS 111 \N \N
+6 USER 1 11 \N \N
+7 USER 1 11 \N 2023-09-13T09:57:32
+8 \N \N \N
+9 \N 1 \N \N
+
+-- !sql --
+10 \N 1 \N
+11 \N \N 2023-09-13T09:57:32
+12 abc 21 1 \N
+3 CLASS 366 0 2023-09-13T09:57:32
+4 CLASS 10207 0 2023-09-13T09:57:32
+5 CLASS 111 \N \N
+8 \N \N \N
+
+-- !sql --
+10 \N 1 \N
+11 \N \N 2023-09-13T09:57:32
+12 abc 21 1 \N
+3 CLASS 366 0 2023-09-13T09:57:32
+4 CLASS 10207 0 2023-09-13T09:57:32
+5 CLASS 111 \N \N
+8 \N \N \N
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
index 04ba509ae4..94340cebd1 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
@@ -1,4 +1,7 @@
1000helloworldhellhello10000000helloello
2000hellowohellhellhello200000helloellohell
3000helloworellohellhello30000000helloelloab
-4000hellohellwohellhellhello\Nhelloabcdeeelhllo
\ No newline at end of file
+4000hellohellwohellhellhello\Nhelloabcdeeelhllo
+"1111"hello"22131"hello"123123"hello0000000
+2222hello\Nhello\Nhello\N
+7777hellohellohello
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
index 4332f6b90e..f2bb26a8fb 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
@@ -2,3 +2,6 @@
55555114455\N114455144551144557711445777
66666114455\N114455\N114455113355
7777711445500114551144558888114455114545
+99999114455114455114455
+33333114455001144551111111445500000
+444441144550011445511111114455
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv
new file mode 100644
index 0000000000..8956ed41be
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv
@@ -0,0 +1,16 @@
+1||USER||13456||430,431,6418,419,31,341,420,421,7,428,429||0||2023-09-13
09:55:32
+2||USER||642836||68,260,257,334,30,218,308,309,31,75||0||2023-09-13 09:57:32
+3||CLASS||366||||0||2023-09-13 09:57:32
+4||CLASS||10207||||0||2023-09-13 09:57:32
+5||CLASS||111||||||
+6||USER||1||11||||
+7||USER||1||11||||2023-09-13 09:57:32
+8||||||||||
+9||||||1||||
+10||||||||1||
+11||||||||||2023-09-13 09:57:32
+12||abc||21||||1||
+13||||||22||1||
+14||\N||\N||\N||\N||\N
+15||112||||1231||||
+16||1||||1231||||
\ No newline at end of file
diff --git
a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
index c3b786bfc7..47bd8c3bbc 100644
--- a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
@@ -108,6 +108,7 @@ suite("test_csv_split_line", "p0") {
streamLoad {
table "${tableName}2"
set 'column_separator', 'hello'
+ set 'trim_double_quotes', 'true'
file 'test_csv_split_line2.csv'
}
streamLoad {
@@ -124,4 +125,44 @@ suite("test_csv_split_line", "p0") {
sql """ drop table ${tableName}2; """
+ sql """ DROP TABLE IF EXISTS ${tableName}3 """
+ sql """ create table ${tableName}3 (
+ `user_id` bigint(20) NULL,
+ `tag_type` varchar(20) NULL ,
+ `tag_owner_id` bigint(20) NULL,
+ `tag_value` text NULL ,
+ `deleted` tinyint(4) NULL ,
+ `create_time` datetime NULL DEFAULT CURRENT_TIMESTAMP
+ ) ENGINE=OLAP
+ UNIQUE KEY(`user_id`, `tag_type`, `tag_owner_id`)
+ DISTRIBUTED BY HASH(`user_id`) BUCKETS 20
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "is_being_synced" = "false",
+ "colocate_with" = "__global__crm_user_group",
+ "storage_format" = "V2",
+ "enable_unique_key_merge_on_write" = "true",
+ "disable_auto_compaction" = "false",
+ "enable_single_replica_compaction" = "false"
+ );
+ """
+
+ streamLoad {
+ table "${tableName}3"
+ set 'column_separator', '||'
+ file 'test_csv_split_line4.csv'
+ }
+ order_qt_sql """
+ select * from ${tableName}3 order by user_id;
+ """
+
+ order_qt_sql """
+ select * from ${tableName}3 where tag_value="" order by user_id;
+ """
+ order_qt_sql """
+ select * from ${tableName}3 where tag_value="" order by user_id;
+ """
+
+ sql """ drop table ${tableName}3; """
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]