[doris] branch master updated: [fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572)

morningman Tue, 19 Sep 2023 21:41:47 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new c704497d02 [fix](csv_reader)Fixed bug when parsing multi-character 
delimiters. (#24572)
c704497d02 is described below

commit c704497d021016d4c8e6087287b90c1dc646f8be
Author: daidai <[email protected]>
AuthorDate: Wed Sep 20 12:41:35 2023 +0800

    [fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572)
    
    Fixed bug when parsing multi-character delimiters.
---
 be/src/vec/exec/format/csv/csv_reader.cpp          | 22 ++++++++++--
 .../load_p0/stream_load/test_csv_split_line.out    | 42 ++++++++++++++++++++++
 .../load_p0/stream_load/test_csv_split_line2.csv   |  5 ++-
 .../load_p0/stream_load/test_csv_split_line3.csv   |  3 ++
 .../load_p0/stream_load/test_csv_split_line4.csv   | 16 +++++++++
 .../load_p0/stream_load/test_csv_split_line.groovy | 41 +++++++++++++++++++++
 6 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index eeb3aac416..93769a97c9 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -139,9 +139,27 @@ void 
PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line,
         if (j == value_sep_len - 1) {
             curpos = i - value_sep_len + 1;
 
-            process_value_func(line.data, start, curpos - start, 
trimming_char, splitted_values);
+            /*
+             * column_separator : "xx"
+             * data.csv :  data1xxxxdata2
+             *
+             * Incorrect parsing (every overlapping match is accepted):
+             *      data1[xx]xxdata2
+             *      data1x[xx]xdata2
+             *      data1xx[xx]data2
+             * The string "xxxx" is split into three "xx" delimiters.
+             *
+             * Correct parsing (overlapping matches are skipped):
+             *      data1[xx]xxdata2
+             *      data1xx[xx]data2
+             */
+
+            if (curpos >= start) {
+                process_value_func(line.data, start, curpos - start, 
trimming_char,
+                                   splitted_values);
+                start = i + 1;
+            }
 
-            start = i + 1;
             j = next[j];
         }
     }
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out 
b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
index 7a97bc9314..0b16a8f480 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line.out
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
@@ -4,11 +4,53 @@
 
 -- !sql --
 1000   worldhell       10000000        ello
+1111   22131   123123  0000000
 2000   wohellhell      200000  ellohell
+2222   \N      \N      \N
 3000   worellohell     30000000        elloab
 4000   hellwohellhell  \N      abcdeeelhllo
+7777           \N      
 10001  helloword       114466  0000011445
+33333  00      11111   00000
+44444  00      11111   
 55555  \N      14455   7711445777
 66666  \N      \N      113355
 77777  0011455 8888    114545
+99999          \N      
+
+-- !sql --
+1      USER    13456   430,431,6418,419,31,341,420,421,7,428,429       0       
2023-09-13T09:55:32
+10             \N              1       \N
+11             \N              \N      2023-09-13T09:57:32
+12     abc     21              1       \N
+13             \N      22      1       \N
+14     \N      \N      \N      \N      \N
+15     112     \N      1231    \N      \N
+16     1       \N      1231    \N      \N
+2      USER    642836  68,260,257,334,30,218,308,309,31,75     0       
2023-09-13T09:57:32
+3      CLASS   366             0       2023-09-13T09:57:32
+4      CLASS   10207           0       2023-09-13T09:57:32
+5      CLASS   111             \N      \N
+6      USER    1       11      \N      \N
+7      USER    1       11      \N      2023-09-13T09:57:32
+8              \N              \N      \N
+9              \N      1       \N      \N
+
+-- !sql --
+10             \N              1       \N
+11             \N              \N      2023-09-13T09:57:32
+12     abc     21              1       \N
+3      CLASS   366             0       2023-09-13T09:57:32
+4      CLASS   10207           0       2023-09-13T09:57:32
+5      CLASS   111             \N      \N
+8              \N              \N      \N
+
+-- !sql --
+10             \N              1       \N
+11             \N              \N      2023-09-13T09:57:32
+12     abc     21              1       \N
+3      CLASS   366             0       2023-09-13T09:57:32
+4      CLASS   10207           0       2023-09-13T09:57:32
+5      CLASS   111             \N      \N
+8              \N              \N      \N
 
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
index 04ba509ae4..94340cebd1 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
@@ -1,4 +1,7 @@
 1000helloworldhellhello10000000helloello
 2000hellowohellhellhello200000helloellohell
 3000helloworellohellhello30000000helloelloab
-4000hellohellwohellhellhello\Nhelloabcdeeelhllo
\ No newline at end of file
+4000hellohellwohellhellhello\Nhelloabcdeeelhllo
+"1111"hello"22131"hello"123123"hello0000000
+2222hello\Nhello\Nhello\N
+7777hellohellohello
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
index 4332f6b90e..f2bb26a8fb 100644
--- a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
@@ -2,3 +2,6 @@
 55555114455\N114455144551144557711445777
 66666114455\N114455\N114455113355
 7777711445500114551144558888114455114545
+99999114455114455114455
+33333114455001144551111111445500000
+444441144550011445511111114455
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv
new file mode 100644
index 0000000000..8956ed41be
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv
@@ -0,0 +1,16 @@
+1||USER||13456||430,431,6418,419,31,341,420,421,7,428,429||0||2023-09-13 
09:55:32
+2||USER||642836||68,260,257,334,30,218,308,309,31,75||0||2023-09-13 09:57:32
+3||CLASS||366||||0||2023-09-13 09:57:32
+4||CLASS||10207||||0||2023-09-13 09:57:32
+5||CLASS||111||||||
+6||USER||1||11||||
+7||USER||1||11||||2023-09-13 09:57:32
+8||||||||||
+9||||||1||||
+10||||||||1||
+11||||||||||2023-09-13 09:57:32
+12||abc||21||||1||
+13||||||22||1||
+14||\N||\N||\N||\N||\N
+15||112||||1231||||
+16||1||||1231||||
\ No newline at end of file
diff --git 
a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy 
b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
index c3b786bfc7..47bd8c3bbc 100644
--- a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
@@ -108,6 +108,7 @@ suite("test_csv_split_line", "p0") {
     streamLoad {
         table "${tableName}2"
         set 'column_separator', 'hello'
+        set 'trim_double_quotes', 'true'
         file 'test_csv_split_line2.csv'
     }
     streamLoad {
@@ -124,4 +125,44 @@ suite("test_csv_split_line", "p0") {
     
     sql """ drop table ${tableName}2; """ 
 
+    sql """ DROP TABLE IF EXISTS ${tableName}3 """
+    sql """ create table ${tableName}3 (
+        `user_id` bigint(20) NULL, 
+        `tag_type` varchar(20) NULL , 
+        `tag_owner_id` bigint(20) NULL, 
+        `tag_value` text NULL ,
+        `deleted` tinyint(4) NULL ,
+        `create_time` datetime NULL DEFAULT CURRENT_TIMESTAMP
+    ) ENGINE=OLAP
+        UNIQUE KEY(`user_id`, `tag_type`, `tag_owner_id`)
+        DISTRIBUTED BY HASH(`user_id`) BUCKETS 20
+        PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "is_being_synced" = "false",
+        "colocate_with" = "__global__crm_user_group",
+        "storage_format" = "V2",
+        "enable_unique_key_merge_on_write" = "true",
+        "disable_auto_compaction" = "false",
+        "enable_single_replica_compaction" = "false"
+    ); 
+    """        
+        
+    streamLoad {
+        table "${tableName}3"
+        set 'column_separator', '||'
+        file 'test_csv_split_line4.csv'
+    }
+    order_qt_sql """
+        select * from ${tableName}3 order by user_id;
+    """  
+
+    order_qt_sql """
+        select * from ${tableName}3 where tag_value="" order by user_id;
+    """      
+    order_qt_sql """
+        select * from ${tableName}3 where tag_value="" order by user_id;
+    """
+    
+    sql """ drop table ${tableName}3; """ 
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572)

Reply via email to