This is an automated email from the ASF dual-hosted git repository.

ngangam pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 1ceb8368803 HIVE-27586: Parse dates from strings ignoring trailing 
(potentially) invalid chars (#4545)
1ceb8368803 is described below

commit 1ceb8368803ec3f1d47d815fb2fed9fbae9ebc57
Author: Stamatis Zampetakis <zabe...@gmail.com>
AuthorDate: Tue Aug 15 16:24:32 2023 +0300

    HIVE-27586: Parse dates from strings ignoring trailing (potentially) invalid 
chars (#4545)
    
    * Parse dates from strings ignoring trailing (potentially) invalid chars
    
    Without these changes the parser consumes fully the input string and if
    the input does not match exactly the expected pattern null is returned.
    
    However there are cases where the input string starts with a completely
    valid date that matches the pattern but then it is followed by other
    characters; for example timestamps.
    
    The `DateParser` can handle `2023-08-03T16:02:00` but will return null
    for `2023-08-03_16:02:00`. Interestingly, it also handles fine
    `2023-08-03TGARBAGE` but not `2023-08-03_GARBAGE`.
    
    With these changes the `DateParser` can now extract valid dates even if
    there are trailing invalid chars after the expected pattern.
    
    The changes restore backward compatibility when parsing dates since
    partial parsing used to work before HIVE-20007. At the same time, it
    makes parsing more uniform since we do not treat 'T' or space (' ') as
    special cases and simplifies the parsing code.
    
    * Change date '12-01-2018' to '2018-01-12' in check_constraint.q
    
    The dd-MM-yyyy format is not accepted by Hive so inserting such values may 
not have the desired effect. The inserted value may be null, 0012-01-20, or 
something completely off depending on how date handling changes.
    
    The check_constraint.q is not about testing the validity of dates so 
changing the values to the common pattern accepted by Hive.
    
    * Enrich TestDateParser with tests for valid dates with trailing digits
    
    The test cases are interesting because some common date formats, such as 
dd-MM-yyyy, may be interpreted as valid dates in yy-MM-dd format with trailing 
(invalid) digits.
---
 .../org/apache/hadoop/hive/common/type/Date.java   | 21 ++++++-------
 .../apache/hive/common/util/TestDateParser.java    | 16 ++++++++++
 .../test/queries/clientpositive/check_constraint.q | 12 ++++----
 .../clientpositive/llap/check_constraint.q.out     | 36 +++++++++++-----------
 4 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/common/type/Date.java 
b/common/src/java/org/apache/hadoop/hive/common/type/Date.java
index 13e710c5d86..0f47ba513d4 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/Date.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/Date.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.common.type;
 
 import org.apache.hive.common.util.SuppressFBWarnings;
 
+import java.text.ParsePosition;
 import java.time.DateTimeException;
 import java.time.Instant;
 import java.time.LocalDate;
@@ -27,9 +28,11 @@ import java.time.ZoneId;
 import java.time.ZoneOffset;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeFormatterBuilder;
+import java.time.format.DateTimeParseException;
 import java.time.format.ResolverStyle;
 import java.time.format.SignStyle;
 import java.time.temporal.ChronoField;
+import java.time.temporal.TemporalAccessor;
 import java.util.Objects;
 
 import static java.time.temporal.ChronoField.DAY_OF_MONTH;
@@ -175,22 +178,16 @@ public class Date implements Comparable<Date> {
    */
   public static Date valueOf(final String text) {
     String s = Objects.requireNonNull(text).trim();
-    int idx = s.indexOf(" ");
-    if (idx != -1) {
-      s = s.substring(0, idx);
-    } else {
-      idx = s.indexOf('T');
-      if (idx != -1) {
-        s = s.substring(0, idx);
-      }
-    }
-    LocalDate localDate;
+    ParsePosition pos = new ParsePosition(0);
     try {
-      localDate = LocalDate.parse(s, PARSE_FORMATTER);
+      TemporalAccessor t = PARSE_FORMATTER.parseUnresolved(s, pos);
+      if (pos.getErrorIndex() >= 0) {
+        throw new DateTimeParseException("Text could not be parsed to date", 
s, pos.getErrorIndex());
+      }
+      return new Date(LocalDate.of(t.get(YEAR), t.get(MONTH_OF_YEAR), 
t.get(DAY_OF_MONTH)));
     } catch (DateTimeException e) {
       throw new IllegalArgumentException("Cannot create date, parsing error");
     }
-    return new Date(localDate);
   }
 
   public static Date ofEpochDay(int epochDay) {
diff --git a/common/src/test/org/apache/hive/common/util/TestDateParser.java 
b/common/src/test/org/apache/hive/common/util/TestDateParser.java
index 34695a469b3..f492ad52ead 100644
--- a/common/src/test/org/apache/hive/common/util/TestDateParser.java
+++ b/common/src/test/org/apache/hive/common/util/TestDateParser.java
@@ -60,6 +60,22 @@ public class TestDateParser {
     checkValidCase(" 2001-11-12 01:02:03", Date.valueOf("2001-11-12"));
   }
 
+  @Test
+  public void testParseDateFromTimestampWithCommonTimeDelimiter() {
+    for (String d : new String[] { "T", " ", "-", ".", "_", "" }) {
+      String ts = "2023-08-03" + d + "01:02:03";
+      assertEquals("Parsing " + ts, Date.of(2023, 8, 3), 
DateParser.parseDate(ts));
+    }
+  }
+
+  @Test
+  public void testParseDateFromValidDateLiteralWithTrailingDigits() {
+    assertEquals(Date.of(2023, 8, 8), DateParser.parseDate("2023-08-0800"));
+    // The result may seem unexpected but for many "08-08-20" is a valid date 
so there is no reason to reject
+    // "08-08-2023" and return null unless in the future Hive becomes stricter 
in terms of parsing dates.
+    assertEquals(Date.of(8, 8, 20), DateParser.parseDate("08-08-2023"));
+  }
+
   @Test
   public void testInvalidCases() throws Exception {
     checkInvalidCase("2001");
diff --git a/ql/src/test/queries/clientpositive/check_constraint.q 
b/ql/src/test/queries/clientpositive/check_constraint.q
index 6c769b928bd..521732c7c38 100644
--- a/ql/src/test/queries/clientpositive/check_constraint.q
+++ b/ql/src/test/queries/clientpositive/check_constraint.q
@@ -36,8 +36,8 @@ create table tmulti(url string NOT NULL ENABLE, userName 
string, numClicks int C
 explain alter table tmulti add constraint un1 UNIQUE (userName, numClicks) 
DISABLE;
 alter table tmulti add constraint un1 UNIQUE (userName, numClicks) DISABLE;
 DESC formatted tmulti;
-EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018');
-INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018');
+EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12');
+INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12');
 Select * from tmulti;
 
 -- alter table add constraint
@@ -45,16 +45,16 @@ truncate table tmulti;
 alter table tmulti add constraint chk1 CHECK (userName != NULL);
 alter table tmulti add constraint chk2 CHECK (numClicks <= 10000 AND userName 
!= '');
 DESC formatted tmulti;
-EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018');
-INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018');
+EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12');
+INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12');
 Select * from tmulti;
 Drop table tmulti;
 
 -- case insentivity
 create table tcase(url string NOT NULL ENABLE, userName string, d date, 
numClicks int CHECK (numclicks > 0));
 DESC formatted tcase;
-EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48);
-INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48);
+EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48);
+INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48);
 Select * from tcase ;
 Drop table tcase;
 
diff --git a/ql/src/test/results/clientpositive/llap/check_constraint.q.out 
b/ql/src/test/results/clientpositive/llap/check_constraint.q.out
index 603982b4935..ad95dfa6246 100644
--- a/ql/src/test/results/clientpositive/llap/check_constraint.q.out
+++ b/ql/src/test/results/clientpositive/llap/check_constraint.q.out
@@ -501,11 +501,11 @@ Table:                    default.tmulti
 Constraint Name:       #### A masked pattern was here ####      
 Column Name:numclicks  Check Value:numClicks > 0        
                 
-PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '12-01-2018')
+PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '2018-01-12')
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tmulti
-POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '12-01-2018')
+POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '2018-01-12')
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tmulti
@@ -526,7 +526,7 @@ STAGE PLANS:
                   Row Limit Per Split: 1
                   Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: array(const 
struct('hive.apache.com','user1',48,'12-01-2018')) (type: 
array<struct<col1:string,col2:string,col3:int,col4:string>>)
+                    expressions: array(const 
struct('hive.apache.com','user1',48,'2018-01-12')) (type: 
array<struct<col1:string,col2:string,col3:int,col4:string>>)
                     outputColumnNames: _col0
                     Statistics: Num rows: 1 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
                     UDTF Operator
@@ -567,11 +567,11 @@ STAGE PLANS:
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.tmulti
 
-PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018')
+PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12')
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tmulti
-POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018')
+POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12')
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tmulti
@@ -587,7 +587,7 @@ POSTHOOK: query: Select * from tmulti
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tmulti
 #### A masked pattern was here ####
-hive.apache.com        user1   48      NULL
+hive.apache.com        user1   48      2018-01-12
 PREHOOK: query: truncate table tmulti
 PREHOOK: type: TRUNCATETABLE
 PREHOOK: Output: default@tmulti
@@ -672,11 +672,11 @@ Column Name:null          Check Value:numClicks <= 10000 
AND userName != ''
 Constraint Name:       #### A masked pattern was here ####      
 Column Name:numclicks  Check Value:numClicks > 0        
                 
-PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '12-01-2018')
+PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '2018-01-12')
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tmulti
-POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '12-01-2018')
+POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 
48, '2018-01-12')
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tmulti
@@ -697,7 +697,7 @@ STAGE PLANS:
                   Row Limit Per Split: 1
                   Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: array(const 
struct('hive.apache.com','user1',48,'12-01-2018')) (type: 
array<struct<col1:string,col2:string,col3:int,col4:string>>)
+                    expressions: array(const 
struct('hive.apache.com','user1',48,'2018-01-12')) (type: 
array<struct<col1:string,col2:string,col3:int,col4:string>>)
                     outputColumnNames: _col0
                     Statistics: Num rows: 1 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
                     UDTF Operator
@@ -738,11 +738,11 @@ STAGE PLANS:
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.tmulti
 
-PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018')
+PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12')
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tmulti
-POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'12-01-2018')
+POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, 
'2018-01-12')
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tmulti
@@ -758,7 +758,7 @@ POSTHOOK: query: Select * from tmulti
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tmulti
 #### A masked pattern was here ####
-hive.apache.com        user1   48      NULL
+hive.apache.com        user1   48      2018-01-12
 PREHOOK: query: Drop table tmulti
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@tmulti
@@ -823,11 +823,11 @@ Table:                    default.tcase
 Constraint Name:       #### A masked pattern was here ####      
 Column Name:numclicks  Check Value:numclicks > 0        
                 
-PREHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', 
'12-01-2018', 48)
+PREHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', 
'2018-01-12', 48)
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tcase
-POSTHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', 
'12-01-2018', 48)
+POSTHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', 
'2018-01-12', 48)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tcase
@@ -848,7 +848,7 @@ STAGE PLANS:
                   Row Limit Per Split: 1
                   Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: array(const 
struct('hive.apache.com','user1','12-01-2018',48)) (type: 
array<struct<col1:string,col2:string,col3:string,col4:int>>)
+                    expressions: array(const 
struct('hive.apache.com','user1','2018-01-12',48)) (type: 
array<struct<col1:string,col2:string,col3:string,col4:int>>)
                     outputColumnNames: _col0
                     Statistics: Num rows: 1 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
                     UDTF Operator
@@ -889,11 +889,11 @@ STAGE PLANS:
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.tcase
 
-PREHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', 
'12-01-2018', 48)
+PREHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', 
'2018-01-12', 48)
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table
 PREHOOK: Output: default@tcase
-POSTHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', 
'12-01-2018', 48)
+POSTHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', 
'2018-01-12', 48)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tcase
@@ -909,7 +909,7 @@ POSTHOOK: query: Select * from tcase
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tcase
 #### A masked pattern was here ####
-hive.apache.com        user1   NULL    48
+hive.apache.com        user1   2018-01-12      48
 PREHOOK: query: Drop table tcase
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@tcase

Reply via email to