This is an automated email from the ASF dual-hosted git repository. ngangam pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 1ceb8368803 HIVE-27586: Parse dates from strings ignoring trailing (potentially) invalid chars (#4545) 1ceb8368803 is described below commit 1ceb8368803ec3f1d47d815fb2fed9fbae9ebc57 Author: Stamatis Zampetakis <zabe...@gmail.com> AuthorDate: Tue Aug 15 16:24:32 2023 +0300 HIVE-27586: Parse dates from strings ignoring trailing (potentially) invalid chars (#4545) * Parse dates from strings ignoring trailing (potentially) invalid chars Without these changes the parser consumes fully the input string and if the input does not match exactly the expected pattern null is returned. However there are cases where the input string starts with a completely valid date that matches the pattern but then it is followed by other characters; for example timestamps. The `DateParser` can handle `2023-08-03T16:02:00` but will return null for `2023-08-03_16:02:00`. Interestingly, it also handles fine `2023-08-03TGARBAGE` but not `2023-08-03_GARBAGE`. With these changes the `DateParser` can now extract valid dates even if there are trailing invalid chars after the expected pattern. The changes restore backward compatibility when parsing dates since partial parsing used to work before HIVE-20007. At the same time, it makes parsing more uniform since we do not treat 'T' or space (' ') as special cases and simplifies the parsing code. * Change date '12-01-2018' to '2018-01-12' in check_constraint.q The dd-MM-yyyy format is not accepted by Hive so inserting such values may not have the desired effect. The inserted value may be null, 0012-01-20, or something completely off depending on how date handling changes. The check_constraint.q is not about testing the validity of dates so changing the values to the common pattern accepted by Hive. 
* Enrich TestDateParser with tests for valid dates with trailing digits The test cases are interesting because some common date formats, such as dd-MM-yyyy, may be interpreted as valid dates in yy-MM-dd format with trailing (invalid) digits. --- .../org/apache/hadoop/hive/common/type/Date.java | 21 ++++++------- .../apache/hive/common/util/TestDateParser.java | 16 ++++++++++ .../test/queries/clientpositive/check_constraint.q | 12 ++++---- .../clientpositive/llap/check_constraint.q.out | 36 +++++++++++----------- 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/common/type/Date.java b/common/src/java/org/apache/hadoop/hive/common/type/Date.java index 13e710c5d86..0f47ba513d4 100644 --- a/common/src/java/org/apache/hadoop/hive/common/type/Date.java +++ b/common/src/java/org/apache/hadoop/hive/common/type/Date.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.common.type; import org.apache.hive.common.util.SuppressFBWarnings; +import java.text.ParsePosition; import java.time.DateTimeException; import java.time.Instant; import java.time.LocalDate; @@ -27,9 +28,11 @@ import java.time.ZoneId; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; import java.time.format.ResolverStyle; import java.time.format.SignStyle; import java.time.temporal.ChronoField; +import java.time.temporal.TemporalAccessor; import java.util.Objects; import static java.time.temporal.ChronoField.DAY_OF_MONTH; @@ -175,22 +178,16 @@ public class Date implements Comparable<Date> { */ public static Date valueOf(final String text) { String s = Objects.requireNonNull(text).trim(); - int idx = s.indexOf(" "); - if (idx != -1) { - s = s.substring(0, idx); - } else { - idx = s.indexOf('T'); - if (idx != -1) { - s = s.substring(0, idx); - } - } - LocalDate localDate; + ParsePosition pos = new ParsePosition(0); try { - localDate = 
LocalDate.parse(s, PARSE_FORMATTER); + TemporalAccessor t = PARSE_FORMATTER.parseUnresolved(s, pos); + if (pos.getErrorIndex() >= 0) { + throw new DateTimeParseException("Text could not be parsed to date", s, pos.getErrorIndex()); + } + return new Date(LocalDate.of(t.get(YEAR), t.get(MONTH_OF_YEAR), t.get(DAY_OF_MONTH))); } catch (DateTimeException e) { throw new IllegalArgumentException("Cannot create date, parsing error"); } - return new Date(localDate); } public static Date ofEpochDay(int epochDay) { diff --git a/common/src/test/org/apache/hive/common/util/TestDateParser.java b/common/src/test/org/apache/hive/common/util/TestDateParser.java index 34695a469b3..f492ad52ead 100644 --- a/common/src/test/org/apache/hive/common/util/TestDateParser.java +++ b/common/src/test/org/apache/hive/common/util/TestDateParser.java @@ -60,6 +60,22 @@ public class TestDateParser { checkValidCase(" 2001-11-12 01:02:03", Date.valueOf("2001-11-12")); } + @Test + public void testParseDateFromTimestampWithCommonTimeDelimiter() { + for (String d : new String[] { "T", " ", "-", ".", "_", "" }) { + String ts = "2023-08-03" + d + "01:02:03"; + assertEquals("Parsing " + ts, Date.of(2023, 8, 3), DateParser.parseDate(ts)); + } + } + + @Test + public void testParseDateFromValidDateLiteralWithTrailingDigits() { + assertEquals(Date.of(2023, 8, 8), DateParser.parseDate("2023-08-0800")); + // The result may seem unexpected but for many "08-08-20" is a valid date so there is no reason to reject + // "08-08-2023" and return null unless in the future Hive becomes stricter in terms of parsing dates. 
+ assertEquals(Date.of(8, 8, 20), DateParser.parseDate("08-08-2023")); + } + @Test public void testInvalidCases() throws Exception { checkInvalidCase("2001"); diff --git a/ql/src/test/queries/clientpositive/check_constraint.q b/ql/src/test/queries/clientpositive/check_constraint.q index 6c769b928bd..521732c7c38 100644 --- a/ql/src/test/queries/clientpositive/check_constraint.q +++ b/ql/src/test/queries/clientpositive/check_constraint.q @@ -36,8 +36,8 @@ create table tmulti(url string NOT NULL ENABLE, userName string, numClicks int C explain alter table tmulti add constraint un1 UNIQUE (userName, numClicks) DISABLE; alter table tmulti add constraint un1 UNIQUE (userName, numClicks) DISABLE; DESC formatted tmulti; -EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018'); -INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018'); +EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12'); +INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12'); Select * from tmulti; -- alter table add constraint @@ -45,16 +45,16 @@ truncate table tmulti; alter table tmulti add constraint chk1 CHECK (userName != NULL); alter table tmulti add constraint chk2 CHECK (numClicks <= 10000 AND userName != ''); DESC formatted tmulti; -EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018'); -INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018'); +EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12'); +INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12'); Select * from tmulti; Drop table tmulti; -- case insentivity create table tcase(url string NOT NULL ENABLE, userName string, d date, numClicks int CHECK (numclicks > 0)); DESC formatted tcase; -EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48); -INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48); +EXPLAIN INSERT INTO tcase 
values('hive.apache.com', 'user1', '2018-01-12', 48); +INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48); Select * from tcase ; Drop table tcase; diff --git a/ql/src/test/results/clientpositive/llap/check_constraint.q.out b/ql/src/test/results/clientpositive/llap/check_constraint.q.out index 603982b4935..ad95dfa6246 100644 --- a/ql/src/test/results/clientpositive/llap/check_constraint.q.out +++ b/ql/src/test/results/clientpositive/llap/check_constraint.q.out @@ -501,11 +501,11 @@ Table: default.tmulti Constraint Name: #### A masked pattern was here #### Column Name:numclicks Check Value:numClicks > 0 -PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tmulti -POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tmulti @@ -526,7 +526,7 @@ STAGE PLANS: Row Limit Per Split: 1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: array(const struct('hive.apache.com','user1',48,'12-01-2018')) (type: array<struct<col1:string,col2:string,col3:int,col4:string>>) + expressions: array(const struct('hive.apache.com','user1',48,'2018-01-12')) (type: array<struct<col1:string,col2:string,col3:int,col4:string>>) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE UDTF Operator @@ -567,11 +567,11 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.tmulti -PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +PREHOOK: 
query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tmulti -POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tmulti @@ -587,7 +587,7 @@ POSTHOOK: query: Select * from tmulti POSTHOOK: type: QUERY POSTHOOK: Input: default@tmulti #### A masked pattern was here #### -hive.apache.com user1 48 NULL +hive.apache.com user1 48 2018-01-12 PREHOOK: query: truncate table tmulti PREHOOK: type: TRUNCATETABLE PREHOOK: Output: default@tmulti @@ -672,11 +672,11 @@ Column Name:null Check Value:numClicks <= 10000 AND userName != '' Constraint Name: #### A masked pattern was here #### Column Name:numclicks Check Value:numClicks > 0 -PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +PREHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tmulti -POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +POSTHOOK: query: EXPLAIN INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tmulti @@ -697,7 +697,7 @@ STAGE PLANS: Row Limit Per Split: 1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: array(const struct('hive.apache.com','user1',48,'12-01-2018')) (type: array<struct<col1:string,col2:string,col3:int,col4:string>>) + expressions: array(const struct('hive.apache.com','user1',48,'2018-01-12')) (type: array<struct<col1:string,col2:string,col3:int,col4:string>>) 
outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE UDTF Operator @@ -738,11 +738,11 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.tmulti -PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +PREHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tmulti -POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '12-01-2018') +POSTHOOK: query: INSERT INTO tmulti values('hive.apache.com', 'user1', 48, '2018-01-12') POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tmulti @@ -758,7 +758,7 @@ POSTHOOK: query: Select * from tmulti POSTHOOK: type: QUERY POSTHOOK: Input: default@tmulti #### A masked pattern was here #### -hive.apache.com user1 48 NULL +hive.apache.com user1 48 2018-01-12 PREHOOK: query: Drop table tmulti PREHOOK: type: DROPTABLE PREHOOK: Input: default@tmulti @@ -823,11 +823,11 @@ Table: default.tcase Constraint Name: #### A masked pattern was here #### Column Name:numclicks Check Value:numclicks > 0 -PREHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48) +PREHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48) PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tcase -POSTHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48) +POSTHOOK: query: EXPLAIN INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48) POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tcase @@ -848,7 +848,7 @@ STAGE PLANS: Row Limit Per Split: 1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 
array(const struct('hive.apache.com','user1','12-01-2018',48)) (type: array<struct<col1:string,col2:string,col3:string,col4:int>>) + expressions: array(const struct('hive.apache.com','user1','2018-01-12',48)) (type: array<struct<col1:string,col2:string,col3:string,col4:int>>) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE UDTF Operator @@ -889,11 +889,11 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.tcase -PREHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48) +PREHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48) PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tcase -POSTHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', '12-01-2018', 48) +POSTHOOK: query: INSERT INTO tcase values('hive.apache.com', 'user1', '2018-01-12', 48) POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tcase @@ -909,7 +909,7 @@ POSTHOOK: query: Select * from tcase POSTHOOK: type: QUERY POSTHOOK: Input: default@tcase #### A masked pattern was here #### -hive.apache.com user1 NULL 48 +hive.apache.com user1 2018-01-12 48 PREHOOK: query: Drop table tcase PREHOOK: type: DROPTABLE PREHOOK: Input: default@tcase