Repository: hive Updated Branches: refs/heads/branch-3 b3313380c -> 32e29cc63
HIVE-19453 : Extend Load Data statement to take Input file format and Serde as parameters (Deepak Jaiswal, reviewed by Jason Dere) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/32e29cc6 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/32e29cc6 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/32e29cc6 Branch: refs/heads/branch-3 Commit: 32e29cc63c41d722a4b2f8ffae4b9c3a660b8db4 Parents: b331338 Author: Deepak Jaiswal <djais...@apache.org> Authored: Wed May 9 11:06:34 2018 -0700 Committer: Deepak Jaiswal <djais...@apache.org> Committed: Fri May 11 10:55:14 2018 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hive/ql/parse/HiveParser.g | 12 +++++-- .../hive/ql/parse/LoadSemanticAnalyzer.java | 33 ++++++++++++++++++-- .../clientpositive/load_data_using_job.q | 8 +++-- .../llap/load_data_using_job.q.out | 8 +++++ 4 files changed, 54 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index a837d67..3712a53 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -422,6 +422,7 @@ TOK_ADD_TRIGGER; TOK_REPLACE; TOK_LIKERP; TOK_UNMANAGED; +TOK_INPUTFORMAT; } @@ -835,8 +836,8 @@ execStatement loadStatement @init { pushMsg("load statement", state); } @after { popMsg(state); } - : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition) - -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite?) + : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition) inputFileFormat? + -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite? inputFileFormat?) ; replicationClause @@ -1489,6 +1490,13 @@ fileFormat | genericSpec=identifier -> ^(TOK_FILEFORMAT_GENERIC $genericSpec) ; +inputFileFormat +@init { pushMsg("Load Data input file format specification", state); } +@after { popMsg(state); } + : KW_INPUTFORMAT inFmt=StringLiteral KW_SERDE serdeCls=StringLiteral + -> ^(TOK_INPUTFORMAT $inFmt $serdeCls) + ; + tabTypeExpr @init { pushMsg("specifying table types", state); } @after { popMsg(state); } http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 2b88ea6..866f43d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -79,6 +79,8 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer { // AST specific data private Tree fromTree, tableTree; private boolean isLocal = false, isOverWrite = false; + private String inputFormatClassName = null; + private String serDeClassName = null; public LoadSemanticAnalyzer(QueryState queryState) throws SemanticException { super(queryState); @@ -257,12 +259,30 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer { fromTree = ast.getChild(0); tableTree = ast.getChild(1); - if (ast.getChildCount() == 4) { + boolean inputInfo = false; + // Check the last node + ASTNode child = (ASTNode)ast.getChild(ast.getChildCount() - 1); + if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) { + if (child.getChildCount() != 2) { + throw new SemanticException("FileFormat should contain both input format and Serde"); + } + try { + inputFormatClassName = stripQuotes(child.getChild(0).getText()); + serDeClassName = stripQuotes(child.getChild(1).getText()); + inputInfo = true; + } catch (Exception e) { + throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect"); + } + } + + if ((!inputInfo && ast.getChildCount() == 4) || + (inputInfo && ast.getChildCount() == 5)) { isLocal = true; isOverWrite = true; } - if (ast.getChildCount() == 3) { + if ((!inputInfo && ast.getChildCount() == 3) || + (inputInfo && ast.getChildCount() == 4)) { if (ast.getChild(2).getText().toLowerCase().equals("local")) { isLocal = true; } else { @@ -450,7 +470,14 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer { // Set data location and input format, it must be text tempTableObj.setDataLocation(new Path(fromURI)); - tempTableObj.setInputFormatClass(TextInputFormat.class); + if (inputFormatClassName != null && serDeClassName != null) { + try { + tempTableObj.setInputFormatClass(inputFormatClassName); + tempTableObj.setSerializationLib(serDeClassName); + } catch (HiveException e) { + throw new SemanticException("Load Data: Failed to set inputFormat or SerDe"); + } + } // Step 2 : create the Insert query StringBuilder rewrittenQueryStr = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/test/queries/clientpositive/load_data_using_job.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/load_data_using_job.q b/ql/src/test/queries/clientpositive/load_data_using_job.q index 3928f1f..3659b6e 100644 --- a/ql/src/test/queries/clientpositive/load_data_using_job.q +++ b/ql/src/test/queries/clientpositive/load_data_using_job.q @@ -84,7 +84,11 @@ drop table srcbucket_mapjoin; -- Load into ORC table using text files CREATE TABLE srcbucket_mapjoin(key int, value string) partitioned by (ds string) STORED AS ORC; -explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin; -load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin; +explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; +load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; select * from srcbucket_mapjoin; drop table srcbucket_mapjoin; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out index 116630c..c3b70a3 100644 --- a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out +++ b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out @@ -2776,8 +2776,12 @@ POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@srcbucket_mapjoin PREHOOK: query: explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' PREHOOK: type: QUERY POSTHOOK: query: explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage @@ -2830,10 +2834,14 @@ STAGE PLANS: Basic Stats Work: PREHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' PREHOOK: type: QUERY PREHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__ PREHOOK: Output: default@srcbucket_mapjoin POSTHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' POSTHOOK: type: QUERY POSTHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__ POSTHOOK: Output: default@srcbucket_mapjoin@ds=2008-04-08