This is an automated email from the ASF dual-hosted git repository. nitiraj pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/ambari.git
The following commit(s) were added to refs/heads/trunk by this push: new 123e330 AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore) (#526) 123e330 is described below commit 123e3308c02f0fd64bf0912b1b32b8f48438f034 Author: nitirajrathore <nitiraj.rath...@gmail.com> AuthorDate: Mon Mar 5 13:11:03 2018 +0530 AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore) (#526) * AMBARI-22833 : change commons-collections-3.2.1.jar being used by ambari views to commons-collections-3.2.2.jar (nitirajrathore) * AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore) (#510) * AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore) * AMBARI-22999 : Added relevant test cases (nitirajrathore) --- .../hive20/resources/uploads/UploadService.java | 25 +++++++-- .../resources/uploads/parsers/PreviewData.java | 8 +++ .../InsertFromQueryGeneratorSpecTest.groovy | 13 +++-- .../resources/uploads/UploadServiceTest.java | 60 ++++++++++++++++++++++ 4 files changed, 98 insertions(+), 8 deletions(-) diff --git a/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/UploadService.java b/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/UploadService.java index 8704440..2e7e1e3 100644 --- a/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/UploadService.java +++ b/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/UploadService.java @@ -18,6 +18,7 @@ package org.apache.ambari.view.hive20.resources.uploads; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.sun.jersey.core.header.FormDataContentDisposition; import com.sun.jersey.multipart.FormDataParam; @@ -46,6 +47,8 @@ import 
org.apache.ambari.view.hive20.resources.uploads.query.InsertFromQueryInput; import org.apache.ambari.view.hive20.utils.ServiceFormattedException; import org.apache.ambari.view.hive20.utils.SharedObjectsFactory; import org.apache.ambari.view.utils.ambari.AmbariApi; +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.io.input.ReaderInputStream; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -196,6 +199,19 @@ public class UploadService extends BaseService { } } + private Reader getInputStreamReader(InputStream is) throws IOException { + BOMInputStream bomInputStream = new BOMInputStream(is, + ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE + ); + if (bomInputStream.hasBOM()) { + String charSetName = bomInputStream.getBOMCharsetName(); + return new InputStreamReader(bomInputStream, charSetName); // return with the encoded charset encoding. 
+ } else { + return new InputStreamReader(bomInputStream); //return with default charset + } + } + private CSVParams getCsvParams(String csvDelimiter, String csvQuote, String csvEscape) { char csvq = CSVParams.DEFAULT_QUOTE_CHAR; char csvd = CSVParams.DEFAULT_DELIMITER_CHAR; @@ -465,7 +481,8 @@ public class UploadService extends BaseService { else return e.getMessage(); } - private PreviewData generatePreview(Boolean isFirstRowHeader, String inputFileType, CSVParams csvParams, InputStream uploadedInputStream) throws Exception { + @VisibleForTesting + PreviewData generatePreview(Boolean isFirstRowHeader, String inputFileType, CSVParams csvParams, InputStream uploadedInputStream) throws Exception { ParseOptions parseOptions = new ParseOptions(); parseOptions.setOption(ParseOptions.OPTIONS_FILE_TYPE, inputFileType); if (inputFileType.equals(ParseOptions.InputFileType.CSV.toString())){ @@ -483,7 +500,8 @@ public class UploadService extends BaseService { LOG.info("isFirstRowHeader : {}, inputFileType : {}", isFirstRowHeader, inputFileType); - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); return dataParser.parsePreview(); } @@ -529,7 +547,8 @@ public class UploadService extends BaseService { parseOptions.setOption(ParseOptions.OPTIONS_CSV_QUOTE, csvParams.getCsvQuote()); } - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); Reader csvReader = new TableDataReader(dataParser.iterator(), header, containsEndlines); // encode column values into HEX so that \n etc dont appear in the hive table data String path = uploadIntoTable(csvReader, databaseName, tableName); diff --git 
a/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/parsers/PreviewData.java b/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/parsers/PreviewData.java index dd31457..c68e3a6 100644 --- a/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/parsers/PreviewData.java +++ b/contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/parsers/PreviewData.java @@ -54,4 +54,12 @@ public class PreviewData { public void setPreviewRows(List<Row> previewRows) { this.previewRows = previewRows; } + + @Override + public String toString() { + return "PreviewData{" + + "header=" + header + + ", previewRows=" + previewRows + + '}'; + } } diff --git a/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/internal/query/generators/InsertFromQueryGeneratorSpecTest.groovy b/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/internal/query/generators/InsertFromQueryGeneratorSpecTest.groovy index dfdf9df..5fed745 100644 --- a/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/internal/query/generators/InsertFromQueryGeneratorSpecTest.groovy +++ b/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/internal/query/generators/InsertFromQueryGeneratorSpecTest.groovy @@ -28,7 +28,7 @@ class InsertFromQueryGeneratorSpecTest extends Specification { setup: List<ColumnInfo> colInfos = Arrays.asList(new ColumnInfo("col1", "STRING"), new ColumnInfo("col2", "INT"), new ColumnInfo("col3", "VARCHAR", 255), new ColumnInfo("col4", "CHAR", 25)) - InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", colInfos, false) + InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", Collections.emptyList(), colInfos, null, false) InsertFromQueryGenerator generator = new InsertFromQueryGenerator(insertFromQueryInput); when: @@ -41,14 +41,16 @@ class 
InsertFromQueryGeneratorSpecTest extends Specification { String queryStr = query.get(); then: - queryStr == "INSERT INTO TABLE `d2`.`t2` SELECT `col1`, `col2`, `col3`, `col4` FROM `d1.t1` ;" + queryStr == "set hive.exec.dynamic.partition.mode=nonstrict;\n" + + " FROM `d1`.`t1` tempTable INSERT INTO TABLE `d2`.`t2` SELECT tempTable.`col1`, tempTable.`col2`, tempTable.`col3`, tempTable.`col4`;" } - def "insert from with unhexing"() { + def "insert from with unhexing and partitioned columns"() { setup: List<ColumnInfo> colInfos = Arrays.asList(new ColumnInfo("col1", "STRING"), new ColumnInfo("col2", "INT"), new ColumnInfo("col3", "VARCHAR", 255), new ColumnInfo("col4", "CHAR", 25)) - InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", colInfos, true) + List<ColumnInfo> partititionedCols = Arrays.asList(new ColumnInfo("col5", "STRING"), new ColumnInfo("col6", "INT")) + InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", partititionedCols, colInfos, null, true) InsertFromQueryGenerator generator = new InsertFromQueryGenerator(insertFromQueryInput); when: @@ -61,6 +63,7 @@ class InsertFromQueryGeneratorSpecTest extends Specification { String queryStr = query.get(); then: - queryStr == "INSERT INTO TABLE `d2`.`t2` SELECT UNHEX(`col1`), `col2`, UNHEX(`col3`), UNHEX(`col4`) FROM `d1.t1` ;" + queryStr == "set hive.exec.dynamic.partition.mode=nonstrict;\n" + + " FROM `d1`.`t1` tempTable INSERT INTO TABLE `d2`.`t2` PARTITION (`col5`,`col6` ) SELECT UNHEX(tempTable.`col1`), tempTable.`col2`, UNHEX(tempTable.`col3`), UNHEX(tempTable.`col4`), UNHEX(tempTable.`col5`), tempTable.`col6`;" } } diff --git a/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/resources/uploads/UploadServiceTest.java b/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/resources/uploads/UploadServiceTest.java new file mode 100644 index 0000000..01921c3 --- /dev/null +++ 
b/contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/resources/uploads/UploadServiceTest.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ambari.view.hive20.resources.uploads; + +import org.apache.ambari.view.hive20.client.Row; +import org.apache.ambari.view.hive20.internal.dto.ColumnInfo; +import org.apache.ambari.view.hive20.resources.uploads.parsers.PreviewData; +import org.junit.Assert; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.Arrays; + +public class UploadServiceTest { + + @Test + public void generatePreviewWithBOM() throws Exception { + UploadService uploadService = new UploadService(); + // convert String into InputStream + String str = "\ufeffCol1\tCol2\nA\tB\n"; + InputStream inputStream = new ByteArrayInputStream(str.getBytes()); + PreviewData previewData = uploadService.generatePreview(true, "CSV", new CSVParams('\t', '\"', '\\'), inputStream); + + Assert.assertEquals("Incorrect number of columns detected.", 2, previewData.getHeader().size() ); + Assert.assertEquals("incorrect col objects.", Arrays.asList(new ColumnInfo("Col1", "CHAR", null, null, null), + new 
ColumnInfo("Col2", "CHAR", null, null, null)), previewData.getHeader()); + Assert.assertEquals("incorrect row objects.", Arrays.asList(new Row(new Object[]{"A", "B"})), previewData.getPreviewRows()); + } + + @Test + public void generatePreviewWithoutBOM() throws Exception { + UploadService uploadService = new UploadService(); + // convert String into InputStream + String str = "Col1\tCol2\nA\tB\n"; + InputStream inputStream = new ByteArrayInputStream(str.getBytes()); + PreviewData previewData = uploadService.generatePreview(true, "CSV", new CSVParams('\t', '\"', '\\'), inputStream); + + Assert.assertEquals("Incorrect number of columns detected.", 2, previewData.getHeader().size() ); + Assert.assertEquals("incorrect col objects.", Arrays.asList(new ColumnInfo("Col1", "CHAR", null, null, null), + new ColumnInfo("Col2", "CHAR", null, null, null)), previewData.getHeader()); + Assert.assertEquals("incorrect row objects.", Arrays.asList(new Row(new Object[]{"A", "B"})), previewData.getPreviewRows()); + } +} -- To stop receiving notification emails like this one, please contact niti...@apache.org.