[impala] branch master updated (eb85c6e -> 08367e9)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from eb85c6e IMPALA-9793: Impala quickstart cluster with docker-compose new f4584dd IMPALA-10404: Update docs to reflect RLE_DICTIONARY support new 08367e9 IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/shared/impala_common.xml | 5 +-- .../topics/impala_parquet_dictionary_filtering.xml | 8 +++-- .../apache/impala/analysis/CreateTableStmt.java| 25 ++- .../queries/QueryTest/iceberg-create.test | 37 ++ .../queries/QueryTest/iceberg-negative.test| 18 +++ .../queries/QueryTest/show-create-table.test | 19 +++ 6 files changed, 106 insertions(+), 6 deletions(-)
[impala] 02/02: IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 08367e91f04508b54f77b56e0d211dd167b0116f Author: Zoltan Borok-Nagy AuthorDate: Mon Jan 25 16:09:59 2021 +0100 IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax For convenience this patch adds support for the old-style CREATE TABLE ... PARTITIONED BY ...; syntax for Iceberg tables. So users should be able to write the following: CREATE TABLE ice_t (i int) PARTITIONED BY (p int) STORED AS ICEBERG; Which should be equivalent to this: CREATE TABLE ice_t (i int, p int) PARTITION BY SPEC (p IDENTITY) STORED AS ICEBERG; Please note that the old-style CREATE TABLE statement creates IDENTITY-partitioned tables. For other partition transforms the users must use the new, more generic syntax. Hive also supports the old PARTITIONED BY syntax with the same behavior. Testing: * added e2e tests Change-Id: I789876c161bc0987820955aa9ae01414e0dcb45d Reviewed-on: http://gerrit.cloudera.org:8080/16979 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../apache/impala/analysis/CreateTableStmt.java| 25 ++- .../queries/QueryTest/iceberg-create.test | 37 ++ .../queries/QueryTest/iceberg-negative.test| 18 +++ .../queries/QueryTest/show-create-table.test | 19 +++ 4 files changed, 98 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java index 5e89724..1477536 100644 --- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java @@ -37,6 +37,7 @@ import org.apache.impala.service.BackendConfig; import org.apache.impala.thrift.TCreateTableParams; import org.apache.impala.thrift.THdfsFileFormat; import org.apache.impala.thrift.TIcebergCatalog; +import 
org.apache.impala.thrift.TIcebergPartitionTransformType; import org.apache.impala.thrift.TSortingOrder; import org.apache.impala.thrift.TTableName; import org.apache.impala.util.AvroSchemaConverter; @@ -271,8 +272,8 @@ public class CreateTableStmt extends StatementBase { } if (getFileFormat() == THdfsFileFormat.ICEBERG) { - analyzeIcebergFormat(analyzer); analyzeIcebergColumns(); + analyzeIcebergFormat(analyzer); } else { List iceSpec = tableDef_.getIcebergPartitionSpecs(); if (iceSpec != null && !iceSpec.isEmpty()) { @@ -703,10 +704,32 @@ public class CreateTableStmt extends StatementBase { * Iceberg field */ private void analyzeIcebergColumns() { +if (!getPartitionColumnDefs().isEmpty()) { + createIcebergPartitionSpecFromPartitionColumns(); +} for (ColumnDef def : getColumnDefs()) { if (!def.isNullabilitySet()) { def.setNullable(true); } } } + + /** + * Creates Iceberg partition spec from partition columns. Needed to support old-style + * CREATE TABLE .. PARTITIONED BY () syntax. In this case the column list in + * 'cols' is appended to the table-level columns, but also Iceberg-level IDENTITY + * partitions are created from this list. 
+ */ + private void createIcebergPartitionSpecFromPartitionColumns() { +Preconditions.checkState(!getPartitionColumnDefs().isEmpty()); +Preconditions.checkState(getIcebergPartitionSpecs().isEmpty()); +List partFields = new ArrayList<>(); +for (ColumnDef colDef : getPartitionColumnDefs()) { + partFields.add(new IcebergPartitionField(colDef.getColName(), + new IcebergPartitionTransform(TIcebergPartitionTransformType.IDENTITY))); +} +getIcebergPartitionSpecs().add(new IcebergPartitionSpec(partFields)); +getColumnDefs().addAll(getPartitionColumnDefs()); +getPartitionColumnDefs().clear(); + } } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test index 4b7daf9..eba73d8 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test @@ -445,3 +445,40 @@ DESCRIBE iceberg_nullable_test; TYPES STRING,STRING,STRING,STRING + QUERY +CREATE TABLE iceberg_old_style_partitions ( + register_time DATE, + message STRING, + price DECIMAL(8,1), + map_test MAP >, + struct_test STRUCT +) +PARTITIONED BY ( + level STRING, + event_id INT +) +STORED AS ICEBERG; + RESULTS +'Table has been created.' + + QUERY +DESCRIBE iceberg_old_style_partitions; + RESULTS +'register_time','date','','true' +'message','string','','true'
[impala] 01/02: IMPALA-10404: Update docs to reflect RLE_DICTIONARY support
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit f4584dd2763edda9a24e4466b2d0a8f4bb065437 Author: Tim Armstrong AuthorDate: Mon Jan 25 14:41:53 2021 -0800 IMPALA-10404: Update docs to reflect RLE_DICTIONARY support Fix references to PLAIN_DICTIONARY to reflect that RLE_DICTIONARY is supported too. Change-Id: Iee98abfd760396cf43302c9077c6165eb3623335 Reviewed-on: http://gerrit.cloudera.org:8080/16982 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- docs/shared/impala_common.xml | 5 +++-- docs/topics/impala_parquet_dictionary_filtering.xml | 8 +--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml index 6b0e812..f18ca00 100644 --- a/docs/shared/impala_common.xml +++ b/docs/shared/impala_common.xml @@ -3175,8 +3175,9 @@ flight_num: INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301 Impala can query Parquet files that use the PLAIN, -PLAIN_DICTIONARY, BIT_PACKED, and RLE -encodings. Currently, Impala does not support RLE_DICTIONARY encoding. +PLAIN_DICTIONARY, BIT_PACKED, RLE +and RLE_DICTIONARY encodings. RLE_DICTIONARY is supported +only in and up. When creating files outside of Impala for use by Impala, make sure to use one of the supported encodings. In particular, for MapReduce jobs, parquet.writer.version must not be defined (especially as diff --git a/docs/topics/impala_parquet_dictionary_filtering.xml b/docs/topics/impala_parquet_dictionary_filtering.xml index 3460f2a..2d68b4d 100644 --- a/docs/topics/impala_parquet_dictionary_filtering.xml +++ b/docs/topics/impala_parquet_dictionary_filtering.xml @@ -58,7 +58,8 @@ under the License. If the encoding_stats is in the Parquet file, dictionary filtering uses it to determine if there are only dictionary encoded pages (i.e. 
there are no - data pages with an encoding other than PLAIN_DICTIONARY). + data pages with an encoding other than RLE_DICTIONARY or + PLAIN_DICTIONARY). @@ -66,11 +67,12 @@ under the License. The column is purely dictionary encoded if both of the conditions satisfy: - PLAIN_DICTIONARY is present. + PLAIN_DICTIONARY or RLE_DICTIONARY is present. - Only PLAIN_DICTIONARY, RLE, or BIT_PACKED encodings are listed. + Only PLAIN_DICTIONARY, RLE_DICTIONARY, + RLE, or BIT_PACKED encodings are listed.
[impala] branch master updated (e8720b4 -> eb85c6e)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from e8720b4 IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions new 3b763b5 IMPALA-10447: Add a newline when exporting shell output to a file. new eb85c6e IMPALA-9793: Impala quickstart cluster with docker-compose The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docker/CMakeLists.txt | 36 +- docker/README.md | 144 ++- docs/build-doc.sh => docker/docker-build.sh| 29 +- docker/impala_base/Dockerfile | 10 + docker/quickstart-kudu-minimal.yml | 128 ++ .../quickstart-load-data.yml | 34 +- docker/quickstart.yml | 104 ++ docker/quickstart_client/Dockerfile| 70 ++ docker/quickstart_client/data-load-entrypoint.sh | 86 ++ .../quickstart_client/load_tpcds_kudu.sql | 316 +++-- docker/quickstart_client/load_tpcds_parquet.sql| 1248 docker/quickstart_conf/hive-site.xml | 74 ++ docker/quickstart_hms/Dockerfile | 67 ++ docker/quickstart_hms/hms-entrypoint.sh| 68 ++ shell/shell_output.py |1 + tests/shell/test_shell_commandline.py | 28 + 16 files changed, 2250 insertions(+), 193 deletions(-) copy docs/build-doc.sh => docker/docker-build.sh (54%) create mode 100644 docker/quickstart-kudu-minimal.yml copy be/src/catalog/CMakeLists.txt => docker/quickstart-load-data.yml (58%) create mode 100644 docker/quickstart.yml create mode 100644 docker/quickstart_client/Dockerfile create mode 100755 docker/quickstart_client/data-load-entrypoint.sh copy testdata/datasets/tpcds/tpcds_kudu_template.sql => docker/quickstart_client/load_tpcds_kudu.sql (68%) create mode 100644 docker/quickstart_client/load_tpcds_parquet.sql create mode 100644 docker/quickstart_conf/hive-site.xml create mode 100644 
docker/quickstart_hms/Dockerfile create mode 100755 docker/quickstart_hms/hms-entrypoint.sh
[impala] 01/02: IMPALA-10447: Add a newline when exporting shell output to a file.
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 3b763b5c3235ebdb445ad0a4ae1bf79385e8df02 Author: Andrew Sherman AuthorDate: Mon Jan 18 19:10:11 2021 -0800 IMPALA-10447: Add a newline when exporting shell output to a file. Impala shell outputs a batch of rows using OutputStream. Inside OutputStream, output to a file is handled slightly differently from output that is written to stdout. When writing to stdout we use print() (which appends a newline) while when writing to a file we use write() (which adds nothing). This difference was introduced in IMPALA-3343 so this bug may be a regression introduced then. To ensure that output is the same in either case we need to add a newline after writing each batch of rows to a file. TESTING: Added a new test for this case. Change-Id: I078a06c54e0834bc1f898626afbfff4ded579fa9 Reviewed-on: http://gerrit.cloudera.org:8080/16966 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- shell/shell_output.py | 1 + tests/shell/test_shell_commandline.py | 28 2 files changed, 29 insertions(+) diff --git a/shell/shell_output.py b/shell/shell_output.py index 31d91a0..bfb418c 100644 --- a/shell/shell_output.py +++ b/shell/shell_output.py @@ -117,6 +117,7 @@ class OutputStream(object): # Note that instances of this class do not persist, so it's fine to # close the we close the file handle after each write. 
out_file.write(formatted_data.encode('utf-8')) # file opened in binary mode + out_file.write(b'\n') except IOError as err: file_err_msg = "Error opening file %s: %s" % (self.filename, str(err)) print('{0} (falling back to stderr)'.format(file_err_msg), file=sys.stderr) diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py index 666162f..30cd85d 100644 --- a/tests/shell/test_shell_commandline.py +++ b/tests/shell/test_shell_commandline.py @@ -105,6 +105,16 @@ def populated_table(empty_table, request): return fq_table_name +@pytest.yield_fixture +def tmp_file(): + """ + Test fixture which manages a temporary file + """ + _, tmp_file = tempfile.mkstemp() + yield tmp_file + os.remove(tmp_file) + + class TestImpalaShell(ImpalaTestSuite): """A set of sanity tests for the Impala shell commandline parameters. @@ -1071,3 +1081,21 @@ class TestImpalaShell(ImpalaTestSuite): expected_result = """anonymous\tanonymous\n""" assert result.stdout == expected_result assert result.stderr == "" + + def test_output_file(self, vector, tmp_file): +"""Test that writing output to a file using '--output_file' produces the same output +as is written to stdout.""" +row_count = 6000 # Should be > 2048 to tickle IMPALA-10447. +query = "select * from tpcds.item order by i_item_sk limit %d" % row_count +# Run the query normally and keep the stdout. +output = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;']) +assert "Fetched %d row(s)" % row_count in output.stderr +rows_from_stdout = output.stdout.strip().split('\n') +# Run the query with output sent to a file using '--output_file'. +result = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;', + '--output_file=%s' % tmp_file]) +assert "Fetched %d row(s)" % row_count in result.stderr +# Check that the output from the file is the same as that written to stdout. 
+with open(tmp_file, "r") as f: + rows_from_file = [line.rstrip() for line in f] + assert rows_from_stdout == rows_from_file
[impala] 02/02: IMPALA-9793: Impala quickstart cluster with docker-compose
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit eb85c6eeca77c748a01d304625ea5d608a3e12c0 Author: Tim Armstrong AuthorDate: Sat May 9 21:27:41 2020 -0700 IMPALA-9793: Impala quickstart cluster with docker-compose What works: * A single node cluster can be started up with docker-compose * HMS data is stored in Derby database in a docker volume * Filesystem data is stored in a shared docker volume, using the localfs support in the Hadoop client. * A Kudu cluster with a single master can be optionally added on to the Impala cluster. * TPC-DS data can be loaded automatically by a data loading container. We need to set up a docker network called quickstart-network, purely because docker-compose insists on generating network names with underscores, which are part of the FQDN and end up causing problems with Java's URL parsing, which rejects these technically invalid domain names. How to run: Instructions for running the quickstart cluster are in docker/README.md. How to build containers: ./buildall.sh -release -noclean -notests -ninja ninja quickstart_hms_image quickstart_client_image docker_images How to upload containers to dockerhub: IMPALA_QUICKSTART_IMAGE_PREFIX=timgarmstrong/ for i in impalad_coord_exec impalad_coordinator statestored \ impalad_executor catalogd impala_quickstart_client \ impala_quickstart_hms do docker tag $i ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i docker push ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i done I pushed containers built from commit f260cce22, which was branched from 6cb7cecacf on master. Misc other stuff: * Added more metadata to all images. TODO: * Test and instructions to run against Kudu quickstart * Upload latest version of containers before merging. 
Change-Id: Ifc0b862af40a368381ada7ec2a355fe4b0aa778c Reviewed-on: http://gerrit.cloudera.org:8080/15966 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- docker/CMakeLists.txt| 36 +- docker/README.md | 144 ++- docker/docker-build.sh | 39 + docker/impala_base/Dockerfile| 10 + docker/quickstart-kudu-minimal.yml | 128 +++ docker/quickstart-load-data.yml | 38 + docker/quickstart.yml| 104 ++ docker/quickstart_client/Dockerfile | 70 ++ docker/quickstart_client/data-load-entrypoint.sh | 86 ++ docker/quickstart_client/load_tpcds_kudu.sql | 877 +++ docker/quickstart_client/load_tpcds_parquet.sql | 1248 ++ docker/quickstart_conf/hive-site.xml | 74 ++ docker/quickstart_hms/Dockerfile | 67 ++ docker/quickstart_hms/hms-entrypoint.sh | 68 ++ 14 files changed, 2985 insertions(+), 4 deletions(-) diff --git a/docker/CMakeLists.txt b/docker/CMakeLists.txt index 60fd8c2..7fe085b 100644 --- a/docker/CMakeLists.txt +++ b/docker/CMakeLists.txt @@ -19,6 +19,8 @@ set(IMPALA_BASE_BUILD_CONTEXT_DIR ${CMAKE_SOURCE_DIR}/docker/build_context ) +set(DOCKER_BUILD ${CMAKE_SOURCE_DIR}/docker/docker-build.sh) + find_program(LSB_RELEASE_EXEC lsb_release) execute_process(COMMAND ${LSB_RELEASE_EXEC} -is OUTPUT_VARIABLE LSB_RELEASE_ID @@ -38,6 +40,7 @@ else() endif() MESSAGE(STATUS "Picked docker base image based on host OS: ${DISTRO_BASE_IMAGE}") + if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # Add a target to build a base docker image for 'build_type'. 'build_context_args' are # passed to the setup_build_context.py script. @@ -58,7 +61,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # sent to the docker daemon. This allows the Dockerfile built to copy all necessary # dependencies. COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/impala_base/ . 
| - docker build -t impala_base_${build_type} + ${DOCKER_BUILD} -t impala_base_${build_type} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} - WORKING_DIRECTORY ${IMPALA_BASE_BUILD_CONTEXT_DIR}/${build_type} DEPENDS impala_base_build_context_${build_type} ${CMAKE_SOURCE_DIR}/docker/impala_base/Dockerfile @@ -88,7 +91,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # build context used for the base image is used for each daemon image. This allows # each daemon image to only copy in the dependencies it requires. COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/${daemon_name}/ . | - docker build --build-arg BASE_IMAGE=impala_base_${build_type} + ${DOCKER_BUILD}