[impala] branch master updated: IMPALA-4805: Avoid hash exchange before analytic function if appropriate
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 4721978 IMPALA-4805: Avoid hash exchange before analytic function if appropriate 4721978 is described below commit 4721978e8fb6d80a9f023e568b983b12b14f8acc Author: Aman Sinha AuthorDate: Thu Feb 11 14:44:08 2021 -0800 IMPALA-4805: Avoid hash exchange before analytic function if appropriate This patch avoids adding a hash exchange below an analytic function that has partition by b as long as the child can satisfy that requirement through an equivalence relationship .. i.e an exact match is not required. For example: select count(*) over (partition by b) from t1, t2 where a = b In this case, the analytic sort has a required partitioning on b but the child is an inner join whose output partition key could be either 'a' or 'b' (it happens to be 'a' given how the data partition was populated), then we should still be able to use the child's partitioning without adding a hash exchange. Note that for outer joins the logic is slightly different. Testing: - Added a new planner test with analytic function + inner join (outer join test case already existed before). 
Change-Id: Icb6289d1e70cfb6bbd5b38eedb00856dbc85ac77 Reviewed-on: http://gerrit.cloudera.org:8080/16888 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../apache/impala/planner/DistributedPlanner.java | 21 +++- .../queries/PlannerTest/analytic-fns.test | 38 ++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java index 8a75f9d..46242e9 100644 --- a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java @@ -1085,7 +1085,26 @@ public class DistributedPlanner { childFragment.getPlanRoot().getOutputSmap(), ctx_.getRootAnalyzer()); // Make sure the childFragment's output is partitioned as required by the sortNode. DataPartition sortPartition = sortNode.getInputPartition(); - if (!childFragment.getDataPartition().equals(sortPartition)) { + boolean hasNullableTupleIds = childFragment.getPlanRoot(). + getNullableTupleIds().size() > 0; + boolean hasCompatiblePartition = false; + if (hasNullableTupleIds) { +// If the input stream has nullable tuple ids (produced from the nullable +// side of an outer join), do an exact equality comparison of the child's +// data partition with the required partition) since a hash exchange is +// required to co-locate all the null values produced from the outer join +// (these tuples may not be originally null but became null after OJ). +hasCompatiblePartition = childFragment.getDataPartition().equals(sortPartition); + } else { +// Otherwise, a mutual value transfer is sufficient for compatible partitions. +// E.g if analytic fragment's required partition key is t2.a2 and the child is +// an inner join with t1.a1 = t2.a2, either a1 or a2 are sufficient to satisfy +// the required partitioning. 
+hasCompatiblePartition = ctx_.getRootAnalyzer().setsHaveValueTransfer( +childFragment.getDataPartition().getPartitionExprs(), +sortPartition.getPartitionExprs(), true); + } + if (!hasCompatiblePartition) { if (sortNode.isTypeTopN() || sortNode.isPartitionedTopN()) { lowerTopN = sortNode; childFragment.addPlanRoot(lowerTopN); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test index 5253b09..d74fd9e 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test @@ -3343,3 +3343,41 @@ PLAN-ROOT SINK HDFS partitions=24/24 files=24 size=478.45KB row-size=0B cardinality=7.30K +# IMPALA-4805: Avoid hash exchange below analytic operator since join +# output is already hash partitioned +select [straight_join] count(*) over (partition by t2.id) +from functional.alltypes t1 +inner join [shuffle] functional.alltypes t2 + on t1.id = t2.id + DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +07:EXCHANGE [UNPARTITIONED] +| +04:ANALYTIC +| functions: count(*) +| partition by: t2.id +| row-size=16B cardinality=7.30K +| +03:SORT +| order by: id ASC NULLS LAST +| row-size=8B cardinality=7.30K +| +02:HASH JOIN [INNER JOIN, PARTITIONED] +| hash predicates:
[impala] branch master updated: IMPALA-10470: Add link to quickstart container from README
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 24a3f00 IMPALA-10470: Add link to quickstart container from README 24a3f00 is described below commit 24a3f007710c38f71ccd8f7f2412a387708771f9 Author: Tim Armstrong AuthorDate: Fri May 1 16:49:49 2020 -0700 IMPALA-10470: Add link to quickstart container from README Change-Id: If76376557efffe9c7b8e02a5b840e128b343a74e Reviewed-on: http://gerrit.cloudera.org:8080/17058 Reviewed-by: Joe McDonnell Tested-by: Tim Armstrong --- README.md | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5adf7c0..44e3d69 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,19 @@ you analyze, transform and combine data from a variety of data sources: ## More about Impala -To learn more about Impala as a business user, or to try Impala live or in a VM, please +The fastest way to try out Impala is a [quickstart Docker container]( +https://github.com/apache/impala/blob/master/docker/README.md#docker-quickstart-with-docker-compose). +You can try out running queries and processing data sets in Impala on a single machine +without installing dependencies. It can automatically load test data sets into Apache Kudu +and Apache Parquet formats and you can start playing around with Apache Impala SQL +within minutes. + +To learn more about Impala as a user or administrator, or to try Impala, please visit the [Impala homepage](https://impala.apache.org). Detailed documentation for administrators and users is available at [Apache Impala documentation](https://impala.apache.org/impala-docs.html). + If you are interested in contributing to Impala as a developer, or learning more about Impala's internals and architecture, visit the [Impala wiki](https://cwiki.apache.org/confluence/display/IMPALA/Impala+Home).
[impala] branch master updated (1f7b413 -> 79bee3b)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 1f7b413 IMPALA-8721: re-enable test_hive_impala_interop new f888d36 IMPALA-10397 : Reduce flakiness in test_single_workload new 79bee3b IMPALA-10469: push quickstart to apache repo The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docker/CMakeLists.txt | 13 +++-- docker/README.md | 26 +++--- docker/publish_images_to_apache.sh| 80 +++ docker/quickstart-load-data.yml | 2 +- docker/quickstart.yml | 8 ++-- tests/custom_cluster/test_auto_scaling.py | 25 ++ 6 files changed, 131 insertions(+), 23 deletions(-) create mode 100755 docker/publish_images_to_apache.sh
[impala] 02/02: IMPALA-10469: push quickstart to apache repo
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 79bee3befbc6cdcd358373822a0a3b4d19ab5ce0 Author: Tim Armstrong AuthorDate: Fri Feb 5 15:35:30 2021 -0800 IMPALA-10469: push quickstart to apache repo This adds a script, docker/publish_images_to_apache.sh, that allows uploading images to the apache/impala docker hub repo, prefixed with a version string. E.g. with the following commands: ninja docker_images quickstart_docker_images ./docker/publish_images_to_apache.sh -v 81d5377c2 The uploaded images can then be used for the quickstart cluster, as documented in docker/README. Updated docs for quickstart to use a prefix from apache/impala Remove IMPALA_QUICKSTART_VERSION, which doesn't interact well with the tagging since the image name and version are now encoded in the tag. Fix an incorrect image name added to docker-images.txt: impala_profile_tool_image. Testing: Ran Impala quickstart with data loading using instructions in README. 
export IMPALA_QUICKSTART_IMAGE_PREFIX="apache/impala:81d5377c2-" docker network create -d bridge quickstart-network export QUICKSTART_IP=$(docker network inspect quickstart-network -f '{{(index .IPAM.Config 0).Gateway}}') export QUICKSTART_LISTEN_ADDR=$QUICKSTART_IP docker-compose -f docker/quickstart.yml \ -f docker/quickstart-kudu-minimal.yml \ -f docker/quickstart-load-data.yml up -d docker run --network=quickstart-network -it \ ${IMPALA_QUICKSTART_IMAGE_PREFIX}impala_quickstart_client impala-shell Change-Id: I535d77e565b73d732ae511d7525193467086c76a Reviewed-on: http://gerrit.cloudera.org:8080/17030 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- docker/CMakeLists.txt | 13 +-- docker/README.md | 26 ++--- docker/publish_images_to_apache.sh | 80 ++ docker/quickstart-load-data.yml| 2 +- docker/quickstart.yml | 8 ++-- 5 files changed, 115 insertions(+), 14 deletions(-) diff --git a/docker/CMakeLists.txt b/docker/CMakeLists.txt index 23efee8..9b45a11a 100644 --- a/docker/CMakeLists.txt +++ b/docker/CMakeLists.txt @@ -82,6 +82,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # instantiated below. add_custom_target(docker_images) add_custom_target(docker_debug_images) + add_custom_target(quickstart_docker_images) set(exported_image_names "") @@ -129,6 +130,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") add_daemon_docker_images(admissiond) # HMS quickstart image, which requires Hive and Hadoop builds. + set(QUICKSTART_HMS_IMAGE impala_quickstart_hms) set(quickstart_hms_build_dir ${CMAKE_SOURCE_DIR}/docker/quickstart_hms) add_custom_target(quickstart_hms_build_setup COMMAND rm -f ${quickstart_hms_build_dir}/hive ${quickstart_hms_build_dir}/hadoop @@ -139,23 +141,28 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # Supply the appropriate base image as an argument for the Dockerfile. # Use tar with -h flag to assemble a tarball including all the symlinked files and # directories in the build context. -COMMAND tar cvh . 
-C ${quickstart_hms_build_dir} . | ${DOCKER_BUILD} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} -t impala_quickstart_hms - +COMMAND tar cvh . -C ${quickstart_hms_build_dir} . | ${DOCKER_BUILD} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} -t ${QUICKSTART_HMS_IMAGE} - DEPENDS ${quickstart_hms_build_dir}/Dockerfile quickstart_hms_build_setup COMMENT "Building quickstart HMS docker image." VERBATIM ) + ADD_DEPENDENCIES(quickstart_docker_images quickstart_hms_image) + set(exported_image_names "${exported_image_names} ${QUICKSTART_HMS_IMAGE}") # Client quickstart image, which only requires some scripts. + set(QUICKSTART_CLIENT_IMAGE impala_quickstart_client) set(quickstart_client_build_dir ${CMAKE_SOURCE_DIR}/docker/quickstart_client) add_custom_target(quickstart_client_image # Supply the appropriate base image as an argument for the Dockerfile. # Use tar with -h flag to assemble a tarball including all the symlinked files and # directories in the build context. -COMMAND tar cvh . -C ${quickstart_client_build_dir} . | ${DOCKER_BUILD} ${COMMON_DOCKER_BUILD_ARGS} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} -t impala_quickstart_client - +COMMAND tar cvh . -C ${quickstart_client_build_dir} . | ${DOCKER_BUILD} ${COMMON_DOCKER_BUILD_ARGS} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} -t ${QUICKSTART_CLIENT_IMAGE} - DEPENDS ${quickstart_client_build_dir}/Dockerfile ${quickstart_clien
[impala] 01/02: IMPALA-10397 : Reduce flakiness in test_single_workload
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit f888d362951454b273114c98686193498c0d3fe0 Author: Bikramjeet Vig AuthorDate: Fri Feb 5 13:40:07 2021 -0800 IMPALA-10397 : Reduce flakiness in test_single_workload This test failed recently due to a timeout waiting for executors to come up. The logs showed that the executors came up on time but it was not recognized by the coordinator. This patch attempts to reduce flakiness by increasing the timeout and adding more logging in case this happens in the future. Testing: Ran in a loop on my local for a few hours. Change-Id: I73ea5eb663db6d03832b19ed323670590946f514 Reviewed-on: http://gerrit.cloudera.org:8080/17028 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- tests/custom_cluster/test_auto_scaling.py | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/custom_cluster/test_auto_scaling.py b/tests/custom_cluster/test_auto_scaling.py index 6f999b0..bbb7dd0 100644 --- a/tests/custom_cluster/test_auto_scaling.py +++ b/tests/custom_cluster/test_auto_scaling.py @@ -27,7 +27,7 @@ from tests.common.custom_cluster_test_suite import CustomClusterTestSuite from tests.common.skip import SkipIfEC LOG = logging.getLogger("test_auto_scaling") - +TOTAL_BACKENDS_METRIC_NAME = "cluster-membership.backends.total" class TestAutoScaling(CustomClusterTestSuite): @classmethod @@ -43,7 +43,7 @@ class TestAutoScaling(CustomClusterTestSuite): """This class contains tests that exercise the logic related to scaling clusters up and down by adding and removing groups of executors.""" INITIAL_STARTUP_TIME_S = 10 - STATE_CHANGE_TIMEOUT_S = 45 + STATE_CHANGE_TIMEOUT_S = 60 # This query will scan two partitions (month = 1, 2) and thus will have 1 fragment # instance per executor on groups of size 2. 
Each partition has 2 rows, so it performs # two comparisons and should take around 1 second to complete. @@ -51,13 +51,20 @@ class TestAutoScaling(CustomClusterTestSuite): and id + random() < sleep(500)""" def _get_total_admitted_queries(self): -return self.impalad_test_service.get_total_admitted_queries("default-pool") +admitted_queries = self.impalad_test_service.get_total_admitted_queries( + "default-pool") +LOG.info("Current total admitted queries: %s", admitted_queries) +return admitted_queries def _get_num_backends(self): -return self.impalad_test_service.get_metric_value("cluster-membership.backends.total") +metric_val = self.impalad_test_service.get_metric_value(TOTAL_BACKENDS_METRIC_NAME) +LOG.info("Getting metric %s : %s", TOTAL_BACKENDS_METRIC_NAME, metric_val) +return metric_val def _get_num_running_queries(self): -return self.impalad_test_service.get_num_running_queries("default-pool") +running_queries = self.impalad_test_service.get_num_running_queries("default-pool") +LOG.info("Current running queries: %s", running_queries) +return running_queries @SkipIfEC.fix_later def test_single_workload(self): @@ -124,7 +131,7 @@ class TestAutoScaling(CustomClusterTestSuite): # Wait for workers to spin down self.impalad_test_service.wait_for_metric_value( -"cluster-membership.backends.total", 1, +TOTAL_BACKENDS_METRIC_NAME, 1, timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1) assert self.impalad_test_service.get_metric_value( "cluster-membership.executor-groups.total") == 0 @@ -155,7 +162,7 @@ class TestAutoScaling(CustomClusterTestSuite): # Wait for workers to spin up cluster_size = GROUP_SIZE + 1 # +1 to include coordinator. 
self.impalad_test_service.wait_for_metric_value( -"cluster-membership.backends.total", cluster_size, +TOTAL_BACKENDS_METRIC_NAME, cluster_size, timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1) # Wait until we admitted at least 10 queries @@ -184,7 +191,7 @@ class TestAutoScaling(CustomClusterTestSuite): # Wait for workers to spin down self.impalad_test_service.wait_for_metric_value( -"cluster-membership.backends.total", 1, +TOTAL_BACKENDS_METRIC_NAME, 1, timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1) assert self.impalad_test_service.get_metric_value( "cluster-membership.executor-groups.total") == 0 @@ -242,7 +249,7 @@ class TestAutoScaling(CustomClusterTestSuite): # Wait for workers to spin down self.impalad_test_service.wait_for_metric_value( -"cluster-membership.backends.total", 1
[impala] 02/03: IMPALA-9586: update query option docs for mt_dop
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 8551805875fcd2c701d988df887c1173520fca12 Author: Tim Armstrong AuthorDate: Mon Feb 8 20:44:16 2021 -0800 IMPALA-9586: update query option docs for mt_dop There are interactions between mt_dop and num_nodes and num_scanner_threads. Mention these in the docs. Change-Id: I3d9a6f56ffaf211d7d3ca1fad506ee83d516ccbd Reviewed-on: http://gerrit.cloudera.org:8080/17043 Tested-by: Impala Public Jenkins Reviewed-by: Joe McDonnell --- docs/topics/impala_num_nodes.xml | 5 + docs/topics/impala_num_scanner_threads.xml | 4 2 files changed, 9 insertions(+) diff --git a/docs/topics/impala_num_nodes.xml b/docs/topics/impala_num_nodes.xml index f885124..0bbf782 100644 --- a/docs/topics/impala_num_nodes.xml +++ b/docs/topics/impala_num_nodes.xml @@ -55,6 +55,11 @@ under the License. + Setting NUM_NODES to 1 disables multithreading, i.e. if + MT_DOP is greater than 1, it is effectively reduced to 1. + + + If you are diagnosing a problem that you suspect is due to a timing issue due to distributed query processing, you can set NUM_NODES=1 to verify if the problem still occurs when all the work is done on a single node. diff --git a/docs/topics/impala_num_scanner_threads.xml b/docs/topics/impala_num_scanner_threads.xml index b7a47ae..dd111da 100644 --- a/docs/topics/impala_num_scanner_threads.xml +++ b/docs/topics/impala_num_scanner_threads.xml @@ -42,6 +42,10 @@ under the License. + Has no effect if multi-threaded query execution is enabled, i.e. + MT_DOP is greater than 0. + + Type: numeric
[impala] branch master updated (81d5377 -> 1f7b413)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 81d5377 IMPALA-10475: [DOCS] elaborate SYNC_DDL option new 701714b IMPALA-10379: Add missing HiveLexer classes to shared-deps new 8551805 IMPALA-9586: update query option docs for mt_dop new 1f7b413 IMPALA-8721: re-enable test_hive_impala_interop The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/topics/impala_num_nodes.xml| 5 + docs/topics/impala_num_scanner_threads.xml | 4 java/shaded-deps/hive-exec/pom.xml | 3 +++ testdata/workloads/functional-query/queries/QueryTest/misc.test | 9 + tests/custom_cluster/test_hive_parquet_codec_interop.py | 6 ++ 5 files changed, 23 insertions(+), 4 deletions(-)
[impala] 01/03: IMPALA-10379: Add missing HiveLexer classes to shared-deps
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 701714b10a77aee62cf2ad3e25db9e2dfd418780 Author: Tamas Mate AuthorDate: Mon Feb 1 10:41:40 2021 +0100 IMPALA-10379: Add missing HiveLexer classes to shared-deps HIVE-19064 introduced additional lexer classes that are required during runtime. This commit adds the missing HiveLexer lexer classes to the shared-deps. Without these classes queries such as 'select 1 as "``"' would fail with 'NoClassDefFoundError'. Testing: - added a misc.test to verify that the classes are available and that IMPALA-9641 is fixed by HIVE-19064 Change-Id: I6e3a00335983f26498c1130ab9f109f6e67256f5 Reviewed-on: http://gerrit.cloudera.org:8080/17019 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- java/shaded-deps/hive-exec/pom.xml | 3 +++ testdata/workloads/functional-query/queries/QueryTest/misc.test | 9 + 2 files changed, 12 insertions(+) diff --git a/java/shaded-deps/hive-exec/pom.xml b/java/shaded-deps/hive-exec/pom.xml index 6c6d070..639abc6 100644 --- a/java/shaded-deps/hive-exec/pom.xml +++ b/java/shaded-deps/hive-exec/pom.xml @@ -97,6 +97,9 @@ the same dependencies org/apache/hadoop/hive/ql/exec/FunctionUtils* org/apache/hadoop/hive/ql/parse/GenericHiveLexer* org/apache/hadoop/hive/ql/parse/HiveLexer* + org/apache/hadoop/hive/ql/parse/ANTLRNoCaseStringStream* +org/apache/hadoop/hive/ql/parse/ParseError* +org/apache/hadoop/hive/ql/parse/Quotation* org/apache/hadoop/hive/ql/udf/**/* diff --git a/testdata/workloads/functional-query/queries/QueryTest/misc.test b/testdata/workloads/functional-query/queries/QueryTest/misc.test index 903101c..2f48788 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/misc.test +++ b/testdata/workloads/functional-query/queries/QueryTest/misc.test @@ -109,6 +109,15 @@ SELECT "quote \"", 'quote \'' string, string QUERY +# IMPALA-9641: Empty 
double quotation should not cause infinite loop in the +# ImpalaD Frontend +SELECT 1 as "``" + RESULTS +1 + TYPES +int + + QUERY # Select from table that contains unsupported primitive types SELECT int_col, str_col, bigint_col from functional.unsupported_types RESULTS
[impala] 03/03: IMPALA-8721: re-enable test_hive_impala_interop
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 1f7b413d11321bd74aaa1a9ea9ed30e4d80d Author: Tim Armstrong AuthorDate: Mon Feb 8 20:18:24 2021 -0800 IMPALA-8721: re-enable test_hive_impala_interop The test now passes because HIVE-21290 was fixed. Revert "IMPALA-8689: test_hive_impala_interop failing with "Timeout >7200s"" This reverts commit 5d8c99ce74c45a7d04f11e1f252b346d654f02bf. Change-Id: I7e2beabd7082a45a0fc3b60d318cf698079768ff Reviewed-on: http://gerrit.cloudera.org:8080/17042 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- tests/custom_cluster/test_hive_parquet_codec_interop.py | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/custom_cluster/test_hive_parquet_codec_interop.py b/tests/custom_cluster/test_hive_parquet_codec_interop.py index e8dbefb..e1b43e2 100644 --- a/tests/custom_cluster/test_hive_parquet_codec_interop.py +++ b/tests/custom_cluster/test_hive_parquet_codec_interop.py @@ -58,11 +58,9 @@ class TestParquetInterop(CustomClusterTestSuite): def test_hive_impala_interop(self, vector, unique_database, cluster_properties): # Setup source table. source_table = "{0}.{1}".format(unique_database, "t1_source") -# TODO: Once IMPALA-8721 is fixed add coverage for TimeStamp data type. self.execute_query_expect_success(self.client, -"create table {0} as select id, bool_col, tinyint_col, smallint_col, int_col, " -"bigint_col, float_col, double_col, date_string_col, string_col, year, month " -"from functional_parquet.alltypes".format(source_table)) +"create table {0} as select * from functional_parquet.alltypes" +.format(source_table)) self.execute_query_expect_success(self.client, "insert into {0}(id) values (), (), (), (1), (2), (3)" .format(source_table))
[impala] 01/02: IMPALA-10223: Implement INSERT OVERWRITE for Iceberg tables
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a3f441193d25c6ca721d59ba88129b643b8ad69f Author: Zoltan Borok-Nagy AuthorDate: Mon Feb 1 11:44:00 2021 +0100 IMPALA-10223: Implement INSERT OVERWRITE for Iceberg tables This patch adds support for INSERT OVERWRITE statements for Iceberg tables. We use Iceberg's ReplacePartitions interface for this. This interface provides consistent behavior with INSERT OVERWRITEs against regular tables. It's also consistent with other engines dynamic overwrites, e.g. Spark. INSERT OVERWRITE for partitioned tables replaces the partitions affected by the INSERT, while keeping the other partitions untouched. INSERT OVERWRITE is prohibited for tables that use the BUCKET partition transform because it would randomly overwrite table data. Testing * added e2e test Change-Id: Idf4acfb54cf62a3f3b2e8db9d04044580151299c Reviewed-on: http://gerrit.cloudera.org:8080/17012 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/client-request-state.cc | 1 + common/thrift/CatalogService.thrift| 3 + .../org/apache/impala/analysis/InsertStmt.java | 17 ++- .../impala/service/IcebergCatalogOpExecutor.java | 54 ++- .../queries/QueryTest/iceberg-negative.test| 7 +- .../queries/QueryTest/iceberg-overwrite.test | 168 + tests/query_test/test_iceberg.py | 4 + 7 files changed, 245 insertions(+), 9 deletions(-) diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc index de75e79..7f455a3 100644 --- a/be/src/service/client-request-state.cc +++ b/be/src/service/client-request-state.cc @@ -1298,6 +1298,7 @@ Status ClientRequestState::UpdateCatalog() { TIcebergOperationParam& ice_op = catalog_update.iceberg_operation; ice_op.__set_spec_id(finalize_params.spec_id); ice_op.__set_iceberg_data_files_fb(createIcebergDataFilesVector(*dml_exec_state)); 
+ice_op.__set_is_overwrite(finalize_params.is_overwrite); } Status cnxn_status; diff --git a/common/thrift/CatalogService.thrift b/common/thrift/CatalogService.thrift index 606d07c..b314369 100644 --- a/common/thrift/CatalogService.thrift +++ b/common/thrift/CatalogService.thrift @@ -201,6 +201,9 @@ struct TIcebergOperationParam { // Iceberg data files to append to the table, encoded in FlatBuffers. 2: required list iceberg_data_files_fb; + + // Is overwrite operation + 3: required bool is_overwrite = false; } // Updates the metastore with new partition information and returns a response diff --git a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java index 0b796d8..c880697 100644 --- a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java @@ -549,9 +549,7 @@ public class InsertStmt extends StatementBase { } if (table_ instanceof FeIcebergTable) { - if (overwrite_) { -throw new AnalysisException("INSERT OVERWRITE not supported for Iceberg tables."); - } + if (overwrite_) validateNoBucketTransform((FeIcebergTable)table_); validateIcebergColumnsForInsert((FeIcebergTable)table_); } @@ -574,6 +572,19 @@ public class InsertStmt extends StatementBase { } } + private void validateNoBucketTransform(FeIcebergTable iceTable) + throws AnalysisException { +IcebergPartitionSpec spec = iceTable.getDefaultPartitionSpec(); +if (!spec.hasPartitionFields()) return; +for (IcebergPartitionField field : spec.getIcebergPartitionFields()) { + if (field.getTransformType() == TIcebergPartitionTransformType.BUCKET) { + throw new AnalysisException("The Iceberg table has BUCKET partitioning. " + + "This means the outcome of dynamic partition overwrite is unforeseeable. 
" + + "Consider using TRUNCATE and INSERT INTO to overwrite your table."); + } +} + } + private void analyzeWriteAccess() throws AnalysisException { if (!(table_ instanceof FeFsTable)) return; FeFsTable fsTable = (FeFsTable) table_; diff --git a/fe/src/main/java/org/apache/impala/service/IcebergCatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/IcebergCatalogOpExecutor.java index db44aef..c4d0259 100644 --- a/fe/src/main/java/org/apache/impala/service/IcebergCatalogOpExecutor.java +++ b/fe/src/main/java/org/apache/impala/service/IcebergCatalogOpExecutor.java @@ -28,6 +28,7 @@ import org.apache.iceberg.DataFiles; import org.apache.iceberg.DeleteFiles; import org.apache.iceberg.UpdateSchema; impo
[impala] branch master updated (0473e1b -> 81d5377)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 0473e1b IMPALA-10473: Fix wrong analytic results on constant partition/order by exprs new a3f4411 IMPALA-10223: Implement INSERT OVERWRITE for Iceberg tables new 81d5377 IMPALA-10475: [DOCS] elaborate SYNC_DDL option The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/service/client-request-state.cc | 1 + common/thrift/CatalogService.thrift| 3 + docs/topics/impala_sync_ddl.xml| 14 +- .../org/apache/impala/analysis/InsertStmt.java | 17 ++- .../impala/service/IcebergCatalogOpExecutor.java | 54 ++- .../queries/QueryTest/iceberg-negative.test| 7 +- .../queries/QueryTest/iceberg-overwrite.test | 168 + tests/query_test/test_iceberg.py | 4 + 8 files changed, 252 insertions(+), 16 deletions(-) create mode 100644 testdata/workloads/functional-query/queries/QueryTest/iceberg-overwrite.test
[impala] 02/02: IMPALA-10475: [DOCS] elaborate SYNC_DDL option
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 81d5377c27f1940235db332e43f1d0f073cf3d2f Author: Shajini Thayasingh AuthorDate: Fri Feb 5 10:43:53 2021 -0800 IMPALA-10475: [DOCS] elaborate SYNC_DDL option call out that SYNC_DDL applies to all filesystem-based tables Change-Id: I3f1bfce8430c681515101d00cabf9d70ae52e5ec Reviewed-on: http://gerrit.cloudera.org:8080/17027 Tested-by: Impala Public Jenkins Reviewed-by: Tim Armstrong --- docs/topics/impala_sync_ddl.xml | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/topics/impala_sync_ddl.xml b/docs/topics/impala_sync_ddl.xml index 7dd007b..793e9b4 100644 --- a/docs/topics/impala_sync_ddl.xml +++ b/docs/topics/impala_sync_ddl.xml @@ -46,13 +46,13 @@ under the License. you quickly switched to another node, such as by issuing a subsequent query through a load-balancing proxy.) - - Although INSERT is classified as a DML statement, when the SYNC_DDL option - is enabled, INSERT statements also delay their completion until all the underlying data and - metadata changes are propagated to all Impala nodes. Internally, Impala inserts have similarities with DDL - statements in traditional database systems, because they create metadata needed to track HDFS block locations - for new files and they potentially add new partitions to partitioned tables. - + Although INSERT is classified as a DML statement, when the +SYNC_DDL option is enabled, INSERT statements also delay + their completion until all the underlying data and metadata changes are propagated to all + Impala nodes and this option applies to all filesystem-based tables. Internally, Impala + inserts have similarities with DDL statements in traditional database systems, because they + create metadata needed to track HDFS block locations for new files and they potentially add + new partitions to partitioned tables. 
Because this option can introduce a delay after each write operation, if you are running a sequence of
[impala] branch master updated: IMPALA-10382: fix invalid outer join simplification
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 4ae847b IMPALA-10382: fix invalid outer join simplification 4ae847b is described below commit 4ae847bf94e0a1e07860c9c7aa3f0dfcdf548fac Author: xqhe AuthorDate: Tue Dec 8 16:46:13 2020 +0800 IMPALA-10382: fix invalid outer join simplification When set ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = true, the planner will simplify outer joins if the predicate with case expr or conditional function on both sides of outer join. However, the predicate maybe not null-rejecting, if simplify the outer join, the result is incorrect. E.g. t1.b > coalesce(t1.c, t2.c) can return true if t2.c is null, so it is not null-rejecting predicate for t2. The fix is simply to support the case that the predicate with two operands and the operator is one of (=, !=, >, <, >=, <=), 1. one of the operands or 2. if the operand is arithmetic expression and one of the children does not contain conditional builtin function or case expr and has tuple id in outer joined tuples. E.g. t1.b > coalesce(t2.c, t1.c) or t1.b + coalesce(t2.c, t1.c) > coalesce(t2.c, t1.c) is null-rejecting predicate for t1. 
Testing: * Add new plan tests in outer-to-inner-joins.test * Add new query tests to verify the correctness on transformation Change-Id: I84a3812f4212fa823f3d1ced6e12f2df05aedb2b Reviewed-on: http://gerrit.cloudera.org:8080/16845 Tested-by: Impala Public Jenkins Reviewed-by: Tim Armstrong --- .../java/org/apache/impala/analysis/Analyzer.java | 51 +-- .../queries/PlannerTest/outer-to-inner-joins.test | 156 + .../queries/QueryTest/outer-to-inner-joins.test| 27 3 files changed, 223 insertions(+), 11 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index 8a73e07..a4633c8 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -59,6 +59,7 @@ import org.apache.impala.catalog.TableLoadingException; import org.apache.impala.catalog.Type; import org.apache.impala.catalog.local.LocalKuduTable; import org.apache.impala.common.AnalysisException; +import org.apache.impala.common.Id; import org.apache.impala.common.IdGenerator; import org.apache.impala.common.ImpalaException; import org.apache.impala.common.InternalException; @@ -3326,25 +3327,53 @@ public class Analyzer { } // Simply assume that a conjunct contains a UDF, is distinct from/ is not distinct -// from operator or nondeterministic buitin functions, it is not null-rejecting -// predicate. +// from operator, nondeterministic buitin functions or is null operator, it is not +// null-rejecting predicate. if (e.contains(Predicates.or(Expr.IS_DISTINCT_FROM_OR_NOT_DISTINCT_PREDICATE, Expr.IS_NONDETERMINISTIC_BUILTIN_FN_PREDICATE, -Expr.IS_UDF_PREDICATE))) { +Expr.IS_UDF_PREDICATE, Expr.IS_IS_NULL_PREDICATE))) { return true; } -// For conditional function, is null expr or case expr, if the tuple id of the expr -// is a subset of 'tupleIds', the result may have null value +// Predicate contains conditional function, case expr may not null-rejecting. 
List maybeNullableExprs = new ArrayList<>(); e.collectAll(Predicates.or(Expr.IS_CONDITIONAL_BUILTIN_FN_PREDICATE, -Expr.IS_IS_NULL_PREDICATE, Expr.IS_CASE_EXPR_PREDICATE), maybeNullableExprs); -for (Expr expr : maybeNullableExprs) { - List tids = new ArrayList<>(); - expr.getIds(tids, null); - if (tupleIds.containsAll(tids)) { -return true; +Expr.IS_CASE_EXPR_PREDICATE), maybeNullableExprs); +if (!maybeNullableExprs.isEmpty()) { + if (!Expr.IS_BINARY_PREDICATE.apply(e)) return true; + // For t1 left join t2 on t1.a = t2.a where t2.b > coalesce(t1.c, t2.c) can + // simplify to an inner join. Simply support the case that one child does not + // contain conditional builtin function or case expr and has tuple id in outer + // joined tuples. + for (Expr operand : e.getChildren()) { +if (operand instanceof ArithmeticExpr) { + // 't1.id + coalesce(t1.c, t2.c) > coalesce(t2.c, t1.c)' is null-rejecting + // predicate for t1 + for (Expr expr : operand.getChildren()) { +if (noConditionalBuiltinFnOrCaseExpr(expr, tupleIds)) return false; + } +} else { + if (noConditionalBuiltinFnOrCaseExpr(operand, tupleIds)) return false; +} } + return true; +} +return false; + } + +
[impala] branch master updated (eb85c6e -> 08367e9)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from eb85c6e IMPALA-9793: Impala quickstart cluster with docker-compose new f4584dd IMPALA-10404: Update docs to reflect RLE_DICTIONARY support new 08367e9 IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/shared/impala_common.xml | 5 +-- .../topics/impala_parquet_dictionary_filtering.xml | 8 +++-- .../apache/impala/analysis/CreateTableStmt.java| 25 ++- .../queries/QueryTest/iceberg-create.test | 37 ++ .../queries/QueryTest/iceberg-negative.test| 18 +++ .../queries/QueryTest/show-create-table.test | 19 +++ 6 files changed, 106 insertions(+), 6 deletions(-)
[impala] 02/02: IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 08367e91f04508b54f77b56e0d211dd167b0116f Author: Zoltan Borok-Nagy AuthorDate: Mon Jan 25 16:09:59 2021 +0100 IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax For convenience this patch adds support with the old-style CREATE TABLE ... PARTITIONED BY ...; syntax for Iceberg tables. So users should be able to write the following: CREATE TABLE ice_t (i int) PARTITIONED BY (p int) STORED AS ICEBERG; Which should be equivalent to this: CREATE TABLE ice_t (i int, p int) PARTITION BY SPEC (p IDENTITY) STORED AS ICEBERG; Please note that the old-style CREATE TABLE statement creates IDENTITY-partitioned tables. For other partition transforms the users must use the new, more generic syntax. Hive also supports the old PARTITIONED BY syntax with the same behavior. Testing: * added e2e tests Change-Id: I789876c161bc0987820955aa9ae01414e0dcb45d Reviewed-on: http://gerrit.cloudera.org:8080/16979 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../apache/impala/analysis/CreateTableStmt.java| 25 ++- .../queries/QueryTest/iceberg-create.test | 37 ++ .../queries/QueryTest/iceberg-negative.test| 18 +++ .../queries/QueryTest/show-create-table.test | 19 +++ 4 files changed, 98 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java index 5e89724..1477536 100644 --- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java @@ -37,6 +37,7 @@ import org.apache.impala.service.BackendConfig; import org.apache.impala.thrift.TCreateTableParams; import org.apache.impala.thrift.THdfsFileFormat; import org.apache.impala.thrift.TIcebergCatalog; +import 
org.apache.impala.thrift.TIcebergPartitionTransformType; import org.apache.impala.thrift.TSortingOrder; import org.apache.impala.thrift.TTableName; import org.apache.impala.util.AvroSchemaConverter; @@ -271,8 +272,8 @@ public class CreateTableStmt extends StatementBase { } if (getFileFormat() == THdfsFileFormat.ICEBERG) { - analyzeIcebergFormat(analyzer); analyzeIcebergColumns(); + analyzeIcebergFormat(analyzer); } else { List iceSpec = tableDef_.getIcebergPartitionSpecs(); if (iceSpec != null && !iceSpec.isEmpty()) { @@ -703,10 +704,32 @@ public class CreateTableStmt extends StatementBase { * Iceberg field */ private void analyzeIcebergColumns() { +if (!getPartitionColumnDefs().isEmpty()) { + createIcebergPartitionSpecFromPartitionColumns(); +} for (ColumnDef def : getColumnDefs()) { if (!def.isNullabilitySet()) { def.setNullable(true); } } } + + /** + * Creates Iceberg partition spec from partition columns. Needed to support old-style + * CREATE TABLE .. PARTITIONED BY () syntax. In this case the column list in + * 'cols' is appended to the table-level columns, but also Iceberg-level IDENTITY + * partitions are created from this list. 
+ */ + private void createIcebergPartitionSpecFromPartitionColumns() { +Preconditions.checkState(!getPartitionColumnDefs().isEmpty()); +Preconditions.checkState(getIcebergPartitionSpecs().isEmpty()); +List partFields = new ArrayList<>(); +for (ColumnDef colDef : getPartitionColumnDefs()) { + partFields.add(new IcebergPartitionField(colDef.getColName(), + new IcebergPartitionTransform(TIcebergPartitionTransformType.IDENTITY))); +} +getIcebergPartitionSpecs().add(new IcebergPartitionSpec(partFields)); +getColumnDefs().addAll(getPartitionColumnDefs()); +getPartitionColumnDefs().clear(); + } } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test index 4b7daf9..eba73d8 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test @@ -445,3 +445,40 @@ DESCRIBE iceberg_nullable_test; TYPES STRING,STRING,STRING,STRING + QUERY +CREATE TABLE iceberg_old_style_partitions ( + register_time DATE, + message STRING, + price DECIMAL(8,1), + map_test MAP >, + struct_test STRUCT +) +PARTITIONED BY ( + level STRING, + event_id INT +) +STORED AS ICEBERG; + RESULTS +'Table has been created.' + + QUERY +DESCRIBE iceberg_old_style_partitions; + RESULTS +'register_time','date','','true' +'message',
[impala] 01/02: IMPALA-10404: Update docs to reflect RLE_DICTIONARY support
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit f4584dd2763edda9a24e4466b2d0a8f4bb065437 Author: Tim Armstrong AuthorDate: Mon Jan 25 14:41:53 2021 -0800 IMPALA-10404: Update docs to reflect RLE_DICTIONARY support Fix references to PLAIN_DICTIONARY to reflect that RLE_DICTIONARY is supported too. Change-Id: Iee98abfd760396cf43302c9077c6165eb3623335 Reviewed-on: http://gerrit.cloudera.org:8080/16982 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- docs/shared/impala_common.xml | 5 +++-- docs/topics/impala_parquet_dictionary_filtering.xml | 8 +--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml index 6b0e812..f18ca00 100644 --- a/docs/shared/impala_common.xml +++ b/docs/shared/impala_common.xml @@ -3175,8 +3175,9 @@ flight_num: INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301 Impala can query Parquet files that use the PLAIN, -PLAIN_DICTIONARY, BIT_PACKED, and RLE -encodings. Currently, Impala does not support RLE_DICTIONARY encoding. +PLAIN_DICTIONARY, BIT_PACKED, RLE +and RLE_DICTIONARY encodings. RLE_DICTIONARY is supported +only in and up. When creating files outside of Impala for use by Impala, make sure to use one of the supported encodings. In particular, for MapReduce jobs, parquet.writer.version must not be defined (especially as diff --git a/docs/topics/impala_parquet_dictionary_filtering.xml b/docs/topics/impala_parquet_dictionary_filtering.xml index 3460f2a..2d68b4d 100644 --- a/docs/topics/impala_parquet_dictionary_filtering.xml +++ b/docs/topics/impala_parquet_dictionary_filtering.xml @@ -58,7 +58,8 @@ under the License. If the encoding_stats is in the Parquet file, dictionary filtering uses it to determine if there are only dictionary encoded pages (i.e. 
there are no - data pages with an encoding other than PLAIN_DICTIONARY). + data pages with an encoding other than RLE_DICTIONARY or + PLAIN_DICTIONARY). @@ -66,11 +67,12 @@ under the License. The column is purely dictionary encoded if both of the conditions satisfy: - PLAIN_DICTIONARY is present. + PLAIN_DICTIONARY or RLE_DICTIONARY is present. - Only PLAIN_DICTIONARY, RLE, or BIT_PACKED encodings are listed. + Only PLAIN_DICTIONARY, RLE_DICTIONARY, + RLE, or BIT_PACKED encodings are listed.
[impala] branch master updated (e8720b4 -> eb85c6e)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from e8720b4 IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions new 3b763b5 IMPALA-10447: Add a newline when exporting shell output to a file. new eb85c6e IMPALA-9793: Impala quickstart cluster with docker-compose The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docker/CMakeLists.txt | 36 +- docker/README.md | 144 ++- docs/build-doc.sh => docker/docker-build.sh| 29 +- docker/impala_base/Dockerfile | 10 + docker/quickstart-kudu-minimal.yml | 128 ++ .../quickstart-load-data.yml | 34 +- docker/quickstart.yml | 104 ++ docker/quickstart_client/Dockerfile| 70 ++ docker/quickstart_client/data-load-entrypoint.sh | 86 ++ .../quickstart_client/load_tpcds_kudu.sql | 316 +++-- docker/quickstart_client/load_tpcds_parquet.sql| 1248 docker/quickstart_conf/hive-site.xml | 74 ++ docker/quickstart_hms/Dockerfile | 67 ++ docker/quickstart_hms/hms-entrypoint.sh| 68 ++ shell/shell_output.py |1 + tests/shell/test_shell_commandline.py | 28 + 16 files changed, 2250 insertions(+), 193 deletions(-) copy docs/build-doc.sh => docker/docker-build.sh (54%) create mode 100644 docker/quickstart-kudu-minimal.yml copy be/src/catalog/CMakeLists.txt => docker/quickstart-load-data.yml (58%) create mode 100644 docker/quickstart.yml create mode 100644 docker/quickstart_client/Dockerfile create mode 100755 docker/quickstart_client/data-load-entrypoint.sh copy testdata/datasets/tpcds/tpcds_kudu_template.sql => docker/quickstart_client/load_tpcds_kudu.sql (68%) create mode 100644 docker/quickstart_client/load_tpcds_parquet.sql create mode 100644 docker/quickstart_conf/hive-site.xml create mode 100644 
docker/quickstart_hms/Dockerfile create mode 100755 docker/quickstart_hms/hms-entrypoint.sh
[impala] 01/02: IMPALA-10447: Add a newline when exporting shell output to a file.
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 3b763b5c3235ebdb445ad0a4ae1bf79385e8df02 Author: Andrew Sherman AuthorDate: Mon Jan 18 19:10:11 2021 -0800 IMPALA-10447: Add a newline when exporting shell output to a file. Impala shell outputs a batch of rows using OutputStream. Inside OutputStream, output to a file is handled slightly differently from output that is written to stdout. When writing to stdout we use print() (which appends a newline) while when writing to a file we use write() (which adds nothing). This difference was introduced in IMPALA-3343 so this bug may be a regression introduced then. To ensure that output is the same in either case we need to add a newline after writing each batch of rows to a file. TESTING: Added a new test for this case. Change-Id: I078a06c54e0834bc1f898626afbfff4ded579fa9 Reviewed-on: http://gerrit.cloudera.org:8080/16966 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- shell/shell_output.py | 1 + tests/shell/test_shell_commandline.py | 28 2 files changed, 29 insertions(+) diff --git a/shell/shell_output.py b/shell/shell_output.py index 31d91a0..bfb418c 100644 --- a/shell/shell_output.py +++ b/shell/shell_output.py @@ -117,6 +117,7 @@ class OutputStream(object): # Note that instances of this class do not persist, so it's fine to # close the we close the file handle after each write. 
out_file.write(formatted_data.encode('utf-8')) # file opened in binary mode + out_file.write(b'\n') except IOError as err: file_err_msg = "Error opening file %s: %s" % (self.filename, str(err)) print('{0} (falling back to stderr)'.format(file_err_msg), file=sys.stderr) diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py index 666162f..30cd85d 100644 --- a/tests/shell/test_shell_commandline.py +++ b/tests/shell/test_shell_commandline.py @@ -105,6 +105,16 @@ def populated_table(empty_table, request): return fq_table_name +@pytest.yield_fixture +def tmp_file(): + """ + Test fixture which manages a temporary file + """ + _, tmp_file = tempfile.mkstemp() + yield tmp_file + os.remove(tmp_file) + + class TestImpalaShell(ImpalaTestSuite): """A set of sanity tests for the Impala shell commandline parameters. @@ -1071,3 +1081,21 @@ class TestImpalaShell(ImpalaTestSuite): expected_result = """anonymous\tanonymous\n""" assert result.stdout == expected_result assert result.stderr == "" + + def test_output_file(self, vector, tmp_file): +"""Test that writing output to a file using '--output_file' produces the same output +as is written to stdout.""" +row_count = 6000 # Should be > 2048 to tickle IMPALA-10447. +query = "select * from tpcds.item order by i_item_sk limit %d" % row_count +# Run the query normally and keep the stdout. +output = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;']) +assert "Fetched %d row(s)" % row_count in output.stderr +rows_from_stdout = output.stdout.strip().split('\n') +# Run the query with output sent to a file using '--output_file'. +result = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;', + '--output_file=%s' % tmp_file]) +assert "Fetched %d row(s)" % row_count in result.stderr +# Check that the output from the file is the same as that written to stdout. 
+with open(tmp_file, "r") as f: + rows_from_file = [line.rstrip() for line in f] + assert rows_from_stdout == rows_from_file
[impala] 02/02: IMPALA-9793: Impala quickstart cluster with docker-compose
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit eb85c6eeca77c748a01d304625ea5d608a3e12c0 Author: Tim Armstrong AuthorDate: Sat May 9 21:27:41 2020 -0700 IMPALA-9793: Impala quickstart cluster with docker-compose What works: * A single node cluster can be started up with docker-compose * HMS data is stored in Derby database in a docker volume * Filesystem data is stored in a shared docker volume, using the localfs support in the Hadoop client. * A Kudu cluster with a single master can be optionally added on to the Impala cluster. * TPC-DS data can be loaded automatically by a data loading container. We need to set up a docker network called quickstart-network, purely because docker-compose insists on generating network names with underscores, which are part of the FQDN and end up causing problems with Java's URL parsing, which rejects these technically invalid domain names. How to run: Instructions for running the quickstart cluster are in docker/README.md. How to build containers: ./buildall.sh -release -noclean -notests -ninja ninja quickstart_hms_image quickstart_client_image docker_images How to upload containers to dockerhub: IMPALA_QUICKSTART_IMAGE_PREFIX=timgarmstrong/ for i in impalad_coord_exec impalad_coordinator statestored \ impalad_executor catalogd impala_quickstart_client \ impala_quickstart_hms do docker tag $i ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i docker push ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i done I pushed containers build from commit f260cce22, which was branched from 6cb7cecacf on master. Misc other stuff: * Added more metadata to all images. TODO: * Test and instructions to run against Kudu quickstart * Upload latest version of containers before merging. 
Change-Id: Ifc0b862af40a368381ada7ec2a355fe4b0aa778c Reviewed-on: http://gerrit.cloudera.org:8080/15966 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- docker/CMakeLists.txt| 36 +- docker/README.md | 144 ++- docker/docker-build.sh | 39 + docker/impala_base/Dockerfile| 10 + docker/quickstart-kudu-minimal.yml | 128 +++ docker/quickstart-load-data.yml | 38 + docker/quickstart.yml| 104 ++ docker/quickstart_client/Dockerfile | 70 ++ docker/quickstart_client/data-load-entrypoint.sh | 86 ++ docker/quickstart_client/load_tpcds_kudu.sql | 877 +++ docker/quickstart_client/load_tpcds_parquet.sql | 1248 ++ docker/quickstart_conf/hive-site.xml | 74 ++ docker/quickstart_hms/Dockerfile | 67 ++ docker/quickstart_hms/hms-entrypoint.sh | 68 ++ 14 files changed, 2985 insertions(+), 4 deletions(-) diff --git a/docker/CMakeLists.txt b/docker/CMakeLists.txt index 60fd8c2..7fe085b 100644 --- a/docker/CMakeLists.txt +++ b/docker/CMakeLists.txt @@ -19,6 +19,8 @@ set(IMPALA_BASE_BUILD_CONTEXT_DIR ${CMAKE_SOURCE_DIR}/docker/build_context ) +set(DOCKER_BUILD ${CMAKE_SOURCE_DIR}/docker/docker-build.sh) + find_program(LSB_RELEASE_EXEC lsb_release) execute_process(COMMAND ${LSB_RELEASE_EXEC} -is OUTPUT_VARIABLE LSB_RELEASE_ID @@ -38,6 +40,7 @@ else() endif() MESSAGE(STATUS "Picked docker base image based on host OS: ${DISTRO_BASE_IMAGE}") + if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # Add a target to build a base docker image for 'build_type'. 'build_context_args' are # passed to the setup_build_context.py script. @@ -58,7 +61,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # sent to the docker daemon. This allows the Dockerfile built to copy all necessary # dependencies. COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/impala_base/ . 
| - docker build -t impala_base_${build_type} + ${DOCKER_BUILD} -t impala_base_${build_type} --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} - WORKING_DIRECTORY ${IMPALA_BASE_BUILD_CONTEXT_DIR}/${build_type} DEPENDS impala_base_build_context_${build_type} ${CMAKE_SOURCE_DIR}/docker/impala_base/Dockerfile @@ -88,7 +91,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED") # build context used for the base image is used for each daemon image. This allows # each daemon image to only copy in the dependencies it requires. COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/${daemon_name}/ . | - docker build --build-arg BASE_IMAGE=impala_base_${b
[impala] branch master updated: IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new e8720b4 IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions e8720b4 is described below commit e8720b40f1b04712442dd9eb69cd603855eb6b8d Author: stiga-huang AuthorDate: Mon Jan 4 13:10:49 2021 +0800 IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions A unicode character can be encoded into 1-4 bytes in UTF-8. String functions will return undesired results when the input contains unicode characters, because we deal with a string as a byte array. For instance, length() returns the length in bytes, not in unicode characters. UTF-8 is the dominant unicode encoding used in the Hadoop ecosystem. This patch adds UTF-8 support in some string functions so they can have UTF-8 aware behavior. For compatibility with the old versions, a new query option, UTF8_MODE, is added for turning on/off the UTF-8 aware behavior. Currently, only length(), substring() and reverse() support it. Other function supports will be added in later patches. String functions will check the query option and switch to use the desired implementation. It's similar to how we use the decimal_v2 query option in builtin functions. For easy testing, the UTF-8 aware version of string functions are also exposed as builtin functions (named by utf8_*, e.g. utf8_length). Tests: - Add BE tests for utf8 functions. - Add e2e tests for the UTF8_MODE query option. 
Change-Id: I0aaf3544e89f8a3d531ad6afe056b3658b525b7c Reviewed-on: http://gerrit.cloudera.org:8080/16908 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/codegen/llvm-codegen.cc | 3 +- be/src/exprs/expr-test.cc | 80 ++ be/src/exprs/string-functions-ir.cc| 103 ++ be/src/exprs/string-functions.h| 6 ++ be/src/runtime/runtime-state.h | 1 + be/src/service/query-options.cc| 4 + be/src/service/query-options.h | 3 +- be/src/udf/udf-internal.h | 4 +- be/src/udf/udf.cc | 12 ++- be/src/util/bit-util.h | 5 + common/function-registry/impala_functions.py | 6 ++ common/thrift/ImpalaInternalService.thrift | 3 + common/thrift/ImpalaService.thrift | 4 + .../functional/functional_schema_template.sql | 15 +++ .../queries/QueryTest/utf8-string-functions.test | 116 + tests/query_test/test_utf8_strings.py | 42 16 files changed, 402 insertions(+), 5 deletions(-) diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc index ac3db8f..98e5bb4 100644 --- a/be/src/codegen/llvm-codegen.cc +++ b/be/src/codegen/llvm-codegen.cc @@ -1010,7 +1010,8 @@ int LlvmCodeGen::InlineConstFnAttrs(const FunctionContext::TypeDesc& ret_type, DCHECK(state_ != nullptr); // All supported constants are currently integers. call_instr->replaceAllUsesWith(GetI32Constant(FunctionContextImpl::GetConstFnAttr( -state_->query_options().decimal_v2, ret_type, arg_types, t_val, i_val))); +state_->query_options().decimal_v2, state_->query_options().utf8_mode, ret_type, +arg_types, t_val, i_val))); call_instr->eraseFromParent(); ++replaced; } diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 58d98d0..9e801a4 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -10539,6 +10539,86 @@ TEST_P(ExprTest, MaskHashTest) { TestIsNull("mask_hash(cast('2016-04-20' as timestamp))", TYPE_TIMESTAMP); } +TEST_P(ExprTest, Utf8Test) { + // Verifies utf8_length() counts length by UTF-8 characters instead of bytes. 
+ // '你' and '好' are both encoded into 3 bytes. + TestIsNull("utf8_length(NULL)", TYPE_INT); + TestValue("utf8_length('你好')", TYPE_INT, 2); + TestValue("utf8_length('你好hello')", TYPE_INT, 7); + TestValue("utf8_length('你好 hello 你好')", TYPE_INT, 11); + TestValue("utf8_length('hello')", TYPE_INT, 5); + + // Verifies position and length of utf8_substring() are UTF-8 aware. + // '你' and '好' are both encoded into 3 bytes. + TestStringValue("utf8_substring('Hello', 1)", "Hello"); + TestStringValue("utf8_substring('Hello', -2)", "lo"); + TestStringValue("utf8_substring('Hello', cast(0 as bigint))", ""); + TestStringValue("utf8_substring
[impala] branch master updated: IMPALA-10147: Avoid getting a file handle for data cache hits
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 2644203 IMPALA-10147: Avoid getting a file handle for data cache hits 2644203 is described below commit 2644203d1cbdd124a75a3da80fc176a447f3164c Author: Riza Suminto AuthorDate: Fri Jan 15 14:05:25 2021 -0800 IMPALA-10147: Avoid getting a file handle for data cache hits When reading from the data cache, the disk IO thread first gets a file handle, then it checks the data cache for a hit. The file handle is only used if there is a data cache miss. It is not used when data cache hit and in turns becomes an overhead. This patch move the file handle retrieval later when data cache miss hapens. Testing: - Add custom cluster test test_no_fd_caching_on_cached_data. - Pass core tests. Change-Id: Icc68f233518f862454e87bcbbef14d65fcdb7c91 Reviewed-on: http://gerrit.cloudera.org:8080/16963 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/runtime/io/hdfs-file-reader.cc| 50 tests/custom_cluster/test_hdfs_fd_caching.py | 42 +++ 2 files changed, 71 insertions(+), 21 deletions(-) diff --git a/be/src/runtime/io/hdfs-file-reader.cc b/be/src/runtime/io/hdfs-file-reader.cc index 34c5d61..1441a2d 100644 --- a/be/src/runtime/io/hdfs-file-reader.cc +++ b/be/src/runtime/io/hdfs-file-reader.cc @@ -96,27 +96,6 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_ *eof = false; *bytes_read = 0; - CachedHdfsFileHandle* borrowed_hdfs_fh = nullptr; - hdfsFile hdfs_file; - - // If the reader has an exclusive file handle, use it. Otherwise, borrow - // a file handle from the cache. 
- if (exclusive_hdfs_fh_ != nullptr) { -hdfs_file = exclusive_hdfs_fh_->file(); - } else { -RETURN_IF_ERROR(io_mgr->GetCachedHdfsFileHandle(hdfs_fs_, -scan_range_->file_string(), -scan_range_->mtime(), request_context, _hdfs_fh)); -hdfs_file = borrowed_hdfs_fh->file(); - } - // Make sure to release any borrowed file handle. - auto release_borrowed_hdfs_fh = MakeScopeExitTrigger([this, _hdfs_fh]() { -if (borrowed_hdfs_fh != nullptr) { - scan_range_->io_mgr_->ReleaseCachedHdfsFileHandle(scan_range_->file_string(), - borrowed_hdfs_fh); -} - }); - Status status = Status::OK(); { ScopedTimer req_context_read_timer( @@ -133,6 +112,35 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_ *bytes_read = cached_read; } +if (*bytes_read == bytes_to_read) { + // All bytes successfully read from data cache. We can safely return. + return status; +} + +// If we get here, the next bytes are not available in data cache, so we need to get +// file handle in order to read the rest of data from file. +// If the reader has an exclusive file handle, use it. Otherwise, borrow +// a file handle from the cache. +req_context_read_timer.Stop(); +CachedHdfsFileHandle* borrowed_hdfs_fh = nullptr; +hdfsFile hdfs_file; +if (exclusive_hdfs_fh_ != nullptr) { + hdfs_file = exclusive_hdfs_fh_->file(); +} else { + RETURN_IF_ERROR( + io_mgr->GetCachedHdfsFileHandle(hdfs_fs_, scan_range_->file_string(), + scan_range_->mtime(), request_context, _hdfs_fh)); + hdfs_file = borrowed_hdfs_fh->file(); +} +// Make sure to release any borrowed file handle. 
+auto release_borrowed_hdfs_fh = MakeScopeExitTrigger([this, _hdfs_fh]() { + if (borrowed_hdfs_fh != nullptr) { +scan_range_->io_mgr_->ReleaseCachedHdfsFileHandle( +scan_range_->file_string(), borrowed_hdfs_fh); + } +}); +req_context_read_timer.Start(); + while (*bytes_read < bytes_to_read) { int bytes_remaining = bytes_to_read - *bytes_read; DCHECK_GT(bytes_remaining, 0); diff --git a/tests/custom_cluster/test_hdfs_fd_caching.py b/tests/custom_cluster/test_hdfs_fd_caching.py index 7d94fd3..42c6ca6 100644 --- a/tests/custom_cluster/test_hdfs_fd_caching.py +++ b/tests/custom_cluster/test_hdfs_fd_caching.py @@ -161,6 +161,48 @@ class TestHdfsFdCaching(CustomClusterTestSuite): caching_expected = False self.run_fd_caching_test(vector, caching_expected, cache_capacity, None) + @pytest.mark.execute_serially + @CustomClusterTestSuite.with_args( + impalad_args="--max_cached_file_handles=16 --unused_file_handle_timeout_sec=5 " + + "--data_cache=/tmp:500MB --always_use_data_cache=true", + catalogd_args="--load_catalog_in_background=false") + def test_no_fd_caching_on_cached_data(self, vector): +&qu
[impala] 02/04: IMPALA-10427: Remove SkipIfS3.eventually_consistent pytest marker
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 35bae939abc5534098931221813cb5d500b68993 Author: Joe McDonnell AuthorDate: Wed Jan 6 13:58:14 2021 -0800 IMPALA-10427: Remove SkipIfS3.eventually_consistent pytest marker These tests were disabled due to S3's eventually consistent behavior. Now that S3 is strongly consistent, these tests do not need to be disabled. Testing: - Ran s3 core job Change-Id: Ie9041f530bf3a818f8954b31a3d01d9f6753d7d4 Reviewed-on: http://gerrit.cloudera.org:8080/16931 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- tests/common/skip.py| 2 -- tests/metadata/test_compute_stats.py| 10 -- tests/query_test/test_aggregation.py| 1 - tests/query_test/test_insert.py | 1 - tests/query_test/test_insert_parquet.py | 1 - tests/query_test/test_insert_permutation.py | 1 - tests/query_test/test_nested_types.py | 2 -- 7 files changed, 18 deletions(-) diff --git a/tests/common/skip.py b/tests/common/skip.py index 21bbfdd..92cf18c 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -59,8 +59,6 @@ class SkipIfS3: hbase = pytest.mark.skipif(IS_S3, reason="HBase not started with S3") qualified_path = pytest.mark.skipif(IS_S3, reason="Tests rely on HDFS qualified paths, IMPALA-1872") - eventually_consistent = pytest.mark.skipif(IS_S3, - reason="Flakiness on account of S3 eventual consistency.") iceberg = pytest.mark.skipif(IS_S3, reason="Currently Iceberg is only supported on HDFS.") variable_listing_times = pytest.mark.skipif(IS_S3, diff --git a/tests/metadata/test_compute_stats.py b/tests/metadata/test_compute_stats.py index 5455d60..58754a9 100644 --- a/tests/metadata/test_compute_stats.py +++ b/tests/metadata/test_compute_stats.py @@ -51,12 +51,10 @@ class TestComputeStats(ImpalaTestSuite): create_uncompressed_text_dimension(cls.get_workload())) @SkipIfLocal.hdfs_blocks - @SkipIfS3.eventually_consistent 
def test_compute_stats(self, vector, unique_database): self.run_test_case('QueryTest/compute-stats', vector, unique_database) @SkipIfLocal.hdfs_blocks - @SkipIfS3.eventually_consistent def test_compute_stats_avro(self, vector, unique_database, cluster_properties): if cluster_properties.is_catalog_v2_cluster(): # IMPALA-7308: changed behaviour of various Avro edge cases significantly in the @@ -67,29 +65,24 @@ class TestComputeStats(ImpalaTestSuite): self.run_test_case('QueryTest/compute-stats-avro', vector, unique_database) @SkipIfLocal.hdfs_blocks - @SkipIfS3.eventually_consistent def test_compute_stats_decimal(self, vector, unique_database): # Test compute stats on decimal columns separately so we can vary between platforms # with and without write support for decimals (Hive < 0.11 and >= 0.11). self.run_test_case('QueryTest/compute-stats-decimal', vector, unique_database) @SkipIfLocal.hdfs_blocks - @SkipIfS3.eventually_consistent def test_compute_stats_date(self, vector, unique_database): # Test compute stats on date columns separately. self.run_test_case('QueryTest/compute-stats-date', vector, unique_database) - @SkipIfS3.eventually_consistent def test_compute_stats_incremental(self, vector, unique_database): self.run_test_case('QueryTest/compute-stats-incremental', vector, unique_database) - @SkipIfS3.eventually_consistent def test_compute_stats_complextype_warning(self, vector, unique_database): self.run_test_case('QueryTest/compute-stats-complextype-warning', vector, unique_database) @pytest.mark.execute_serially - @SkipIfS3.eventually_consistent def test_compute_stats_many_partitions(self, vector): # To cut down on test execution time, only run the compute stats test against many # partitions if performing an exhaustive test run. 
@@ -97,7 +90,6 @@ class TestComputeStats(ImpalaTestSuite): self.run_test_case('QueryTest/compute-stats-many-partitions', vector) @pytest.mark.execute_serially - @SkipIfS3.eventually_consistent def test_compute_stats_keywords(self, vector): """IMPALA-1055: Tests compute stats with a db/table name that are keywords.""" self.execute_query("drop database if exists `parquet` cascade") @@ -109,7 +101,6 @@ class TestComputeStats(ImpalaTestSuite): finally: self.cleanup_db("parquet") - @SkipIfS3.eventually_consistent def test_compute_stats_compression_codec(self, vector, unique_database): """IMPALA-8254: Tests that running compute stats with compression_codec set should not throw an error.""" @@ -292,7 +283,6
[impala] branch master updated (799bc22 -> ab6b796)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 799bc22 IMPALA-10424: Fix race on not_admitted_reason in AdmissionController new 44bade8 IMPALA-10091: [DOCS] add REFRESH_UPDATED_HMS_PARTITIONS query option new 35bae93 IMPALA-10427: Remove SkipIfS3.eventually_consistent pytest marker new 425e424 IMPALA-9687 Improve estimates for number of hosts in Kudu plans new ab6b796 IMPALA-10027: configurable default anonymous user The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/common/global-flags.cc | 4 +- be/src/service/impala-beeswax-server.cc| 11 ++- be/src/service/impala-hs2-server.cc| 5 +- docs/impala.ditamap| 1 + docs/topics/impala_refresh_updated_hms.xml | 72 +++ .../org/apache/impala/planner/KuduScanNode.java| 82 -- .../java/org/apache/impala/service/JdbcTest.java | 6 +- .../queries/PlannerTest/kudu-selectivity.test | 2 +- .../queries/PlannerTest/tpch-kudu.test | 20 +++--- tests/common/skip.py | 2 - tests/custom_cluster/test_admission_controller.py | 2 +- tests/metadata/test_compute_stats.py | 10 --- tests/query_test/test_aggregation.py | 1 - tests/query_test/test_insert.py| 1 - tests/query_test/test_insert_parquet.py| 1 - tests/query_test/test_insert_permutation.py| 1 - tests/query_test/test_nested_types.py | 2 - tests/shell/test_shell_commandline.py | 24 +++ 18 files changed, 203 insertions(+), 44 deletions(-) create mode 100644 docs/topics/impala_refresh_updated_hms.xml
[impala] 04/04: IMPALA-10027: configurable default anonymous user
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit ab6b7960db3181096134b8aa46cb90baff6df006 Author: Tim Armstrong AuthorDate: Wed Dec 23 14:13:38 2020 -0800 IMPALA-10027: configurable default anonymous user A username can be determined for a session via two mechanisms: * In a secure env, the user is authenticated by LDAP or Kerberos * In an unsecure env, the client specifies the user name, either as a parameter to the OpenSession API (HS2) or as a parameter to the first query run (beeswax) This patch affects what happens if neither of the above mechanisms is used. Previously we would end up with the username being an empty string, but this makes Ranger unhappy. Hive uses the name "anonymous" in this situation, so we change Impala's behaviour too. This is configurable by -anonymous_user_name. -anonymous_user_name= reverts to the old behaviour. Test * Add an end-to-end test that exercises this via impala-shell for HS2, HS2-HTTP and beeswax protocols. * Tweak a couple of existing tests that depended on the previous behavior. 
Change-Id: I6db491231fa22484aed476062b8fe4c8f69130b0 Reviewed-on: http://gerrit.cloudera.org:8080/16902 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/common/global-flags.cc | 4 +++- be/src/service/impala-beeswax-server.cc| 11 -- be/src/service/impala-hs2-server.cc| 5 - .../java/org/apache/impala/service/JdbcTest.java | 6 +++--- tests/custom_cluster/test_admission_controller.py | 2 +- tests/shell/test_shell_commandline.py | 24 ++ 6 files changed, 44 insertions(+), 8 deletions(-) diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc index 1b8c16f..794c14d 100644 --- a/be/src/common/global-flags.cc +++ b/be/src/common/global-flags.cc @@ -66,7 +66,9 @@ DEFINE_bool(skip_external_kerberos_auth, false, "(Advanced) skip kerberos authentication for incoming external connections to " "this daemon, e.g. clients connecting to the HS2 interface. Only has an effect " "if --principal is set, i.e. Kerberos is enabled."); - +DEFINE_string(anonymous_user_name, "anonymous", +"Default username used when a client connects to an unsecured impala daemon and " +"does not specify a username."); static const string mem_limit_help_msg = "Limit on process memory consumption. " "Includes the JVM's memory consumption only if --mem_limit_includes_jvm is true. " diff --git a/be/src/service/impala-beeswax-server.cc b/be/src/service/impala-beeswax-server.cc index 4d14903..7eb0094 100644 --- a/be/src/service/impala-beeswax-server.cc +++ b/be/src/service/impala-beeswax-server.cc @@ -50,6 +50,7 @@ using namespace beeswax; } while (false) DECLARE_bool(ping_expose_webserver_url); +DECLARE_string(anonymous_user_name); namespace impala { @@ -108,7 +109,10 @@ void ImpalaServer::executeAndWait(beeswax::QueryHandle& beeswax_handle, // transport, the username may be known at that time. If the username hasn't been set // yet, set it now. 
lock_guard l(session->lock); -if (session->connected_user.empty()) session->connected_user = query.hadoop_user; +if (session->connected_user.empty()) { + session->connected_user = query.hadoop_user.empty() ? + FLAGS_anonymous_user_name : query.hadoop_user; +} } // raise Syntax error or access violation; it's likely to be syntax/analysis error @@ -509,7 +513,10 @@ Status ImpalaServer::QueryToTQueryContext(const Query& query, // transport, the username may be known at that time. If the username hasn't been // set yet, set it now. lock_guard l(session->lock); - if (session->connected_user.empty()) session->connected_user = query.hadoop_user; + if (session->connected_user.empty()) { +session->connected_user = query.hadoop_user.empty() ? +FLAGS_anonymous_user_name : query.hadoop_user; + } query_ctx->client_request.query_options = session->QueryOptions(); set_query_options_mask = session->set_query_options_mask; } diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc index 9cd55bb..b38a254 100644 --- a/be/src/service/impala-hs2-server.cc +++ b/be/src/service/impala-hs2-server.cc @@ -94,6 +94,7 @@ DECLARE_int32(webserver_port); DECLARE_int32(idle_session_timeout); DECLARE_int32(disconnected_session_timeout); DECLARE_bool(ping_expose_webserver_url); +DECLARE_string(anonymous_user_name); namespace impala { @@ -329,8 +330,10 @@ void Imp
[impala] 01/04: IMPALA-10091: [DOCS] add REFRESH_UPDATED_HMS_PARTITIONS query option
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 44bade8e7f14d22b6a16650d54dc9bee62caeb50 Author: Shajini Thayasingh AuthorDate: Tue Jan 5 12:49:21 2021 -0800 IMPALA-10091: [DOCS] add REFRESH_UPDATED_HMS_PARTITIONS query option remove trailing spaces added this new query option for Impala 4.0 Change-Id: I95b31b33f99073c57752e66eaf0f34facf511fc6 Reviewed-on: http://gerrit.cloudera.org:8080/16925 Reviewed-by: Vihang Karajgaonkar Tested-by: Impala Public Jenkins --- docs/impala.ditamap| 1 + docs/topics/impala_refresh_updated_hms.xml | 72 ++ 2 files changed, 73 insertions(+) diff --git a/docs/impala.ditamap b/docs/impala.ditamap index d6aeedd..cf7e3b6 100644 --- a/docs/impala.ditamap +++ b/docs/impala.ditamap @@ -230,6 +230,7 @@ under the License. + diff --git a/docs/topics/impala_refresh_updated_hms.xml b/docs/topics/impala_refresh_updated_hms.xml new file mode 100644 index 000..779c7d2 --- /dev/null +++ b/docs/topics/impala_refresh_updated_hms.xml @@ -0,0 +1,72 @@ + + + + + + REFRESH_UPDATED_HMS_PARTITIONS Query Option + + + + REFRESH_UPDATED_HMS_PARTITIONS + + + + + + + + + + + + + + + + + As the name implies the query option REFRESH_UPDATED_HMS_PARTITIONS is used + to refresh any updated HMS partitions. + + + This option is disabled by default so that the performance is not compromised when refreshing a + table. However, for certain corner case scenarios refresh table command does not detect changed + partitions. In case of the default refresh, catalogd detects, adds any new partitions and removes + any partitions which are not present in HMS anymore. However, it does not update any partitions + that changed (eg. change of location). When this query option is enabled, the refresh table + command will detect certain changes to the partitions and update them accordingly. 
Currently, + catalogd will update the partitions if any of the following StorageDescriptor properties have + been modified. 1. Partition Location 2. Partition Fileformat. 3 SerdeInfo. 4. Partition schema + changes. 5. Partition bucketInfo. + + + Type:BOOLEAN; recognized values are 1 and 0, or true and false; any other + value will be interpreted as false. + + + + Default:FALSE (shown as 0 in output of SET statement). + + + Added in: + + + + +
[impala] 03/04: IMPALA-9687: Improve estimates for number of hosts in Kudu plans
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 425e424b37f081ba1262f4f39298c6c13910aba8 Author: Akos Kovacs AuthorDate: Tue Dec 8 23:51:09 2020 +0100 IMPALA-9687 Improve estimates for number of hosts in Kudu plans In some cases Kudu plans could contain more hosts than the actual number of executors. This commit fixes it by capping the number of hosts at the number of executors, and determining which executors have local scan ranges. Testing: - Ran core tests Updated Kudu planner tests where the memory estimates changed. Change-Id: I72e341597e980fb6a7e3792905b942ddf5797d03 Reviewed-on: http://gerrit.cloudera.org:8080/16880 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../org/apache/impala/planner/KuduScanNode.java| 82 -- .../queries/PlannerTest/kudu-selectivity.test | 2 +- .../queries/PlannerTest/tpch-kudu.test | 20 +++--- 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java index 613c40b..e4a2d0c 100644 --- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java @@ -19,9 +19,11 @@ package org.apache.impala.planner; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; +import java.util.Map; import java.util.Set; import org.apache.impala.analysis.Analyzer; @@ -54,6 +56,7 @@ import org.apache.impala.thrift.TScanRangeLocation; import org.apache.impala.thrift.TScanRangeLocationList; import org.apache.impala.thrift.TScanRangeSpec; import org.apache.impala.util.KuduUtil; +import org.apache.impala.util.ExecutorMembershipSnapshot; import org.apache.kudu.ColumnSchema; import org.apache.kudu.Schema; import 
org.apache.kudu.client.KuduClient; @@ -277,16 +280,81 @@ public class KuduScanNode extends ScanNode { return computeCombinedSelectivity(allConjuncts); } + /** + * Estimate the number of impalad nodes that this scan node will execute on (which is + * ultimately determined by the scheduling done by the backend's Scheduler). + * Assume that scan ranges that can be scheduled locally will be, and that scan + * ranges that cannot will be round-robined across the cluster. + */ + protected void computeNumNodes(Analyzer analyzer) { +ExecutorMembershipSnapshot cluster = ExecutorMembershipSnapshot.getCluster(); +final int maxInstancesPerNode = getMaxInstancesPerNode(analyzer); +final int maxPossibleInstances = cluster.numExecutors() * maxInstancesPerNode; +int totalNodes = 0; +int totalInstances = 0; +int numLocalRanges = 0; +int numRemoteRanges = 0; +// Counts the number of local ranges, capped at maxInstancesPerNode. +Map localRangeCounts = new HashMap<>(); +// Sum of the counter values in localRangeCounts. +int totalLocalParallelism = 0; +if (scanRangeSpecs_.isSetConcrete_ranges()) { + for (TScanRangeLocationList range : scanRangeSpecs_.concrete_ranges) { +boolean anyLocal = false; +if (range.isSetLocations()) { + for (TScanRangeLocation loc : range.locations) { +TNetworkAddress address = +analyzer.getHostIndex().getEntry(loc.getHost_idx()); +if (cluster.contains(address)) { + anyLocal = true; + // Use the full tserver address (including port) to account for the test + // minicluster where there are multiple tservers and impalads on a single + // host. This assumes that when an impalad is colocated with a tserver, + // there are the same number of impalads as tservers on this host in this + // cluster. 
+ int count = localRangeCounts.getOrDefault(address, 0); + if (count < maxInstancesPerNode) { +++totalLocalParallelism; +localRangeCounts.put(address, count + 1); + } +} + } +} +// This range has at least one replica with a colocated impalad, so assume it +// will be scheduled on one of those nodes. +if (anyLocal) { + ++numLocalRanges; +} else { + ++numRemoteRanges; +} +// Approximate the number of nodes that will execute locally assigned ranges to +// be the smaller of the number of locally assigned ranges and the number of +// hosts that hold replica for those ranges. +int numLocalNodes = Math.min(numLocalRanges, localRangeCounts.size()); +// The remote ranges are round-robined a
[impala] branch master updated: IMPALA-6101: call DataStreamMgr::Cancel() once per query
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 868a01d IMPALA-6101: call DataStreamMgr::Cancel() once per query 868a01d is described below commit 868a01dca9071978f482aec2a9c3e18aca957914 Author: Tim Armstrong AuthorDate: Wed Dec 23 10:45:54 2020 -0800 IMPALA-6101: call DataStreamMgr::Cancel() once per query This is a bit of cleanup left over from the KRPC work that could avoid some lock contention for queries with large numbers of fragments. The change is just to do cancellation of receivers once per query instead of once per fragment. Change-Id: I7677d21f0aaddc3d4b56f72c0470ea850e34611e Reviewed-on: http://gerrit.cloudera.org:8080/16901 Reviewed-by: Thomas Tauber-Marshall Tested-by: Impala Public Jenkins --- be/src/runtime/data-stream-test.cc| 4 ++-- be/src/runtime/fragment-instance-state.cc | 2 -- be/src/runtime/krpc-data-stream-mgr.cc| 13 - be/src/runtime/krpc-data-stream-mgr.h | 17 ++--- be/src/runtime/query-state.cc | 3 +++ 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/be/src/runtime/data-stream-test.cc b/be/src/runtime/data-stream-test.cc index ec61d5e..01928aa 100644 --- a/be/src/runtime/data-stream-test.cc +++ b/be/src/runtime/data-stream-test.cc @@ -708,9 +708,9 @@ TEST_F(DataStreamTest, UnknownSenderLargeResult) { TEST_F(DataStreamTest, Cancel) { TUniqueId instance_id; StartReceiver(TPartitionType::UNPARTITIONED, 1, 1, 1024, false, _id); - stream_mgr_->Cancel(instance_id); + stream_mgr_->Cancel(GetQueryId(instance_id)); StartReceiver(TPartitionType::UNPARTITIONED, 1, 1, 1024, true, _id); - stream_mgr_->Cancel(instance_id); + stream_mgr_->Cancel(GetQueryId(instance_id)); JoinReceivers(); EXPECT_TRUE(receiver_info_[0]->status.IsCancelled()); EXPECT_TRUE(receiver_info_[1]->status.IsCancelled()); diff --git 
a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc index 1de34d6..4f306b9 100644 --- a/be/src/runtime/fragment-instance-state.cc +++ b/be/src/runtime/fragment-instance-state.cc @@ -38,7 +38,6 @@ #include "runtime/client-cache.h" #include "runtime/exec-env.h" #include "runtime/fragment-state.h" -#include "runtime/krpc-data-stream-mgr.h" #include "runtime/krpc-data-stream-sender.h" #include "runtime/mem-tracker.h" #include "runtime/query-state.h" @@ -137,7 +136,6 @@ void FragmentInstanceState::Cancel() { runtime_state_->Cancel(); PlanRootSink* root_sink = GetRootSink(); if (root_sink != nullptr) root_sink->Cancel(runtime_state_); - ExecEnv::GetInstance()->stream_mgr()->Cancel(runtime_state_->fragment_instance_id()); } Status FragmentInstanceState::Prepare() { diff --git a/be/src/runtime/krpc-data-stream-mgr.cc b/be/src/runtime/krpc-data-stream-mgr.cc index 7e47fe5..96d6b62 100644 --- a/be/src/runtime/krpc-data-stream-mgr.cc +++ b/be/src/runtime/krpc-data-stream-mgr.cc @@ -335,13 +335,16 @@ Status KrpcDataStreamMgr::DeregisterRecvr( return Status(msg); } -void KrpcDataStreamMgr::Cancel(const TUniqueId& finst_id) { - VLOG_QUERY << "cancelling active streams for fragment_instance_id=" - << PrintId(finst_id); +void KrpcDataStreamMgr::Cancel(const TUniqueId& query_id) { + VLOG_QUERY << "cancelling active streams for query_id=" << PrintId(query_id); lock_guard l(lock_); + // Fragment instance IDs are the query ID with the lower bits set to the instance + // index. Therefore all finstances for a query are clustered together, starting + // after the position in the map where the query_id would be. 
FragmentRecvrSet::iterator iter = - fragment_recvr_set_.lower_bound(make_pair(finst_id, 0)); - while (iter != fragment_recvr_set_.end() && iter->first == finst_id) { + fragment_recvr_set_.lower_bound(make_pair(query_id, 0)); + while (iter != fragment_recvr_set_.end() && + GetQueryId(iter->first) == query_id) { bool unused; shared_ptr recvr = FindRecvr(iter->first, iter->second, ); if (recvr != nullptr) { diff --git a/be/src/runtime/krpc-data-stream-mgr.h b/be/src/runtime/krpc-data-stream-mgr.h index af7ff2c..48a8144 100644 --- a/be/src/runtime/krpc-data-stream-mgr.h +++ b/be/src/runtime/krpc-data-stream-mgr.h @@ -213,7 +213,7 @@ struct EndDataStreamCtx { // /// DataStreamMgr also allows asynchronous cancellation of streams via Cancel() /// which unblocks all KrpcDataStreamRecvr::GetBatch() calls that are made on behalf -/// of the cancelled fragment id. +/// of the cancelled query id. /// /// Exposes three metrics: /// 'senders-bloc
[impala] 03/03: IMPALA-6434: Add support to decode RLE_DICTIONARY encoded pages
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 1d5fe2771fb88fc1e9fc4ff84c47c69f4ce1e142 Author: Tim Armstrong AuthorDate: Mon Dec 21 20:05:12 2020 -0800 IMPALA-6434: Add support to decode RLE_DICTIONARY encoded pages The encoding is identical to the already-supported PLAIN_DICTIONARY encoding but the PLAIN enum value is used for the dictionary pages and the RLE_DICTIONARY enum value is used for the data pages. A hidden option -write_new_parquet_dictionary_encodings is added to turn on writing too, for test purposes only. Testing: * Added an automated test using a pregenerated test file. * Ran core tests. * Manually tested by writing out TPC-H lineitem with the new encoding and reading back in Impala and Hive. Parquet-tools output for the generated test file: $ hadoop jar ~/repos/parquet-mr/parquet-tools/target/parquet-tools-1.12.0-SNAPSHOT.jar meta /test-warehouse/att/824de2afebad009f-6f460ade0003_643159826_data.0.parq 20/12/21 20:28:36 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5 20/12/21 20:28:36 INFO hadoop.ParquetFileReader: reading another 1 footers 20/12/21 20:28:36 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5 file: hdfs://localhost:20500/test-warehouse/att/824de2afebad009f-6f460ade0003_643159826_data.0.parq creator: impala version 4.0.0-SNAPSHOT (build 7b691c5d4249f0cb1ced8ddf01033fbbe10511d9) file schema: schema id: OPTIONAL INT32 L:INTEGER(32,true) R:0 D:1 bool_col:OPTIONAL BOOLEAN R:0 D:1 tinyint_col: OPTIONAL INT32 L:INTEGER(8,true) R:0 D:1 smallint_col:OPTIONAL INT32 L:INTEGER(16,true) R:0 D:1 int_col: OPTIONAL INT32 L:INTEGER(32,true) R:0 D:1 bigint_col: OPTIONAL INT64 L:INTEGER(64,true) R:0 D:1 float_col: OPTIONAL FLOAT R:0 D:1 double_col: OPTIONAL DOUBLE R:0 D:1 date_string_col: OPTIONAL BINARY R:0 D:1 string_col: OPTIONAL BINARY R:0 D:1 timestamp_col: OPTIONAL INT96 R:0 
D:1 year:OPTIONAL INT32 L:INTEGER(32,true) R:0 D:1 month: OPTIONAL INT32 L:INTEGER(32,true) R:0 D:1 row group 1: RC:8 TS:754 OFFSET:4 id: INT32 SNAPPY DO:4 FPO:48 SZ:74/73/0.99 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0, max: 7, num_nulls: 0] bool_col: BOOLEAN SNAPPY DO:0 FPO:141 SZ:26/24/0.92 VC:8 ENC:RLE,PLAIN ST:[min: false, max: true, num_nulls: 0] tinyint_col: INT32 SNAPPY DO:220 FPO:243 SZ:51/47/0.92 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0] smallint_col: INT32 SNAPPY DO:343 FPO:366 SZ:51/47/0.92 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0] int_col: INT32 SNAPPY DO:467 FPO:490 SZ:51/47/0.92 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0] bigint_col: INT64 SNAPPY DO:586 FPO:617 SZ:59/55/0.93 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0, max: 10, num_nulls: 0] float_col:FLOAT SNAPPY DO:724 FPO:747 SZ:51/47/0.92 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: -0.0, max: 1.1, num_nulls: 0] double_col: DOUBLE SNAPPY DO:845 FPO:876 SZ:59/55/0.93 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: -0.0, max: 10.1, num_nulls: 0] date_string_col: BINARY SNAPPY DO:983 FPO:1028 SZ:74/88/1.19 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0x30312F30312F3039, max: 0x30342F30312F3039, num_nulls: 0] string_col: BINARY SNAPPY DO:1143 FPO:1168 SZ:53/49/0.92 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 0x30, max: 0x31, num_nulls: 0] timestamp_col:INT96 SNAPPY DO:1261 FPO:1329 SZ:98/138/1.41 VC:8 ENC:RLE,RLE_DICTIONARY ST:[num_nulls: 0, min/max not defined] year: INT32 SNAPPY DO:1451 FPO:1470 SZ:47/43/0.91 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 2009, max: 2009, num_nulls: 0] month:INT32 SNAPPY DO:1563 FPO:1594 SZ:60/56/0.93 VC:8 ENC:RLE,RLE_DICTIONARY ST:[min: 1, max: 4, num_nulls: 0] Parquet-tools output for one of the lineitem files: $ hadoop jar ~/repos/parquet-mr/parquet-tools/target/parquet-tools-1.12.0-SNAPSHOT.jar meta /test-warehouse/li2/4b4d9143c575dd71-3f69d3cf0001_1879643220_data.0.parq 20/12/22 09:39:56 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5 
20/12/22 09:39:56 INFO hadoop.ParquetFileReader: reading another 1 footers 20/12/22 09:39:56 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5 file: hdfs://localhost:20500/test-warehouse/li2/4b4d9143c575dd71-3f69d3cf0001_1879643220_data.0.parq creator: impala version 4.0.0-SNAPSHOT (build
[impala] 01/03: IMPALA-10422: EXPLAIN statements leak ACID transactions and locks
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 03af0b2c8c4d7a9db2fbcc63b93bcb0348e4d82f Author: Zoltan Borok-Nagy AuthorDate: Tue Jan 5 13:00:58 2021 +0100 IMPALA-10422: EXPLAIN statements leak ACID transactions and locks Currently EXPLAIN statements might open ACID transactions and create locks on ACID tables. This is not necessary since we won't modify the table. But the real problem is that these transactions and locks are leaked and open forever. They are even getting heartbeated while the coordinator is still running. The solution is to not consume any ACID resources for EXPLAIN statements. Testing: * Added EXPLAIN INSERT OVERWRITE in front of an actual INSERT OVERWRITE in an e2e test Change-Id: I05113b1fd9a3eb2d0dd6cf723df916457f3fbf39 Reviewed-on: http://gerrit.cloudera.org:8080/16923 Reviewed-by: Csaba Ringhofer Tested-by: Impala Public Jenkins --- fe/src/main/java/org/apache/impala/service/Frontend.java | 4 ++-- .../workloads/functional-query/queries/QueryTest/acid-insert.test | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java index 3a71fdf..046295d 100644 --- a/fe/src/main/java/org/apache/impala/service/Frontend.java +++ b/fe/src/main/java/org/apache/impala/service/Frontend.java @@ -1639,8 +1639,8 @@ public class Frontend { return result; } } - if (analysisResult.isInsertStmt() || - analysisResult.isCreateTableAsSelectStmt()) { + if (!analysisResult.isExplainStmt() && + (analysisResult.isInsertStmt() || analysisResult.isCreateTableAsSelectStmt())) { InsertStmt insertStmt = analysisResult.getInsertStmt(); FeTable targetTable = insertStmt.getTargetTable(); if (AcidUtils.isTransactionalTable( diff --git a/testdata/workloads/functional-query/queries/QueryTest/acid-insert.test 
b/testdata/workloads/functional-query/queries/QueryTest/acid-insert.test index 8665c56..ed9a741 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/acid-insert.test +++ b/testdata/workloads/functional-query/queries/QueryTest/acid-insert.test @@ -25,6 +25,9 @@ select i from insertonly_nopart order by i; INT QUERY +# IMPALA-10422: Run EXPLAIN before INSERT OVERWRITE to check that +# EXPLAIN statements don't leak transactions and locks. +explain insert overwrite insertonly_nopart values (10); insert overwrite insertonly_nopart values (10); QUERY
[impala] branch master updated (a5f6c26 -> 1d5fe27)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from a5f6c26 IMPALA-2536: Make ColumnType constructor explicit new 03af0b2 IMPALA-10422: EXPLAIN statements leak ACID transactions and locks new 4968055 IMPALA-10182: Don't add inferred identity predicates to SELECT node new 1d5fe27 IMPALA-6434: Add support to decode RLE_DICTIONARY encoded pages The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/parquet/hdfs-parquet-scanner.cc| 9 +- be/src/exec/parquet/hdfs-parquet-table-writer.cc | 37 +-- be/src/exec/parquet/parquet-column-chunk-reader.cc | 2 + be/src/exec/parquet/parquet-column-readers.cc | 18 ++-- be/src/exec/parquet/parquet-common.h | 7 ++ be/src/exec/parquet/parquet-metadata-utils.cc | 1 + .../apache/impala/planner/SingleNodePlanner.java | 18 +++- .../java/org/apache/impala/service/Frontend.java | 4 +- testdata/data/README | 7 ++ testdata/data/alltypes_tiny_rle_dictionary.parquet | Bin 0 -> 3646 bytes .../queries/PlannerTest/inline-view.test | 113 + .../queries/QueryTest/acid-insert.test | 3 + .../queries/QueryTest/inline-view.test | 36 +++ .../queries/QueryTest/parquet-rle-dictionary.test | 16 +++ tests/query_test/test_scanners.py | 6 ++ 15 files changed, 250 insertions(+), 27 deletions(-) create mode 100644 testdata/data/alltypes_tiny_rle_dictionary.parquet create mode 100644 testdata/workloads/functional-query/queries/QueryTest/parquet-rle-dictionary.test
[impala] 02/03: IMPALA-10182: Don't add inferred identity predicates to SELECT node
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 49680559b0da843fbb6ff949d52c9d43f98364b1 Author: Aman Sinha AuthorDate: Sun Jan 3 15:58:57 2021 -0800 IMPALA-10182: Don't add inferred identity predicates to SELECT node For an inferred equality predicates of type c1 = c2 if both sides are referring to the same underlying tuple and slot, it is an identity predicate which should not be evaluated by the SELECT node since it will incorrectly eliminate NULL rows. This patch fixes the behavior. Testing: - Added planner tests with base table and with outer join - Added runtime tests with base table and with outer join - Added planner test for IMPALA-9694 (same root cause) - Ran PlannerTest .. no other plans changed Change-Id: I924044f582652dbc50085851cc639f3dee1cd1f4 Reviewed-on: http://gerrit.cloudera.org:8080/16917 Reviewed-by: Aman Sinha Tested-by: Impala Public Jenkins --- .../apache/impala/planner/SingleNodePlanner.java | 18 +++- .../queries/PlannerTest/inline-view.test | 113 + .../queries/QueryTest/inline-view.test | 36 +++ 3 files changed, 166 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java index bb00470..2a85d9e 100644 --- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java @@ -467,8 +467,24 @@ public class SingleNodePlanner { analyzer.createEquivConjuncts(tid, conjuncts); } if (conjuncts.isEmpty()) return root; + +List finalConjuncts = new ArrayList<>(); +// Check if this is an inferred identity predicate i.e for c1 = c2 both +// sides are pointing to the same source slot. In such cases it is wrong +// to add the predicate to the SELECT node because it will incorrectly +// eliminate rows with NULL values. 
+for (Expr e : conjuncts) { + if (e instanceof BinaryPredicate && ((BinaryPredicate) e).isInferred()) { +SlotDescriptor lhs = ((BinaryPredicate) e).getChild(0).findSrcScanSlot(); +SlotDescriptor rhs = ((BinaryPredicate) e).getChild(1).findSrcScanSlot(); +if (lhs != null && rhs != null && lhs.equals(rhs)) continue; + } + finalConjuncts.add(e); +} +if (finalConjuncts.isEmpty()) return root; + // evaluate conjuncts in SelectNode -SelectNode selectNode = new SelectNode(ctx_.getNextNodeId(), root, conjuncts); +SelectNode selectNode = new SelectNode(ctx_.getNextNodeId(), root, finalConjuncts); // init() marks conjuncts as assigned selectNode.init(analyzer); Preconditions.checkState(selectNode.hasValidStats()); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test index 25f7ea7..0d083e3 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test @@ -2484,3 +2484,116 @@ PLAN-ROOT SINK HDFS partitions=4/4 files=4 size=6.32KB row-size=89B cardinality=100 +# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns +select c1, c2 from (select tinyint_col c1, tinyint_col c2 + from functional.alltypesagg group by 1, 2) t1 + group by 1, 2 + union all +select c1, c2 from (select tinyint_col c1, tinyint_col c2 + from functional.alltypesagg group by 1, 2) t1 + group by 1, 2; + PLAN +PLAN-ROOT SINK +| +00:UNION +| pass-through-operands: all +| row-size=2B cardinality=18 +| +|--06:AGGREGATE [FINALIZE] +| | group by: tinyint_col, tinyint_col +| | row-size=2B cardinality=9 +| | +| 05:AGGREGATE [FINALIZE] +| | group by: tinyint_col +| | row-size=1B cardinality=9 +| | +| 04:SCAN HDFS [functional.alltypesagg] +| HDFS partitions=11/11 files=11 size=814.73KB +| row-size=1B cardinality=11.00K +| +03:AGGREGATE [FINALIZE] +| group by: tinyint_col, tinyint_col +| 
row-size=2B cardinality=9 +| +02:AGGREGATE [FINALIZE] +| group by: tinyint_col +| row-size=1B cardinality=9 +| +01:SCAN HDFS [functional.alltypesagg] + HDFS partitions=11/11 files=11 size=814.73KB + row-size=1B cardinality=11.00K + +# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns +# Introduce nulls from the null producing side of left outer join +with dt1 as (select t2.int_col y from functional.alltypessmall t1 + left outer join functional.alltypestiny t2 on t1.int_col = t2.int_col) +select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1 + group by 1, 2 + union all +select c1, c2 from (select dt1.y c1, dt1.y c2 from
[impala] 03/04: IMPALA-10336: Coordinator returns incorrect error to client
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 6b292bdd1527ec5501685c4564145d2a725195d9 Author: wzhou-code AuthorDate: Tue Dec 8 23:55:17 2020 -0800 IMPALA-10336: Coordinator returns incorrect error to client Due to a race condition, coordinator could set execution status as RPC aborted due to cancellation. This internal error should not be returned to client. This patch fixed the issue by setting the backend status as CANCELLED instead of ABORTED if the exec RPC was aborted due to cancellation. Testing: - Manual tests Since this is a racy bug, I could only reproduce the situation by adding some artificial delays in 3 places: QueryExecMgr.StartQuery(), Coordinator.UpdateBackendExecStatus(), and Coordinator::StartBackendExec() when running test case test_scanners.py::TestOrc::test_type_conversions_hive3. Verified that the issue did not happen after applying this patch by running test_scanners.py::TestOrc::test_type_conversions_hive3 in a loop for hours. - Passed exhaustive test. Change-Id: I75f252e43006c6ff6980800e3254672de396b318 Reviewed-on: http://gerrit.cloudera.org:8080/16849 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/runtime/coordinator-backend-state.cc | 14 -- be/src/runtime/coordinator-backend-state.h | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc index c7beac4..df75f35 100644 --- a/be/src/runtime/coordinator-backend-state.cc +++ b/be/src/runtime/coordinator-backend-state.cc @@ -213,8 +213,17 @@ void Coordinator::BackendState::ExecCompleteCb( rpc_latency_ = MonotonicMillis() - start_ms; if (!exec_rpc_status_.ok()) { - SetExecError( - FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), exec_status_barrier); + // Return CANCELLED instead of ABORTED if the RPC is cancelled. 
+ if (cancel_exec_rpc_ && exec_rpc_status_.IsAborted()) { +LOG(ERROR) << "ExecQueryFInstances rpc query_id=" << PrintId(query_id_) + << " was aborted by cancellation"; +status_ = Status::CANCELLED; +exec_done_ = true; +exec_status_barrier->NotifyRemaining(status_); + } else { +SetExecError( +FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), exec_status_barrier); + } goto done; } @@ -584,6 +593,7 @@ Coordinator::BackendState::CancelResult Coordinator::BackendState::Cancel( // and then wait for it to be done. if (!exec_done_) { VLogForBackend("Attempting to cancel Exec() rpc"); +cancel_exec_rpc_ = true; exec_rpc_controller_.Cancel(); WaitOnExecLocked(); } diff --git a/be/src/runtime/coordinator-backend-state.h b/be/src/runtime/coordinator-backend-state.h index d9b5bf8..9c6fc55 100644 --- a/be/src/runtime/coordinator-backend-state.h +++ b/be/src/runtime/coordinator-backend-state.h @@ -418,6 +418,9 @@ class Coordinator::BackendState { /// True if a CancelQueryFInstances RPC was already sent to this backend. bool sent_cancel_rpc_ = false; + /// True if Exec() RPC is cancelled. + bool cancel_exec_rpc_ = false; + /// Total scan ranges complete across all scan nodes. Set in ApplyExecStatusReport(). int64_t total_ranges_complete_ = 0;
[impala] 02/04: IMPALA-6671: Skip locked tables from topic updates
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 2fccd82590d747d834b8be6f3b05bb446d9bac12 Author: Vihang Karajgaonkar AuthorDate: Mon Oct 12 10:38:23 2020 -0700 IMPALA-6671: Skip locked tables from topic updates This change adds a mechanism for topic-update thread to skip a table which is locked for more than a configurable interval from the topic updates. This is especially useful in scenarios where long running operations on a locked table (refresh, recover partitions, compute stats) block the topic update thread. This causes unrelated queries which are waiting on metadata via topic updates (catalog-v1 mode) to unnecessarily block. The ideal solution of this problem would be to make HdfsTable immutable so that there is no need for table lock. But that is large change and not easily portable to older releases of Impala. It would be taken up as a separate patch. This change introduces 2 new configurations for catalogd: 1. topic_update_tbl_max_wait_time_ms: This defines the maximum time in msecs the topic update thread waits on a locked table before skipping the table from that iteration of topic updates. The default value is 500. If this configuration is set to 0 the lock with timeout for topic update thread is disabled. 2. catalog_max_lock_skipped_topic_updates: This defines the maximum number of distinct lock operations which are skipped by topic update thread due to lock contention. Once this limit is reached, topic update thread will block until it acquires the table lock and adds it to the updates. Testing: 1. Added a test case which introduces a simulated delay in a few potentially long running statements. This causes the table to be locked for a long time. The topic update thread skips that table from updates and unrelated queries are unblocked since they receive the required metadata from updates. 2. 
Added a test where multiple threads run blocking statements in a loop to stress the table lock. It makes sure that topic update thread is not starved and eventually blocks on table lock by hitting the limit defined by catalog_max_lock_skipped_topic_updates. 3. Ran exhaustive tests with default configurations. Change-Id: Ic657b96edbcdc94c6b906e7ca59291f4e4715655 Reviewed-on: http://gerrit.cloudera.org:8080/16549 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/catalog/catalog-server.cc | 15 + be/src/util/backend-gflag-util.cc | 5 + common/thrift/BackendGflags.thrift | 4 + .../apache/impala/analysis/CopyTestCaseStmt.java | 4 +- .../java/org/apache/impala/catalog/Catalog.java| 4 +- .../impala/catalog/CatalogServiceCatalog.java | 315 - .../org/apache/impala/catalog/HdfsPartition.java | 6 +- .../java/org/apache/impala/catalog/HdfsTable.java | 74 + .../main/java/org/apache/impala/catalog/Table.java | 72 - .../org/apache/impala/catalog/TopicUpdateLog.java | 18 +- .../apache/impala/service/CatalogOpExecutor.java | 109 --- .../catalog/CatalogObjectToFromThriftTest.java | 4 +- .../custom_cluster/test_topic_update_frequency.py | 213 ++ 13 files changed, 696 insertions(+), 147 deletions(-) diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc index a66e256..e14c982 100644 --- a/be/src/catalog/catalog-server.cc +++ b/be/src/catalog/catalog-server.cc @@ -74,6 +74,21 @@ DEFINE_int64_hidden(catalog_partial_fetch_rpc_queue_timeout_s, LLONG_MAX, "Maxim "(in seconds) a partial catalog object fetch RPC spends in the queue waiting " "to run. Must be set to a value greater than zero."); +DEFINE_int32(catalog_max_lock_skipped_topic_updates, 2, "Maximum number of topic " +"updates skipped for a table due to lock contention in catalogd after which it must" +"be added to the topic the update log. 
This limit only applies to distinct lock " +"operations which block the topic update thread."); + +DEFINE_int64(topic_update_tbl_max_wait_time_ms, 500, "Maximum time " + "(in milliseconds) catalog's topic update thread will wait to acquire lock on " + "table. If the topic update thread cannot acquire a table lock it skips the table " + "from that topic update and processes the table in the next update. However to " + "prevent starvation it only skips the table catalog_max_lock_skipped_topic_updates " + "many times. After that limit is hit, topic thread block until it acquires the " + "table lock. A value of 0 disables the time
[impala] branch master updated (6cb7cec -> 3f2eab8)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 6cb7cec IMPALA-10237: Support Bucket and Truncate partition transforms as built-in functions new 1b86313 IMPALA-10211 (Part 1): Add support for role-related statements new 2fccd82 IMPALA-6671: Skip locked tables from topic updates new 6b292bd IMPALA-10336: Coordinator return incorrect error to client new 3f2eab8 IMPALA-9966: Add missing breaks in SetQueryOption The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/catalog/catalog-server.cc | 15 + be/src/runtime/coordinator-backend-state.cc| 14 +- be/src/runtime/coordinator-backend-state.h |3 + be/src/service/query-options.cc| 12 +- be/src/util/backend-gflag-util.cc |5 + bin/create-test-configuration.sh | 17 + common/thrift/BackendGflags.thrift |4 + .../apache/impala/analysis/CopyTestCaseStmt.java |4 +- .../ranger/RangerCatalogdAuthorizationManager.java | 207 +++- .../ranger/RangerImpaladAuthorizationManager.java | 91 +- .../impala/authorization/ranger/RangerUtil.java| 37 + .../java/org/apache/impala/catalog/Catalog.java|4 +- .../impala/catalog/CatalogServiceCatalog.java | 315 +++-- .../org/apache/impala/catalog/HdfsPartition.java |6 +- .../java/org/apache/impala/catalog/HdfsTable.java | 74 ++ .../main/java/org/apache/impala/catalog/Table.java | 72 +- .../org/apache/impala/catalog/TopicUpdateLog.java | 18 +- .../apache/impala/service/CatalogOpExecutor.java | 109 +- .../authorization/AuthorizationTestBase.java |4 +- .../catalog/CatalogObjectToFromThriftTest.java |4 +- .../queries/QueryTest/grant_revoke.test| 1274 tests/authorization/test_ranger.py | 119 +- .../custom_cluster/test_topic_update_frequency.py | 213 23 files changed, 2405 
insertions(+), 216 deletions(-) create mode 100644 testdata/workloads/functional-query/queries/QueryTest/grant_revoke.test create mode 100644 tests/custom_cluster/test_topic_update_frequency.py
[impala] 01/04: IMPALA-10211 (Part 1): Add support for role-related statements
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 1b863132c6fbe1b45aabea9986653a9ec7817092 Author: Fang-Yu Rao AuthorDate: Sat Oct 3 12:00:30 2020 -0700 IMPALA-10211 (Part 1): Add support for role-related statements This patch adds the support for the following role-related statements. 1. CREATE ROLE . 2. DROP ROLE . 3. GRANT ROLE TO GROUP . 4. REVOKE ROLE FROM GROUP . 5. GRANT ON TO ROLE . 6. REVOKE ON FROM ROLE . 7. SHOW GRANT ROLE ON . 8. SHOW ROLES. 9. SHOW CURRENT ROLES. 10. SHOW ROLE GRANT GROUP . To support the first 4 statements, we implemented the methods of createRole()/dropRole(), and grantRoleToGroup()/revokeRoleFromGroup() with their respective API calls provided by Ranger. To support the 5th and 6th statements, we modified createGrantRevokeRequest() so that the cases in which the grantee or revokee is a role could be processed. We slightly extended getPrivileges() so as to include the case when the principal is a role for the 7th statement. For the last 3 statements, to make Impala's behavior consistent with that when Sentry was the authorization provider, we based our implementation on SentryImpaladAuthorizationManager#getRoles() at https://gerrit.cloudera.org/c/15833/8/fe/src/main/java/org/apache/impala/authorization/sentry/SentryImpaladAuthorizationManager.java, which was removed in IMPALA-9708 when we dropped the support for Sentry. To test the implemented functionalities, we based our test cases on those at https://gerrit.cloudera.org/c/15833/8/testdata/workloads/functional-query/queries/QueryTest/grant_revoke.test. 
We note that before our tests could be automatically run in a Kerberized environment (IMPALA-9360), in order to run the statements of CREATE/DROP ROLE , GRANT/REVOKE ROLE TO/FROM GROUP , and SHOW ROLES, we revised security-applicationContext.xml, one of the files needed when the Ranger server is started, so that the corresponding API calls could be performed in a non-Kerberized environment. During the process of adding test cases to grant_revoke.test, we found the following differences in Impala's behavior between the case when Ranger is the authorization provider and that when Sentry is the authorization provider. Specifically, we have the following two major differences. 1. Before dropping a role in Ranger, we have to remove all the privileges granted to the role in advance, which is not the case when Sentry is the authorization provider. 2. The resource has to be specified for the statement of SHOW GRANT ROLE ON , which is different when Sentry is the authorization provider. This could be partly due to the fact that there is no API provided by Ranger that allows Impala to directly retrieve the list of all privileges granted to a specified role. Due to the differences in Impala's behavior described above, we had to revise the test cases in grant_revoke.test accordingly. On the other hand, to include as many test cases that were in the original grant_revoke.test as possible, we had to explicitly add the test section of 'USER' to specify the connecting user to Impala for some queries that require the connecting user to be a Ranger administrator, e.g., CREATE/DROP ROLE and GRANT/REVOKE TO/FROM GROUP . The user has to be 'admin' in the current grant_revoke.test, whereas it could be the default user 'getuser()' in the original grant_revoke.test because previously 'getuser()' was also a Sentry administrator. 
Moreover, for some test cases, we had to explicitly alter the owner of a resource in the original grant_revoke.test when we would like to prevent the original owner of the resource, e.g., the creator of the resource, from accessing the resource since the original grant_revoke.test was run without object ownership being taken into consideration. We also note that in this patch we added the decorator of @pytest.mark.execute_serially to each test in test_ranger.py since we have observed that in some cases, e.g., if we are only running the E2E tests in the Jenkins environment, some tests do not seem to be executed sequentially. Testing: - Briefly verified that the implemented statements work as expected in a Kerberized cluster. - Verified that test_ranger.py passes in a local development environment. - Verified that the patch passes the exhaustive tests in the DEBUG build. Change-Id: Ic2b204e62a1d8ae1932d955b4efc28be22202860 Reviewed-on: http://gerrit.cloudera.org:8080/16837 Reviewed-by: Quanlong Huang Tested-by: Impala Public Jenkins --- bin/create-test
[impala] 02/02: IMPALA-10381: Fix overloading of --ldap_passwords_in_clear_ok
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 10fe4b6c635601768e70054255b7e50e000e71b5 Author: Thomas Tauber-Marshall AuthorDate: Mon Dec 7 15:17:37 2020 -0800 IMPALA-10381: Fix overloading of --ldap_passwords_in_clear_ok The --ldap_passwords_in_clear_ok flag was originally intended to allow configurations where Impala connects to LDAP without SSL, for testing purposes. Since then, two other uses of the flag have been added: 1) for controlling whether cookies include the 'Secure' attribute and 2) for controlling whether the webserver allows LDAP auth to be enabled if SSL isn't. Some use cases may prefer to control these values separately, so this patch separates them into three different flags. Testing: - Updated existing tests that use --ldap_passwords_in_clear_ok Change-Id: I12ee3a857365c0fca261a8b06de2321ed6b40a83 Reviewed-on: http://gerrit.cloudera.org:8080/16829 Reviewed-by: Impala Public Jenkins Tested-by: Thomas Tauber-Marshall --- be/src/rpc/authentication-util.cc | 10 +- be/src/util/webserver-test.cc | 6 -- be/src/util/webserver.cc | 8 +--- .../java/org/apache/impala/customcluster/LdapJdbcTest.java | 4 +++- .../org/apache/impala/customcluster/LdapWebserverTest.java | 3 ++- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/be/src/rpc/authentication-util.cc b/be/src/rpc/authentication-util.cc index e910557..64d0215 100644 --- a/be/src/rpc/authentication-util.cc +++ b/be/src/rpc/authentication-util.cc @@ -28,7 +28,10 @@ #include "util/openssl-util.h" #include "util/string-parser.h" -DECLARE_bool(ldap_passwords_in_clear_ok); +DEFINE_bool(cookie_require_secure, true, +"(Advanced) If true, authentication cookies will include the 'Secure' attribute, " +"indicating to clients that they should only be returned over SSL connections. 
For " +"testing only."); DEFINE_int64(max_cookie_lifetime_s, 24 * 60 * 60, "Maximum amount of time in seconds that an authentication cookie will remain valid. " "Setting to 0 disables use of cookies. Defaults to 1 day."); @@ -153,10 +156,7 @@ string GenerateCookie(const string& username, const AuthenticationHash& hash) { base64_signature[SHA256_BASE64_LEN] = '\0'; const char* secure_flag = ";Secure"; - if (FLAGS_ldap_passwords_in_clear_ok) { -// If the user specified password can be sent without TLS/SSL, don't include the -// 'Secure' flag, which indicates the cookie should only be returned over secured -// connections. This is for testing only. + if (!FLAGS_cookie_require_secure) { secure_flag = ""; } return Substitute("$0=$1$2$3;HttpOnly;Max-Age=$4$5", COOKIE_NAME, base64_signature, diff --git a/be/src/util/webserver-test.cc b/be/src/util/webserver-test.cc index 6b06444..3d94010 100644 --- a/be/src/util/webserver-test.cc +++ b/be/src/util/webserver-test.cc @@ -45,7 +45,8 @@ DECLARE_string(webserver_private_key_password_cmd); DECLARE_string(webserver_x_frame_options); DECLARE_string(ssl_cipher_list); DECLARE_string(ssl_minimum_version); -DECLARE_bool(ldap_passwords_in_clear_ok); +DECLARE_bool(webserver_ldap_passwords_in_clear_ok); +DECLARE_bool(cookie_require_secure); #include "common/names.h" @@ -376,7 +377,8 @@ TEST(Webserver, TestWithSpnego) { gflags::FlagSaver saver; FLAGS_webserver_require_spnego = true; - FLAGS_ldap_passwords_in_clear_ok = true; + FLAGS_webserver_ldap_passwords_in_clear_ok = true; + FLAGS_cookie_require_secure = false; MetricGroup metrics("webserver-test"); Webserver webserver("", FLAGS_webserver_port, ); diff --git a/be/src/util/webserver.cc b/be/src/util/webserver.cc index 3e32025..851e2e9 100644 --- a/be/src/util/webserver.cc +++ b/be/src/util/webserver.cc @@ -130,11 +130,13 @@ DEFINE_string(webserver_ldap_user_filter, "", DEFINE_string(webserver_ldap_group_filter, "", "Comma separated list of groups. 
If specified, users must belong to one of these " "groups for LDAP authentication to the webserver to succeed."); +DEFINE_bool(webserver_ldap_passwords_in_clear_ok, false, +"(Advanced) If true, allows the webserver to start with LDAP authentication even if " +"SSL is not enabled, a potentially insecure configuration."); DECLARE_bool(enable_ldap_auth); DECLARE_string(hostname); DECLARE_bool(is_coordinator); -DECLARE_bool(ldap_passwords_in_clear_ok); DECLARE_int64(max_cookie_lifetime_s); DECLARE_string(ssl_minimum_version); DECLARE_string(ssl_ciph
[impala] branch master updated (ec6070a -> 10fe4b6)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from ec6070a IMPALA-10391: Fix LIRS edge case for single unprotected entry new 8de7bf7 IMPALA-10398: Altering an Iceberg table might throw NullPointerException new 10fe4b6 IMPALA-10381: Fix overloading of --ldap_passwords_in_clear_ok The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/rpc/authentication-util.cc | 10 +- be/src/util/webserver-test.cc | 6 -- be/src/util/webserver.cc | 8 +--- .../java/org/apache/impala/util/IcebergSchemaConverter.java| 7 ++- .../java/org/apache/impala/customcluster/LdapJdbcTest.java | 4 +++- .../org/apache/impala/customcluster/LdapWebserverTest.java | 3 ++- 6 files changed, 25 insertions(+), 13 deletions(-)
[impala] 01/02: IMPALA-10398: Altering an Iceberg table might throw NullPointerException
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 8de7bf773ded7e6db9a0463222e7f552fd20735f Author: Zoltan Borok-Nagy AuthorDate: Wed Dec 16 15:01:35 2020 +0100 IMPALA-10398: Altering an Iceberg table might throw NullPointerException IcebergSchemaConverter has a static thread local member which might not have a value in the current thread when nextId() is invoked. In that case the thread local integer's get() method returns a null and we get a NullPointerException when we want to convert it to a builtin int. This patch initializes the thread local variable with an anonymous subclass of ThreadLocal that overrides the 'initialValue()' method which returns 0 instead of null. Testing * tested manually by restarting the impala cluster and issuing ALTER TABLE .. ADD COLUMNS * looped test_alter_iceberg_tables for a while Change-Id: I4e8b7c68898bd13c5288b466d5bf3d258392 Reviewed-on: http://gerrit.cloudera.org:8080/16882 Reviewed-by: Gabor Kaszab Tested-by: Impala Public Jenkins --- .../main/java/org/apache/impala/util/IcebergSchemaConverter.java | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java index 763c4fb..736a265 100644 --- a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java +++ b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java @@ -42,7 +42,12 @@ public class IcebergSchemaConverter { // them from multiple threads. Hence we use this thread-local integer to generate // unique field ids for each schema element. Please note that Iceberg only care about // the uniqueness of the field ids, but they will be reassigned by Iceberg. 
- private static ThreadLocal iThreadLocal = new ThreadLocal<>(); + private static ThreadLocal iThreadLocal = new ThreadLocal() { +@Override +public Integer initialValue() { +return 0; +} + }; /** * Transform iceberg type to impala type
[impala] 03/04: IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit d1f4a4cc457c0327cf0e5d482116b97431da154e Author: Zoltan Borok-Nagy AuthorDate: Mon Dec 14 15:20:11 2020 +0100 IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds For MAP types field id resolution indexes the top-level columns via the current 'table_idx - 1'. In this case table_idx is either SchemaPathConstants::MAP_KEY or SchemaPathConstants::MAP_VALUE which are 0 and 1 respectively. Hence 'table_idx - 1' can be -1 which is not a valid index for a vector, hence we get an ASAN error. Even if 'table_idx - 1' is zero we get a wrong field id. Note that at this point in the schema resolution we have successfully found a MAP type with a matching field id, therefore it is safe to resolve the child via the value of 'table_idx' (which is the position of the child, MAP_KEY or MAP_VALUE). Testing: * Built impala with ASAN (buildall.sh -notests -skiptests -asan), then executed test_iceberg_query Change-Id: I41e8daaebe8a6024716e6c22f6ccd819f43508bd Reviewed-on: http://gerrit.cloudera.org:8080/16873 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/parquet-metadata-utils.cc | 14 +++--- tests/query_test/test_scanners.py | 2 -- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index 2e77326..e2b1e46 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -764,17 +764,9 @@ SchemaNode* ParquetSchemaResolver::NextSchemaNode( DCHECK_EQ(col_type->type, TYPE_MAP); DCHECK(table_idx == SchemaPathConstants::MAP_KEY || table_idx == SchemaPathConstants::MAP_VALUE); - int field_id = -1; - if (table_idx == SchemaPathConstants::MAP_KEY) { -field_id = tbl_desc_.col_descs()[table_idx - 
1].field_map_key_id(); - } else { -field_id = tbl_desc_.col_descs()[table_idx - 1].field_map_value_id(); - } - file_idx = FindChildWithFieldId(node, field_id); - if (file_idx >= node->children.size()) { -// Couldn't resolve by field id, fall back to resolution by position. -file_idx = table_idx; - } + // At this point we've found a MAP with a matching field id. It's safe to resolve + // the child (key or value) by position. + file_idx = table_idx; } } else { // Resolution by position. diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 3192144..5a865cc 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -348,8 +348,6 @@ class TestIceberg(ImpalaTestSuite): create_exec_option_dimension(debug_action_options=DEBUG_ACTION_DIMS)) cls.ImpalaTestMatrix.add_constraint( lambda v: v.get_value('table_format').file_format == 'parquet') -cls.ImpalaTestMatrix.add_dimension( -ImpalaTestDimension('PARQUET_FALLBACK_SCHEMA_RESOLUTION', 2)) def test_iceberg_query(self, vector): self.run_test_case('QueryTest/iceberg-query', vector)
[impala] branch master updated (87b95a5 -> a7e71b4)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 87b95a5 IMPALA-10386: Don't allow PARTITION BY SPEC for non-Iceberg tables new a8ac9f8 IMPALA-10390: impala-profile-tool JSON output new 9dd0abb IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition new d1f4a4c IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds new a7e71b4 IMPALA-10358: Correct Iceberg type mappings The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/parquet/hdfs-parquet-table-writer.cc | 8 +- be/src/exec/parquet/hdfs-parquet-table-writer.h| 6 + be/src/exec/parquet/parquet-metadata-utils.cc | 18 +- be/src/exec/parquet/parquet-metadata-utils.h | 4 +- be/src/service/impala-hs2-server.cc| 14 +- be/src/service/impala-server.h | 5 - be/src/service/query-options.cc| 16 + be/src/service/query-options.h | 6 +- be/src/util/impala-profile-tool.cc | 79 ++- be/src/util/runtime-profile.cc | 18 + be/src/util/runtime-profile.h | 6 + common/thrift/ImpalaInternalService.thrift | 7 + common/thrift/ImpalaService.thrift | 11 + .../org/apache/impala/analysis/InsertStmt.java | 17 + .../org/apache/impala/catalog/FeIcebergTable.java | 5 + .../org/apache/impala/catalog/IcebergTable.java| 13 +- .../impala/catalog/local/LocalIcebergTable.java| 9 +- .../apache/impala/planner/DistributedPlanner.java | 21 +- .../impala/service/IcebergCatalogOpExecutor.java | 120 + .../apache/impala/util/IcebergSchemaConverter.java | 183 +++ .../java/org/apache/impala/util/IcebergUtil.java | 127 - .../org/apache/impala/planner/PlannerTest.java | 8 + .../queries/PlannerTest/tpcds-dist-method.test | 538 + .../queries/QueryTest/iceberg-insert.test | 23 - 
.../queries/QueryTest/iceberg-negative.test| 21 + tests/query_test/test_iceberg.py | 39 +- tests/query_test/test_scanners.py | 2 - 27 files changed, 999 insertions(+), 325 deletions(-) create mode 100644 fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java create mode 100644 testdata/workloads/functional-planner/queries/PlannerTest/tpcds-dist-method.test
[impala] 02/04: IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 9dd0abbb373c0256a948cdf54a1a677230e7f5cb Author: Aman Sinha AuthorDate: Thu Dec 10 23:20:05 2020 -0800 IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition The current planner tends to pick broadcast distribution in some cases even when partition distribution would be more optimal (seen in TPC-DS performance runs). This patch adds 2 query options: - use_dop_for_costing (type:boolean, default:true) - broadcast_to_partition_factor (type:double, default:1.0) With use_dop_for_costing enabled, the distributed planner will increase the cost of the broadcast join's build side by C.sqrt(m) where m = degree of parallelism of the join node and, C = the broadcast_to_partition_factor This allows the planner to more favorably consider partition distribution where appropriate. The choice of sqrt in the calculation is not a final choice at this point but is intended to model a non-linear relationship between mt_dop and the query performance. After further performance testing with tuning the above factor, we can establish a better correlation and refine the formula (tracked by IMPALA-10395). Testing: - Added a new test file with TPC-DS Q78 which shows partition distribution for a left-outer join (with store_returns on the right input) in the query when the query options are enabled (it chooses broadcast otherwise). - Ran PlannerTest and TpcdsPlannerTest. - Ran e2e tests for Tpcds and Tpch. 
Change-Id: Idff569299e5c78720ca17c616a531adac78208e1 Reviewed-on: http://gerrit.cloudera.org:8080/16864 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/query-options.cc| 16 + be/src/service/query-options.h | 6 +- common/thrift/ImpalaInternalService.thrift | 7 + common/thrift/ImpalaService.thrift | 11 + .../apache/impala/planner/DistributedPlanner.java | 21 +- .../org/apache/impala/planner/PlannerTest.java | 8 + .../queries/PlannerTest/tpcds-dist-method.test | 538 + 7 files changed, 603 insertions(+), 4 deletions(-) diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index f2cd720..cc65f08 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -988,6 +988,22 @@ Status impala::SetQueryOption(const string& key, const string& value, query_options->__set_report_skew_limit(skew_threshold); break; } + case TImpalaQueryOptions::USE_DOP_FOR_COSTING: { +query_options->__set_use_dop_for_costing(IsTrue(value)); +break; + } + case TImpalaQueryOptions::BROADCAST_TO_PARTITION_FACTOR: { +StringParser::ParseResult result; +const double val = +StringParser::StringToFloat(value.c_str(), value.length(), ); +if (result != StringParser::PARSE_SUCCESS || val < 0 || val > 1000) { + return Status(Substitute("Invalid broadcast to partition factor '$0'. " + "Only values from 0 to 1000 are allowed.", + value)); +} +query_options->__set_broadcast_to_partition_factor(val); +break; + } default: if (IsRemovedQueryOption(key)) { LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'"; diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index d61e47d..9abd042 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -47,7 +47,7 @@ typedef std::unordered_map // time we add or remove a query option to/from the enum TImpalaQueryOptions. 
#define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::OPTIMIZE_SIMPLE_LIMIT + 1);\ + TImpalaQueryOptions::BROADCAST_TO_PARTITION_FACTOR + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -225,6 +225,10 @@ typedef std::unordered_map TQueryOptionLevel::ADVANCED)\ QUERY_OPT_FN(optimize_simple_limit, OPTIMIZE_SIMPLE_LIMIT,\ TQueryOptionLevel::REGULAR)\ + QUERY_OPT_FN(use_dop_for_costing, USE_DOP_FOR_COSTING,\ + TQueryOptionLevel::ADVANCED)\ + QUERY_OPT_FN(broadcast_to_partition_factor, BROADCAST_TO_PARTITION_FACTOR,\ + TQueryOptionLevel::ADVANCED)\ ; /// Enforce practical limits on some query options to avoid undesire
[impala] 01/04: IMPALA-10390: impala-profile-tool JSON output
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a8ac9f8a57730880520448df1c72a0d5b938fa7d Author: Tim Armstrong AuthorDate: Thu Dec 10 11:04:44 2020 -0800 IMPALA-10390: impala-profile-tool JSON output Add --profile_format option that takes options "text", "json" or "prettyjson". "json" and "prettyjson" output the JSON representation of each profile in a dense single-line form and in a human-readable multi-line form respectively. Also implement usage output when --help is passed in. Change-Id: I82ae0fe9379b7e3cbe93166adaa4c37212ea0f67 Reviewed-on: http://gerrit.cloudera.org:8080/16855 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/impala-hs2-server.cc | 14 +-- be/src/service/impala-server.h | 5 --- be/src/util/impala-profile-tool.cc | 79 + be/src/util/runtime-profile.cc | 18 + be/src/util/runtime-profile.h | 6 +++ 5 files changed, 89 insertions(+), 33 deletions(-) diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc index 5c618fb..9cd55bb 100644 --- a/be/src/service/impala-hs2-server.cc +++ b/be/src/service/impala-hs2-server.cc @@ -32,8 +32,6 @@ #include #include #include -#include -#include #include #include "common/logging.h" @@ -1030,15 +1028,6 @@ void ImpalaServer::GetExecSummary(TGetExecSummaryResp& return_val, return_val.status.__set_statusCode(thrift::TStatusCode::SUCCESS_STATUS); } -void ImpalaServer::JsonProfileToStringProfile( -const rapidjson::Document& json_profile, stringstream* string_profile) { - // Serialize to JSON without extra whitespace/formatting. - rapidjson::StringBuffer sb; - rapidjson::Writer writer(sb); - json_profile.Accept(writer); - *string_profile << sb.GetString(); -} - // Add the given Thrift profile to the list of failed thrift profiles for the given // TGetRuntimeProfileResp. 
void SetFailedProfile( @@ -1069,7 +1058,8 @@ void ImpalaServer::SetProfile(TGetRuntimeProfileResp& get_profile_resp, } } else if (profile_format == TRuntimeProfileFormat::JSON) { DCHECK(profile.json_output != nullptr); -JsonProfileToStringProfile(*profile.json_output, profile.string_output); +RuntimeProfile::JsonProfileToString( +*profile.json_output, /*pretty=*/false, profile.string_output); if (set_failed_profile) { SetFailedProfile(profile.string_output, get_profile_resp); } else { diff --git a/be/src/service/impala-server.h b/be/src/service/impala-server.h index 2347030..4971dfe 100644 --- a/be/src/service/impala-server.h +++ b/be/src/service/impala-server.h @@ -769,11 +769,6 @@ class ImpalaServer : public ImpalaServiceIf, Status GetRuntimeProfileOutput(const string& user, const QueryHandle& query_handle, TRuntimeProfileFormat::type format, RuntimeProfileOutput* profile); - /// Converts a JSON Document representation of a profile to a string representation. - /// Both parameters cannot be nullptr. - void JsonProfileToStringProfile(const rapidjson::Document& json_profile, - std::stringstream* string_profile); - /// Set the profile (or thrift_profile) field for the given TRuntimeProfileFormat /// using the profile from the given RuntimeProfileOutput. If 'set_failed_profile' /// is true, then the profile is added to the 'failed_profile' field of diff --git a/be/src/util/impala-profile-tool.cc b/be/src/util/impala-profile-tool.cc index 9fe6f6f..5618978 100644 --- a/be/src/util/impala-profile-tool.cc +++ b/be/src/util/impala-profile-tool.cc @@ -19,6 +19,8 @@ #include #include #include + +#include #include #include "common/object-pool.h" @@ -26,36 +28,64 @@ #include "common/names.h" -// Utility to decode an Impala profile log from standard input. -// The profile log is consumed from standard input and each successfully parsed entry -// is pretty-printed to standard output. 
-// -// Example usage: -// impala-profile-tool < impala_profile_log_1.1-1607057366897 -// -// The following options are supported: -// --query_id=: given an impala query ID, only process profiles with this -// query id -// --min_timestamp=: only process profiles at or after this timestamp -// --max_timestamp=: only process profiles at or before this timestamp -// -// --gen_experimental_profile: if set to true, generates full output for the new -// experimental profile. +static const char* USAGE = +"Utility to decode an Impala profile log from standard input.\n" +"\n" +"The profile log is consumed from standard input and each successfully parsed entry" +" is pretty-printed to sta
[impala] 04/04: IMPALA-10358: Correct Iceberg type mappings
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a7e71b45232c90af09ff70a7596db56688cfeb31 Author: Zoltan Borok-Nagy AuthorDate: Thu Dec 10 17:41:39 2020 +0100 IMPALA-10358: Correct Iceberg type mappings The Iceberg format spec defines what types to use for different file formats, e.g.: https://iceberg.apache.org/spec/#parquet Impala should follow the specification, so this patch * annotates strings with UTF8 in Parquet metadata * removes fixed(L) <-> CHAR(L) mapping * forbids INSERTs when the Iceberg schema has a TIMESTAMPTZ column This patch also refactors the type/schema conversions as Impala => Iceberg conversions were duplicated in IcebergCatalogOpExecutor and IcebergUtil. I introduced the class 'IcebergSchemaConverter' to contain the code for conversions. Testing: * added test to check CHAR and VARCHAR types are not allowed * test that INSERTs are not allowed when the table has TIMESTMAPTZ * added test to check that strings are annotated with UTF8 Change-Id: I652565f82708824f5cf7497139153b06f116ccd3 Reviewed-on: http://gerrit.cloudera.org:8080/16851 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/hdfs-parquet-table-writer.cc | 8 +- be/src/exec/parquet/hdfs-parquet-table-writer.h| 6 + be/src/exec/parquet/parquet-metadata-utils.cc | 4 +- be/src/exec/parquet/parquet-metadata-utils.h | 4 +- .../org/apache/impala/analysis/InsertStmt.java | 17 ++ .../org/apache/impala/catalog/FeIcebergTable.java | 5 + .../org/apache/impala/catalog/IcebergTable.java| 13 +- .../impala/catalog/local/LocalIcebergTable.java| 9 +- .../impala/service/IcebergCatalogOpExecutor.java | 120 +- .../apache/impala/util/IcebergSchemaConverter.java | 183 + .../java/org/apache/impala/util/IcebergUtil.java | 127 -- .../queries/QueryTest/iceberg-insert.test | 23 --- .../queries/QueryTest/iceberg-negative.test| 21 +++ 
tests/query_test/test_iceberg.py | 39 - 14 files changed, 304 insertions(+), 275 deletions(-) diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc index 5203c3f..f835fe4 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc @@ -997,6 +997,11 @@ void HdfsParquetTableWriter::ConfigureTimestampType() { timestamp_type_ = state_->query_options().parquet_timestamp_type; } +void HdfsParquetTableWriter::ConfigureStringType() { + string_utf8_ = is_iceberg_file_ || + state_->query_options().parquet_annotate_strings_utf8; +} + Status HdfsParquetTableWriter::Init() { // Initialize file metadata file_metadata_.version = PARQUET_CURRENT_VERSION; @@ -1062,6 +1067,7 @@ Status HdfsParquetTableWriter::Init() { Codec::CodecInfo codec_info(codec, clevel); ConfigureTimestampType(); + ConfigureStringType(); columns_.resize(num_cols); // Initialize each column structure. @@ -1178,7 +1184,7 @@ Status HdfsParquetTableWriter::CreateSchema() { DCHECK_EQ(col_desc.name(), columns_[i]->column_name()); const int field_id = col_desc.field_id(); if (field_id != -1) col_schema.__set_field_id(field_id); -ParquetMetadataUtils::FillSchemaElement(col_type, state_->query_options(), +ParquetMetadataUtils::FillSchemaElement(col_type, string_utf8_, timestamp_type_, _schema); } diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.h b/be/src/exec/parquet/hdfs-parquet-table-writer.h index 672fa33..aadad1f 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.h +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.h @@ -160,6 +160,9 @@ class HdfsParquetTableWriter : public HdfsTableWriter { /// Selects the Parquet timestamp type to be used by this writer. void ConfigureTimestampType(); + /// Sets 'string_utf8_' based on query options and table type. + void ConfigureStringType(); + /// Updates output partition with some summary about the written file. 
void FinalizePartitionInfo(); @@ -225,6 +228,9 @@ class HdfsParquetTableWriter : public HdfsTableWriter { /// True if we are writing an Iceberg data file. In that case the writer behaves a /// bit differently, e.g. writes specific type of timestamps, fills some extra metadata. bool is_iceberg_file_ = false; + + /// If true, STRING values are annotated with UTF8 in Parquet metadata. + bool string_utf8_ = false; }; } diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index e2b1e46..2caa949 100644 --- a/be/src/exec/parquet/parquet-metadata-
[impala] 02/03: IMPALA-10384: Make partition names consistent between BE and FE
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit eb8b118db5be9a7c8dca9b3229e76c213e19c86c Author: Zoltan Borok-Nagy AuthorDate: Tue Dec 8 17:32:51 2020 +0100 IMPALA-10384: Make partition names consistent between BE and FE In the BE we build partition names with the trailing char '/'. In the FE we build partition names without a trailing char. We should make this consistent because this causes some annoying string adjustments in the FE and can cause hidden bugs. This patch creates partition names without the trailing '/' both in the BE and the FE. This follows Hive's behavior that also prints partition names without the trailing '/'. Testing: * Ran exhaustive tests Change-Id: I7e40111e2d1148aeb01ebc985bbb15db7d6a6012 Reviewed-on: http://gerrit.cloudera.org:8080/16850 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/hdfs-table-sink.cc |6 +- .../java/org/apache/impala/catalog/HdfsTable.java |5 +- .../apache/impala/service/CatalogOpExecutor.java | 12 +- .../impala/catalog/HdfsPartitionSdCompareTest.java |4 +- .../events/MetastoreEventsProcessorTest.java |6 +- .../queries/QueryTest/alter-table.test | 14 +- .../QueryTest/compute-stats-incremental.test |6 +- .../queries/QueryTest/compute-stats.test |6 +- .../queries/QueryTest/create-table-like-table.test |2 +- .../queries/QueryTest/create-table.test|6 +- .../queries/QueryTest/date-fileformat-support.test | 10 +- .../queries/QueryTest/date-partitioning.test | 14 +- .../queries/QueryTest/hdfs-caching.test|4 +- .../functional-query/queries/QueryTest/insert.test | 256 ++--- .../queries/QueryTest/insert_null.test | 16 +- .../queries/QueryTest/insert_overwrite.test| 28 +- .../queries/QueryTest/insert_part_key.test |2 +- .../queries/QueryTest/insert_permutation.test | 24 +- .../functional-query/queries/QueryTest/load.test |2 +- 
.../queries/QueryTest/multiple-filesystems.test|4 +- .../queries/QueryTest/partition-col-types.test | 22 +- .../tpcds-insert/queries/partitioned-insert.test | 1038 ++-- 22 files changed, 740 insertions(+), 747 deletions(-) diff --git a/be/src/exec/hdfs-table-sink.cc b/be/src/exec/hdfs-table-sink.cc index 22b269d..886919d 100644 --- a/be/src/exec/hdfs-table-sink.cc +++ b/be/src/exec/hdfs-table-sink.cc @@ -236,12 +236,12 @@ void HdfsTableSink::BuildHdfsFileNames( output_partition->tmp_hdfs_dir_name = Substitute("$0/.$1_$2_dir/", staging_dir_, unique_id_str_, rand()); - output_partition->tmp_hdfs_file_name_prefix = Substitute("$0$1$2", + output_partition->tmp_hdfs_file_name_prefix = Substitute("$0$1/$2", output_partition->tmp_hdfs_dir_name, output_partition->partition_name, query_suffix); if (partition_descriptor.location().empty()) { -output_partition->final_hdfs_file_name_prefix = Substitute("$0/$1", +output_partition->final_hdfs_file_name_prefix = Substitute("$0/$1/", table_desc_->hdfs_base_dir(), output_partition->partition_name); } else { // If the partition descriptor has a location (as set by alter table add partition @@ -472,7 +472,7 @@ Status HdfsTableSink::InitOutputPartition(RuntimeState* state, partition_name_ss << (encoded_str.empty() ? 
table_desc_->null_partition_key_value() : encoded_str); } -partition_name_ss << "/"; +if (j < partition_key_expr_evals_.size() - 1) partition_name_ss << "/"; } // partition_name_ss now holds the unique descriptor for this partition, diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java index 92aabf4..77825cc 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java @@ -1607,10 +1607,7 @@ public class HdfsTable extends Table implements FeFsTable { List parts = Lists.newArrayListWithCapacity(partitionNames.size()); for (String partitionName: partitionNames) { String partName = DEFAULT_PARTITION_NAME; - if (partitionName.length() > 0) { -// Trim the last trailing char '/' from each partition name -partName = partitionName.substring(0, partitionName.length()-1); - } + if (partitionName.length() > 0) partName = partitionName; HdfsPartition partition = nameToPartitionMap_.get(partName); Preconditions.checkNotNull(partition, "Invalid partition name: " + partName); p
[impala] 01/03: IMPALA-10385: Fix RPM repo ID capitalization for Centos 8.3
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit eceec36f69d09343f6253b7fe02e2bae505b1c5d Author: Laszlo Gaal AuthorDate: Tue Dec 8 20:57:38 2020 +0100 IMPALA-10385: Fix RPM repo ID capitalization for Centos 8.3 Centos 8.3 changed package repo ID capitalization from MixedCase to all lowercase. On Centos 8 snappy-devel is installed from the PowerTools repo, which is not enabled by default, so the install command has to enable is temporarily using the repo ID. The capitalization change broke bootstrap_system.sh, failing builds on Centos 8. The patch changes the `dnf install` call to use a glob pattern for the PowerTools repo ID to cover the naming conventions in all Centos 8.x releases. Change-Id: I224beb1189ce25ae66ecd78d70757537e117805a Reviewed-on: http://gerrit.cloudera.org:8080/16844 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- bin/bootstrap_system.sh | 4 +++- bin/impala-config.sh| 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh index 5e994f2..352d9a3 100755 --- a/bin/bootstrap_system.sh +++ b/bin/bootstrap_system.sh @@ -268,7 +268,9 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \ # Enable the Powertools repo for snappy-devel on RedHat 8 redhat8 sudo yum install -y dnf-plugins-core -redhat8 sudo yum install -y --enablerepo="PowerTools*" snappy-devel +# Package repo IDs changed from mixed case to all-lowercase between Centos 8.2 +# and 8.3, so use globbing to cover both conventions +redhat8 sudo yum install -y --enablerepo="[Pp]ower[Tt]ools*" snappy-devel # RedHat / CentOS 8 exposes only specific versions of Python. 
# Set up unversioned default Python 2.x for older CentOS versions diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 811d46b..d7baa31 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -68,7 +68,7 @@ fi # moving to a different build of the toolchain, e.g. when a version is bumped or a # compile option is changed. The build id can be found in the output of the toolchain # build jobs, it is constructed from the build number and toolchain git hash prefix. -export IMPALA_TOOLCHAIN_BUILD_ID=68-7644f7fe9c +export IMPALA_TOOLCHAIN_BUILD_ID=69-7644f7fe9c # Versions of toolchain dependencies. # --- export IMPALA_AVRO_VERSION=1.7.4-p5
[impala] branch master updated (07f3ae3 -> 4c0bdba)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 07f3ae3 IMPALA-10066: Fix test_cancellation_mid_command flakiness new eceec36 IMPALA-10385: Fix RPM repo ID capitalization for Centos 8.3 new eb8b118 IMPALA-10384: Make partition names consistent between BE and FE new 4c0bdba IMPALA-9865: part 1: basic profile log parser The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/hdfs-table-sink.cc |6 +- be/src/util/CMakeLists.txt |2 + be/src/util/impala-profile-tool.cc | 102 ++ be/src/util/runtime-profile.cc |8 + be/src/util/runtime-profile.h |4 + bin/bootstrap_system.sh|4 +- bin/impala-config.sh |2 +- .../java/org/apache/impala/catalog/HdfsTable.java |5 +- .../apache/impala/service/CatalogOpExecutor.java | 12 +- .../impala/catalog/HdfsPartitionSdCompareTest.java |4 +- .../events/MetastoreEventsProcessorTest.java |6 +- .../queries/QueryTest/alter-table.test | 14 +- .../QueryTest/compute-stats-incremental.test |6 +- .../queries/QueryTest/compute-stats.test |6 +- .../queries/QueryTest/create-table-like-table.test |2 +- .../queries/QueryTest/create-table.test|6 +- .../queries/QueryTest/date-fileformat-support.test | 10 +- .../queries/QueryTest/date-partitioning.test | 14 +- .../queries/QueryTest/hdfs-caching.test|4 +- .../functional-query/queries/QueryTest/insert.test | 256 ++--- .../queries/QueryTest/insert_null.test | 16 +- .../queries/QueryTest/insert_overwrite.test| 28 +- .../queries/QueryTest/insert_part_key.test |2 +- .../queries/QueryTest/insert_permutation.test | 24 +- .../functional-query/queries/QueryTest/load.test |2 +- .../queries/QueryTest/multiple-filesystems.test|4 +- .../queries/QueryTest/partition-col-types.test | 22 +- 
.../tpcds-insert/queries/partitioned-insert.test | 1038 ++-- 28 files changed, 860 insertions(+), 749 deletions(-) create mode 100644 be/src/util/impala-profile-tool.cc
[impala] 03/03: IMPALA-9865: part 1: basic profile log parser
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 4c0bdbada0bc0eeb0435e1ea647573566f0cddbd Author: Tim Armstrong AuthorDate: Fri Dec 4 15:27:17 2020 -0800 IMPALA-9865: part 1: basic profile log parser This adds a utility that consumes the Impala profile log format from stdin and pretty-prints the profiles. It supports some basic filters - --query_id, --min_timestamp and --max_timestamp. If --gen_experimental_profile=true is set, it dumps the aggregated part of the profile with the full output for the new experimental profiles. In a future change, we should detect this based on the profile version set. This utility will be extended in future with more options, but is already useful in that it can handle the new experimental profile format and produce pretty-printed output consistent with the Impala web UI and impala-shell. Change-Id: I6178399ac96e176f7067cc47347e51cda2f3 Reviewed-on: http://gerrit.cloudera.org:8080/16821 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/util/CMakeLists.txt | 2 + be/src/util/impala-profile-tool.cc | 102 + be/src/util/runtime-profile.cc | 8 +++ be/src/util/runtime-profile.h | 4 ++ 4 files changed, 116 insertions(+) diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index f04d1c1..b5e43da 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -163,8 +163,10 @@ add_library(loggingsupport SHARED ) add_executable(parquet-reader parquet-reader.cc) +add_executable(impala-profile-tool impala-profile-tool.cc) target_link_libraries(parquet-reader ${IMPALA_LINK_LIBS}) +target_link_libraries(impala-profile-tool ${IMPALA_LINK_LIBS}) target_link_libraries(loggingsupport ${IMPALA_LINK_LIBS_DYNAMIC_TARGETS}) diff --git a/be/src/util/impala-profile-tool.cc b/be/src/util/impala-profile-tool.cc new file mode 100644 index 000..9fe6f6f --- /dev/null +++ 
b/be/src/util/impala-profile-tool.cc @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "common/object-pool.h" +#include "util/runtime-profile.h" + +#include "common/names.h" + +// Utility to decode an Impala profile log from standard input. +// The profile log is consumed from standard input and each successfully parsed entry +// is pretty-printed to standard output. +// +// Example usage: +// impala-profile-tool < impala_profile_log_1.1-1607057366897 +// +// The following options are supported: +// --query_id=: given an impala query ID, only process profiles with this +// query id +// --min_timestamp=: only process profiles at or after this timestamp +// --max_timestamp=: only process profiles at or before this timestamp +// +// --gen_experimental_profile: if set to true, generates full output for the new +// experimental profile. 
+DEFINE_string(query_id, "", "Query ID to output profiles for"); +DEFINE_int64(min_timestamp, -1, "Minimum timestamp (inclusive) to output profiles for"); +DEFINE_int64(max_timestamp, -1, "Maximum timestamp (inclusive) to output profiles for"); + +using namespace impala; + +using std::cerr; +using std::cin; +using std::cout; +using std::istringstream; + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + int errors = 0; + string line; + int lineno = 1; + // Read profile log lines from stdin. + for (; getline(cin, line); ++lineno) { +// Parse out fields from the line. +istringstream liness(line); +int64_t timestamp; +string query_id, encoded_profile; +liness >> timestamp >> query_id >> encoded_profile; +if (liness.fail()) { + cerr << "Error parsing line " << lineno << ": '" << line << "'\n"; + ++errors; + continue; +} + +
[impala] branch master updated: IMPALA-10343: increase control_service_queue_mem_limit
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new c04de99 IMPALA-10343: increase control_service_queue_mem_limit c04de99 is described below commit c04de9933d174ebf7fc3779faa1fa54de953be93 Author: Tim Armstrong AuthorDate: Wed Dec 9 14:08:33 2020 -0800 IMPALA-10343: increase control_service_queue_mem_limit --control_service_queue_mem_limit is set to 1% by default to increase the maximum size of the queue in typical production deployments. E.g. an Impala daemon with a 50GB memory limit will have a limit of 512MB on the control service queue. Add --control_service_queue_mem_limit_floor_bytes so that this does not have the unintended effect of reducing the memory given to the control service queue. I.e. the default behaviour does not change for impala daemons with a daemon mem limit of <= 5000MB, but does increase the control service queue memory limit for impala daemons with mem limits > 5000MB. The default process memory limit in the mocked backend test ExecEnv is changed to be 8GB. Previously it was unlimited, so we couldn't calculate 1% of it. It cannot be unlimited in an actual impalad since IMPALA-5653 was fixed. Testing: This had been previously problematic on a 64 node TPC-DS workload with mt_dop=12 where impalads had ~100GB of memory. Status report RPCs would fail and have to be retried. We tested this new value on the same workload and the retries were avoided. 
Change-Id: Ic7fe93b5ce7eb6b63e48293ac287d98cc1d9e3fa Reviewed-on: http://gerrit.cloudera.org:8080/16848 Tested-by: Impala Public Jenkins Reviewed-by: Thomas Tauber-Marshall --- be/src/runtime/test-env.h| 4 +++- be/src/service/control-service.cc| 7 ++- tests/custom_cluster/test_rpc_timeout.py | 4 +++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h index 108b62f..6e50fd0 100644 --- a/be/src/runtime/test-env.h +++ b/be/src/runtime/test-env.h @@ -96,7 +96,9 @@ class TestEnv { int64_t buffer_pool_capacity_; /// Arguments for process memory tracker, used in Init(). - int64_t process_mem_limit_ = -1; + /// Default to 8GB, which should be enough for any tests that are not deliberately + /// allocating large amounts of memory. + int64_t process_mem_limit_ = 8L * 1024L * 1024L * 1024L; bool process_mem_tracker_use_metrics_ = false; /// Global state for test environment. diff --git a/be/src/service/control-service.cc b/be/src/service/control-service.cc index 3db1055..8c6cea0 100644 --- a/be/src/service/control-service.cc +++ b/be/src/service/control-service.cc @@ -49,7 +49,11 @@ using kudu::rpc::RpcContext; static const string QUEUE_LIMIT_MSG = "(Advanced) Limit on RPC payloads consumption for " "ControlService. " + Substitute(MEM_UNITS_HELP_MSG, "the process memory limit"); -DEFINE_string(control_service_queue_mem_limit, "50MB", QUEUE_LIMIT_MSG.c_str()); +DEFINE_string(control_service_queue_mem_limit, "1%", QUEUE_LIMIT_MSG.c_str()); +DEFINE_int64(control_service_queue_mem_limit_floor_bytes, 50L * 1024L * 1024L, +"Lower bound on --control_service_queue_mem_limit in bytes. If " +"--control_service_queue_mem_limit works out to be less than this amount, " +"this value is used instead"); DEFINE_int32(control_service_num_svc_threads, 0, "Number of threads for processing " "control service's RPCs. if left at default value 0, it will be set to number of " "CPU cores. 
Set it to a positive value to change from the default."); @@ -68,6 +72,7 @@ ControlService::ControlService(MetricGroup* metric_group) CLEAN_EXIT_WITH_ERROR(Substitute("Invalid mem limit for control service queue: " "'$0'.", FLAGS_control_service_queue_mem_limit)); } + bytes_limit = max(bytes_limit, FLAGS_control_service_queue_mem_limit_floor_bytes); mem_tracker_.reset(new MemTracker( bytes_limit, "Control Service Queue", process_mem_tracker)); MemTrackerMetric::CreateMetrics(metric_group, mem_tracker_.get(), "ControlService"); diff --git a/tests/custom_cluster/test_rpc_timeout.py b/tests/custom_cluster/test_rpc_timeout.py index 59a809a..797efb2 100644 --- a/tests/custom_cluster/test_rpc_timeout.py +++ b/tests/custom_cluster/test_rpc_timeout.py @@ -154,7 +154,9 @@ class TestRPCTimeout(CustomClusterTestSuite): # the retry paths in the ReportExecStatus() RPC @pytest.mark.execute_serially @CustomClusterTestSuite.with_args("--status_report_interval_ms=100" - " --cont
[impala] branch master updated: IMPALA-10361: Use field id to resolve columns for Iceberg tables
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new a850cd3 IMPALA-10361: Use field id to resolve columns for Iceberg tables a850cd3 is described below commit a850cd3cc6e5928851ec4c06c66cdc5299097b30 Author: skyyws AuthorDate: Fri Nov 27 17:02:19 2020 +0800 IMPALA-10361: Use field id to resolve columns for Iceberg tables We supported resolve column by field id for Iceberg table in this patch. Currently, we use field id to resolve column for Iceberg tables, which means 'PARQUET_FALLBACK_SCHEMA_RESOLUTION' is invalid for Iceberg tables. Change-Id: I057bdc6ab2859cc4d40de5ed428d0c20028b8435 Reviewed-on: http://gerrit.cloudera.org:8080/16788 Tested-by: Impala Public Jenkins Reviewed-by: Zoltan Borok-Nagy --- be/src/exec/parquet/parquet-metadata-utils.cc | 42 ++ be/src/exec/parquet/parquet-metadata-utils.h | 8 +- be/src/runtime/descriptors.cc | 3 + be/src/runtime/descriptors.h | 4 + be/src/runtime/row-batch-serialize-test.cc | 7 + be/src/runtime/types.cc| 12 +- be/src/runtime/types.h | 2 + be/src/service/query-options-test.cc | 2 +- common/thrift/CatalogObjects.thrift| 3 + common/thrift/Descriptors.thrift | 3 + common/thrift/ImpalaInternalService.thrift | 2 + common/thrift/Types.thrift | 2 + .../java/org/apache/impala/catalog/Column.java | 12 +- .../org/apache/impala/catalog/IcebergColumn.java | 15 ++- .../apache/impala/catalog/IcebergStructField.java | 57 .../org/apache/impala/catalog/IcebergTable.java| 23 +++- .../java/org/apache/impala/catalog/StructType.java | 11 ++ .../main/java/org/apache/impala/catalog/Table.java | 19 ++- .../main/java/org/apache/impala/catalog/Type.java | 8 +- .../impala/catalog/local/LocalIcebergTable.java| 5 +- .../apache/impala/catalog/local/LocalTable.java| 96 +- .../org/apache/impala/planner/IcebergScanNode.java | 3 + 
.../java/org/apache/impala/util/IcebergUtil.java | 7 +- testdata/data/README | 6 + ...79bd6-4b97-4680-b4e1-52e93b6ce04e-0.parquet | Bin 0 -> 3855 bytes ...c9b7a-f42d-4245-b806-dfa7a792593f-0.parquet | Bin 0 -> 3855 bytes ...fe2d6-b0d9-42d6-bc95-15f52ecb29ad-0.parquet | Bin 0 -> 3855 bytes ...8e294-5992-48d9-a18e-08e129bb418c-0.parquet | Bin 0 -> 3854 bytes ...fcf22-3de2-489a-b1ec-d5141e75a8e8-0.parquet | Bin 0 -> 3855 bytes ...1dc85-b8f3-4cc2-a5c6-38b7fee49709-0.parquet | Bin 0 -> 3854 bytes ...510cc-e765-43bc-be03-c5561a8d50a3-0.parquet | Bin 0 -> 3855 bytes ...afc4a-b718-406d-a532-58fab5c8f85d-0.parquet | Bin 0 -> 3855 bytes ...a8e89-8aeb-4405-be64-76557432cf21-0.parquet | Bin 0 -> 3870 bytes ...d552a-fddc-42f3-adfd-ecba20a01d80-0.parquet | Bin 0 -> 3870 bytes ...7db43-3b9a-4a50-9946-d003cc1d461c-0.parquet | Bin 0 -> 3870 bytes ...895d0-1f42-4c30-989f-968802831077-0.parquet | Bin 0 -> 3870 bytes ...9ac6d-aeee-4c35-9f8a-1a03127d33b8-0.parquet | Bin 0 -> 3870 bytes ...a3ad9-737c-4416-a32c-501cc9a4aa90-0.parquet | Bin 0 -> 3870 bytes ...78795-ff6a-4a20-9fff-8dc4907c1ba7-0.parquet | Bin 0 -> 3887 bytes ...a35cb-22b5-4a5d-932b-89f222b0b2c7-0.parquet | Bin 0 -> 3887 bytes ...c4c8c-de16-487a-89b3-ee8e58b4fd07-0.parquet | Bin 0 -> 3884 bytes ...ba706-2ace-4d41-b475-6bda3ba72306-0.parquet | Bin 0 -> 3884 bytes ...5e226-5f8f-49b9-b998-039b8362b7a0-0.parquet | Bin 0 -> 3887 bytes ...c3538-052d-493e-9479-b59fc8aece0f-0.parquet | Bin 0 -> 3887 bytes .../4ceee3ab-8653-423c-b8ac-0ad5f7b0579b-m0.avro | Bin 0 -> 6481 bytes ...178-1-4ceee3ab-8653-423c-b8ac-0ad5f7b0579b.avro | Bin 0 -> 2594 bytes .../metadata/v1.metadata.json | 125 ++ .../metadata/v2.metadata.json | 144 + .../metadata/version-hint.text | 1 + .../functional/functional_schema_template.sql | 14 ++ .../datasets/functional/schema_constraints.csv | 1 + .../queries/QueryTest/iceberg-query.test | 103 +++ .../QueryTest/parquet-resolution-by-name.test | 2 +- .../functional-query/queries/QueryTest/set.test| 2 +- 
tests/query_test/test_scanners.py | 2 + 55 files changed, 724 insertions(+), 22 deletions(-) diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index dbc5510..2e77326 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++
[impala] branch master updated: IMPALA-9985/IMPALA-10378: Bump toolchain to include Centos 8.2
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 69578db IMPALA-9985/IMPALA-10378: Bump toolchain to include Centos 8.2 69578db is described below commit 69578db9ae2c7befc1455d4dc8d29ce6e5bcf517 Author: Laszlo Gaal AuthorDate: Tue Dec 8 15:52:09 2020 +0100 IMPALA-9985/IMPALA-10378: Bump toolchain to include Centos 8.2 Bump the toolchain to a newer version that has Centos 8 binaries that were built on Centos 8.2. Centos 8.2 contains an updated version of glibc, which fixes several bugs, at least one of which had also impacted Impala Centos 8 builds through the binary toolchain. Details are described in see IMPALA-9985. The new toolchain binaries can now compile Impala for Centos 8 without the problems seen earlier in IMPALA-9985. This also means that the minimum Centos 8 version required for Impala is now Centos 8.2.2004. Unrelated to these bugs, the same toolchain version is now the first one to drop support for Debian 8. As Debian 8 support was discontinued on 2020-06-30 (see: https://www.debian.org/News/2020/20200709), IMPALA-10378 removes Debian 8 support from the toolchain. Change-Id: I061506e44872fc980630fc006658fdc778a75dd1 Reviewed-on: http://gerrit.cloudera.org:8080/16843 Tested-by: Impala Public Jenkins Reviewed-by: Tim Armstrong --- bin/impala-config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 82488ed..811d46b 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -68,7 +68,7 @@ fi # moving to a different build of the toolchain, e.g. when a version is bumped or a # compile option is changed. The build id can be found in the output of the toolchain # build jobs, it is constructed from the build number and toolchain git hash prefix. 
-export IMPALA_TOOLCHAIN_BUILD_ID=62-ab817885e4 +export IMPALA_TOOLCHAIN_BUILD_ID=68-7644f7fe9c # Versions of toolchain dependencies. # --- export IMPALA_AVRO_VERSION=1.7.4-p5
[impala] branch master updated: IMPALA-10252: fix invalid runtime filters for outer joins
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new f684ed7 IMPALA-10252: fix invalid runtime filters for outer joins f684ed7 is described below commit f684ed72c541fa04dc1841a1aab83a7c9847f1a2 Author: Tim Armstrong AuthorDate: Wed Oct 21 13:59:25 2020 -0700 IMPALA-10252: fix invalid runtime filters for outer joins The planner generates runtime filters for non-join conjuncts assigned to LEFT OUTER and FULL OUTER JOIN nodes. This is correct in many cases where NULLs stemming from unmatched rows would result in the predicate evaluating to false. E.g. x = y is always false if y is NULL. However, it is incorrect if the NULL returned from the unmatched row can result in the predicate evaluating to true. E.g. x = isnull(y, 1) can return true even if y is NULL. The fix is to detect cases when the source expression from the left input of the join returns non-NULL for null inputs and then skip generating the filter. Examples of expressions that may be affected by this change are COALESCE and ISNULL. Testing: Added regression tests: * Planner tests for LEFT OUTER and FULL OUTER where the runtime filter was incorrectly generated before this patch. * Enabled end-to-end test that was previously failing. * Added a new runtime filter test that will execute on both Parquet and Kudu (which are subtly different because of nullability of slots). Ran exhaustive tests. 
Change-Id: I507af1cc8df15bca21e0d8555019997812087261 Reviewed-on: http://gerrit.cloudera.org:8080/16622 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../java/org/apache/impala/analysis/Analyzer.java | 22 +++- .../impala/planner/RuntimeFilterGenerator.java | 31 .../PlannerTest/runtime-filter-propagation.test| 59 +- .../queries/QueryTest/runtime_filters.test | 33 .../queries/QueryTest/subquery.test| 6 +-- tests/query_test/test_queries.py | 2 - 6 files changed, 144 insertions(+), 9 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index a5b1e36..8a73e07 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -2301,6 +2301,25 @@ public class Analyzer { * returns false otherwise. Throws if backend expression evaluation fails. */ public boolean isTrueWithNullSlots(Expr p) throws InternalException { +Expr nullTuplePred = substituteNullSlots(p); +return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx()); + } + + /** + * Try to evaluate 'p' with all NULL slots into a literal. + * @return null if it could not be evaluated successfully, the literal otherwise. + * @throws AnalysisException + */ + public LiteralExpr evalWithNullSlots(Expr p) throws AnalysisException { +Expr nullTuplePred = substituteNullSlots(p); +return LiteralExpr.createBounded( +nullTuplePred, getQueryCtx(), StringLiteral.MAX_STRING_LEN); + } + + /** + * Replace all the SlotRefs in 'p' with null literals + */ + private Expr substituteNullSlots(Expr p) { // Construct predicate with all SlotRefs substituted by NullLiterals. List slotRefs = new ArrayList<>(); p.collect(Predicates.instanceOf(SlotRef.class), slotRefs); @@ -2313,8 +2332,7 @@ public class Analyzer { // function signature as in the original predicate. 
nullSmap.put(slotRef.clone(), NullLiteral.create(slotRef.getType())); } -Expr nullTuplePred = p.substitute(nullSmap, this, false); -return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx()); +return p.substitute(nullSmap, this, false); } public TupleId getTupleId(SlotId slotId) { diff --git a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java index 8356d78..1422129 100644 --- a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java +++ b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java @@ -35,6 +35,8 @@ import org.apache.impala.analysis.Expr; import org.apache.impala.analysis.ExprSubstitutionMap; import org.apache.impala.analysis.FunctionCallExpr; import org.apache.impala.analysis.IsNullPredicate; +import org.apache.impala.analysis.LiteralExpr; +import org.apache.impala.analysis.NullLiteral; import org.apache.impala.analysis.Predicate; import org.apache.impala.analysis.SlotDescriptor; import org.apache.impala.analysis.SlotId; @@ -368,6 +370,35 @@ public final
[impala] branch master updated: IMPALA-10366: skip test_runtime_profile_aggregated for EC
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 62c19e6 IMPALA-10366: skip test_runtime_profile_aggregated for EC 62c19e6 is described below commit 62c19e63396c1e783e93b4e7fb71804b70955e89 Author: Tim Armstrong AuthorDate: Mon Nov 30 19:53:49 2020 -0800 IMPALA-10366: skip test_runtime_profile_aggregated for EC The schedule for erasure coded data results in 3 instead of 4 instances of the fragment with the scan. Skip the test - we don't need special coverage for erasure coding. Change-Id: I2bb47d89f6d6c59242f2632c481f26d93e28e33e Reviewed-on: http://gerrit.cloudera.org:8080/16799 Reviewed-by: Aman Sinha Tested-by: Impala Public Jenkins --- tests/common/skip.py | 1 + tests/custom_cluster/test_runtime_profile.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/common/skip.py b/tests/common/skip.py index a340360..21bbfdd 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -195,6 +195,7 @@ class SkipIfEC: fix_later = pytest.mark.skipif(IS_EC, reason="It should work but doesn't.") contain_full_explain = pytest.mark.skipif(IS_EC, reason="Contain full explain output " "for hdfs tables.") + different_schedule = pytest.mark.skipif(IS_EC, reason="Query is scheduled differently.") class SkipIfDockerizedCluster: diff --git a/tests/custom_cluster/test_runtime_profile.py b/tests/custom_cluster/test_runtime_profile.py index 73d7b50..933ad7c 100644 --- a/tests/custom_cluster/test_runtime_profile.py +++ b/tests/custom_cluster/test_runtime_profile.py @@ -17,6 +17,7 @@ import pytest from tests.common.custom_cluster_test_suite import CustomClusterTestSuite +from tests.common.skip import SkipIfEC class TestRuntimeProfile(CustomClusterTestSuite): @@ -28,6 +29,7 @@ class TestRuntimeProfile(CustomClusterTestSuite): PERIODIC_COUNTER_UPDATE_FLAG = 
'--periodic_counter_update_period_ms=50' + @SkipIfEC.different_schedule @pytest.mark.execute_serially @CustomClusterTestSuite.with_args('--gen_experimental_profile=true ' + PERIODIC_COUNTER_UPDATE_FLAG)
[impala] branch master updated: IMPALA-9355: TestExchangeMemUsage.test_exchange_mem_usage_scaling doesn't hit the memory limit
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 6314610 IMPALA-9355: TestExchangeMemUsage.test_exchange_mem_usage_scaling doesn't hit the memory limit 6314610 is described below commit 63146103a76592fe7ee583f83bc1d5e0385353b7 Author: Qifan Chen AuthorDate: Sun Nov 29 15:48:51 2020 -0500 IMPALA-9355: TestExchangeMemUsage.test_exchange_mem_usage_scaling doesn't hit the memory limit This patch reduces the memory limit for the following query in test_exchange_mem_usage_scaling test from 170MB to 164MB to reduce the chance of not detecting a memory allocation failure. set mem_limit= set num_scanner_threads=1; select * from tpch_parquet.lineitem l1 join tpch_parquet.lineitem l2 on l1.l_orderkey = l2.l_orderkey and l1.l_partkey = l2.l_partkey and l1.l_suppkey = l2.l_suppkey and l1.l_linenumber = l2.l_linenumber order by l1.l_orderkey desc, l1.l_partkey, l1.l_suppkey, l1.l_linenumber limit 5; In a test with 500 executions of the above query with the memory limit set to 164MB, there were 500 memory allocation failures in total (one in each execution), and a total of 266 of them from Exchange Node #4. Testing: Ran the query in question individually; Ran TestExchangeMemUsage.test_exchange_mem_usage_scaling test; Ran core tests. 
Change-Id: Id945d7e37fac07beb7808e6ccf8530e667cbaad4 Reviewed-on: http://gerrit.cloudera.org:8080/16791 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- .../functional-query/queries/QueryTest/exchange-mem-scaling.test| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testdata/workloads/functional-query/queries/QueryTest/exchange-mem-scaling.test b/testdata/workloads/functional-query/queries/QueryTest/exchange-mem-scaling.test index 4affa6e..b955bc9 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/exchange-mem-scaling.test +++ b/testdata/workloads/functional-query/queries/QueryTest/exchange-mem-scaling.test @@ -4,7 +4,7 @@ # it hits the memory limit in the exchange node when allocating receiver-side # buffers. It's also possible but less likely that this will hit a memory limit # in the scan nodes. -set mem_limit=170m; +set mem_limit=164m; set num_scanner_threads=1; select * from tpch_parquet.lineitem l1
[impala] branch master updated: IMPALA-10314: Optimize planning time for simple limits
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 5530b62 IMPALA-10314: Optimize planning time for simple limits 5530b62 is described below commit 5530b62539e762ddf5825e2b43db2f29d9addae7 Author: Aman Sinha AuthorDate: Wed Nov 11 18:38:11 2020 -0800 IMPALA-10314: Optimize planning time for simple limits This patch optimizes the planning time for simple limit queries by only considering a minimal set of partitions whose file descriptors add up to N (the specified limit). Each file is conservatively estimated to contain 1 row. This reduces the number of partitions processed by HdfsScanNode.computeScanRangeLocations() which, according to query profiling, has been the main contributor to the planning time especially for large number of partitions. Further, within each partition, we only consider the number of non-empty files that brings the total to N. This is an opt-in optimization. A new planner option OPTIMIZE_SIMPLE_LIMIT enables this optimization. Further, if there's a WHERE clause, it must have an 'always_true' hint in order for the optimization to be considered. For example: set optimize_simple_limit = true; SELECT * FROM T WHERE /* +always_true */ LIMIT 10; If there are too many empty files in the partitions, it is possible that the query may produce fewer rows although those are still valid rows. Testing: - Added planner tests for the optimization - Ran query_test.py tests by enabling the optimize_simple_limit - Added an e2e test. Since result rows are non-deterministic, only simple count(*) query on top of subquery with limit was added. 
Change-Id: I9d6a79263bc092e0f3e9a1d72da5618f3cc35574 Reviewed-on: http://gerrit.cloudera.org:8080/16723 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/query-options.cc| 3 + be/src/service/query-options.h | 4 +- common/thrift/ImpalaInternalService.thrift | 3 + common/thrift/ImpalaService.thrift | 8 + fe/src/main/cup/sql-parser.cup | 12 +- .../java/org/apache/impala/analysis/Analyzer.java | 17 ++ .../main/java/org/apache/impala/analysis/Expr.java | 40 .../org/apache/impala/analysis/PartitionSet.java | 3 +- .../java/org/apache/impala/analysis/Predicate.java | 9 + .../org/apache/impala/analysis/SelectStmt.java | 39 .../apache/impala/planner/HdfsPartitionPruner.java | 40 +++- .../org/apache/impala/planner/HdfsScanNode.java| 49 +++- .../apache/impala/planner/SingleNodePlanner.java | 23 +- .../org/apache/impala/analysis/ParserTest.java | 4 +- .../org/apache/impala/planner/PlannerTest.java | 11 + .../queries/PlannerTest/optimize-simple-limit.test | 258 + .../QueryTest/range-constant-propagation.test | 14 ++ 17 files changed, 517 insertions(+), 20 deletions(-) diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 900fe76..f2cd720 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -329,6 +329,9 @@ Status impala::SetQueryOption(const string& key, const string& value, case TImpalaQueryOptions::OPTIMIZE_PARTITION_KEY_SCANS: query_options->__set_optimize_partition_key_scans(IsTrue(value)); break; + case TImpalaQueryOptions::OPTIMIZE_SIMPLE_LIMIT: +query_options->__set_optimize_simple_limit(IsTrue(value)); +break; case TImpalaQueryOptions::REPLICA_PREFERENCE: { map valid_enums_values = { {0, "CACHE_LOCAL"}, diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index ade7950..d61e47d 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -47,7 +47,7 @@ typedef std::unordered_map // time we add or remove a query option to/from 
the enum TImpalaQueryOptions. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::REPORT_SKEW_LIMIT + 1);\ + TImpalaQueryOptions::OPTIMIZE_SIMPLE_LIMIT + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -223,6 +223,8 @@ typedef std::unordered_map TQueryOptionLevel::ADVANCED)\ QUERY_OPT_FN(report_skew_limit, REPORT_SKEW_LIMIT,\ TQueryOptionLevel::ADVANCED)\ + QUERY_OPT_FN(optimize_simple_limit, OPTIMIZE_SIMPLE_LIMIT,\ + TQueryOptionLevel::REGUL
[impala] branch master updated: Fix for startup crash in scheduler-benchmark.
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new a4cf449 Fix for startup crash in scheduler-benchmark. a4cf449 is described below commit a4cf449c88ef3fe08db9abbad82664b99382014c Author: Shant Hovsepian AuthorDate: Mon Nov 23 22:05:36 2020 -0500 Fix for startup crash in scheduler-benchmark. Updated scheduler-benchmark's main() to use the newer impala::InitCommonRuntime() initialization methods. Before this change the benchmark would compile properly but crash when run. Testing: * built and ran in debug and release mode. Change-Id: Ib9fba3b97f102e41f2024a2bfaacbf0568bd4c68 Reviewed-on: http://gerrit.cloudera.org:8080/16778 Reviewed-by: Joe McDonnell Tested-by: Impala Public Jenkins --- be/src/benchmarks/scheduler-benchmark.cc | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/be/src/benchmarks/scheduler-benchmark.cc b/be/src/benchmarks/scheduler-benchmark.cc index f7df4ef..3fc6079 100644 --- a/be/src/benchmarks/scheduler-benchmark.cc +++ b/be/src/benchmarks/scheduler-benchmark.cc @@ -26,7 +26,10 @@ #include "util/debug-util.h" #include "util/thread.h" +#include "codegen/llvm-codegen.h" +#include "common/init.h" #include "common/names.h" +#include "service/fe-support.h" using namespace impala; using namespace impala::test; @@ -160,9 +163,9 @@ void RunNumBlocksBenchmark(TReplicaPreference::type replica_preference) { } int main(int argc, char** argv) { - google::InitGoogleLogging(argv[0]); - CpuInfo::Init(); - impala::InitThreading(); + impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST); + impala::InitFeSupport(); + ABORT_IF_ERROR(LlvmCodeGen::InitializeLlvm()); cout << Benchmark::GetMachineInfo() << endl; RunClusterSizeBenchmark(TReplicaPreference::DISK_LOCAL);
[impala] branch master updated: IMPALA-9382: part 2/3: aggregate profiles sent to coordinator
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 9429bd7 IMPALA-9382: part 2/3: aggregate profiles sent to coordinator 9429bd7 is described below commit 9429bd779de986d3e61858bef7e258bd73a2cacd Author: Tim Armstrong AuthorDate: Sun May 17 16:37:46 2020 -0700 IMPALA-9382: part 2/3: aggregate profiles sent to coordinator This reworks the status reporting so that serialized AggregatedRuntimeProfile objects are sent from executors to coordinators. These profiles are substantially denser and faster to process for higher mt_dop values. The aggregation is also done in a single step, merging the aggregated thrift profile from the executor directly into the final aggregated profile, instead of converting it to an unaggregated profile first. The changes required were: * A new Update() method for AggregatedRuntimeProfile that updates the profile from a serialised AggregateRuntimeProfile for a subset of the instances. The code is generalized from the existing InitFromThrift() code path. * Per-fragment reports included in the status report protobuf when --gen_experimental_profile=true. * Logic on the coordinator that either consumes serialized AggregatedRuntimeProfile per fragment, when --gen_experimental_profile=true, or consumes a serialized RuntimeProfile per finstance otherwise. This also adds support for event sequences and time series in the aggregated profile, so the amount of information in the aggregated profile is now on par with the basic profile. We also finish off support for JSON profile. The JSON profile is more stripped down because we do not need to round-trip profiles via JSON and it is a much less dense profile representation. Part 3 will clean up and improve the display of the profile. Testing: * Add sanity tests for aggregated runtime profile. 
* Add unit tests to exercise aggregation of the various counter types * Ran core tests. Change-Id: Ic680cbfe94c939c2a8fad9d0943034ed058c6bca Reviewed-on: http://gerrit.cloudera.org:8080/16057 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/runtime/coordinator-backend-state.cc| 62 +- be/src/runtime/coordinator-backend-state.h | 30 +- be/src/runtime/fragment-instance-state.cc | 14 +- be/src/runtime/fragment-instance-state.h | 5 +- be/src/runtime/fragment-state.cc | 5 + be/src/runtime/fragment-state.h| 6 + be/src/runtime/query-state.cc | 40 +- be/src/runtime/query-state.h | 5 + be/src/service/impala-server.cc| 2 + be/src/util/runtime-profile-counters.h | 30 +- be/src/util/runtime-profile-test.cc| 337 - be/src/util/runtime-profile.cc | 814 - be/src/util/runtime-profile.h | 118 ++- common/protobuf/control_service.proto | 23 +- common/thrift/ImpalaInternalService.thrift | 4 + common/thrift/RuntimeProfile.thrift| 38 +- .../tpch/queries/runtime-profile-aggregated.test | 29 + tests/custom_cluster/test_runtime_profile.py | 46 ++ 18 files changed, 1348 insertions(+), 260 deletions(-) diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc index 602bfca..c7beac4 100644 --- a/be/src/runtime/coordinator-backend-state.cc +++ b/be/src/runtime/coordinator-backend-state.cc @@ -384,6 +384,9 @@ bool Coordinator::BackendState::ApplyExecStatusReport( vector* aux_error_info, const vector& fragment_stats) { DCHECK(!IsEmptyBackend()); + CHECK(FLAGS_gen_experimental_profile || +backend_exec_status.fragment_exec_status().empty()) + << "Received pre-aggregated profile but --gen_experimental_profile=false"; // Hold the exec_summary's lock to avoid exposing it half-way through // the update loop below. 
lock_guard l1(exec_summary->lock); @@ -399,21 +402,28 @@ bool Coordinator::BackendState::ApplyExecStatusReport( if (IsDoneLocked(lock)) return false; // Use empty profile in case profile serialization/deserialization failed. - // 'thrift_profiles' and 'instance_exec_status' vectors have one-to-one correspondance. + // Depending on the --gen_experimental_profile value, there is one profile tree per + // fragment (if true) or per fragment instance (if false). vector empty_profiles; + // We iterate over the profiles in order. The profile trees include the instance + // profiles in the same order as 'instance_exec_status', then t
[impala] 01/02: IMPALA-10351, IMPALA-9812: enable mt_dop for DML by default
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit c4e7977f5ee1b740215eb486a074bc1326cff360 Author: Tim Armstrong AuthorDate: Mon Nov 23 12:14:33 2020 -0800 IMPALA-10351,IMPALA-9812: enable mt_dop for DML by default This allows setting mt_dop for any query with any configuration. Before this patch it was not supported for DML. --unlock_mt_dop and --mt_dop_auto_fallback are now ignored. Testing: * Updated tests to reflect new behaviour. * Removed irrelevant tests for fallback/validation. * Ran exhaustive tests. Change-Id: I66331481260fe4b69d9e95b0200029b14d230ade Reviewed-on: http://gerrit.cloudera.org:8080/16775 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/common/global-flags.cc | 9 +-- be/src/util/backend-gflag-util.cc | 4 -- bin/run-all-tests.sh | 3 +- common/thrift/BackendGflags.thrift | 4 +- .../java/org/apache/impala/common/RuntimeEnv.java | 7 -- .../java/org/apache/impala/planner/Planner.java| 18 - .../org/apache/impala/service/BackendConfig.java | 8 --- .../org/apache/impala/planner/PlannerTest.java | 16 ++--- .../queries/PlannerTest/mt-dop-validation.test | 80 -- .../queries/QueryTest/mt-dop-auto-fallback.test| 31 - tests/custom_cluster/test_mt_dop.py| 21 +- 11 files changed, 69 insertions(+), 132 deletions(-) diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc index 339ff14..1b8c16f 100644 --- a/be/src/common/global-flags.cc +++ b/be/src/common/global-flags.cc @@ -290,13 +290,6 @@ DEFINE_double_hidden(invalidate_tables_fraction_on_memory_pressure, 0.1, "The fraction of tables to invalidate when CatalogdTableInvalidator considers the " "old GC generation to be almost full."); -DEFINE_bool_hidden(unlock_mt_dop, false, -"(Experimental) If true, allow specifying mt_dop for all queries."); - -DEFINE_bool_hidden(mt_dop_auto_fallback, false, -"(Experimental) If true, fall back to 
non-mt_dop if mt_dop query option is set and " -"a query does not support it. Has no effect if --unlock_mt_dop is true."); - DEFINE_bool_hidden(recursively_list_partitions, true, "If true, recursively list the content of partition directories."); @@ -399,6 +392,7 @@ REMOVED_FLAG(llama_registration_timeout_secs); REMOVED_FLAG(llama_registration_wait_secs); REMOVED_FLAG(local_nodemanager_url); REMOVED_FLAG(max_free_io_buffers); +REMOVED_FLAG(mt_dop_auto_fallback); REMOVED_FLAG(pull_incremental_statistics); REMOVED_FLAG(report_status_retry_interval_ms); REMOVED_FLAG(resource_broker_cnxn_attempts); @@ -417,6 +411,7 @@ REMOVED_FLAG(staging_cgroup); REMOVED_FLAG(status_report_interval); REMOVED_FLAG(status_report_max_retries); REMOVED_FLAG(suppress_unknown_disk_id_warnings); +REMOVED_FLAG(unlock_mt_dop); REMOVED_FLAG(use_krpc); REMOVED_FLAG(use_kudu_kinit); REMOVED_FLAG(use_statestore); diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc index 3a2c470..b43c63e 100644 --- a/be/src/util/backend-gflag-util.cc +++ b/be/src/util/backend-gflag-util.cc @@ -66,8 +66,6 @@ DECLARE_int32(kudu_error_buffer_size); DECLARE_int32(hms_event_polling_interval_s); DECLARE_bool(enable_insert_events); DECLARE_string(authorization_factory_class); -DECLARE_bool(unlock_mt_dop); -DECLARE_bool(mt_dop_auto_fallback); DECLARE_string(ranger_service_type); DECLARE_string(ranger_app_id); DECLARE_string(authorization_provider); @@ -153,8 +151,6 @@ Status GetThriftBackendGflags(JNIEnv* jni_env, jbyteArray* cfg_bytes) { cfg.__set_enable_insert_events(FLAGS_enable_insert_events); cfg.__set_impala_build_version(::GetDaemonBuildVersion()); cfg.__set_authorization_factory_class(FLAGS_authorization_factory_class); - cfg.__set_unlock_mt_dop(FLAGS_unlock_mt_dop); - cfg.__set_mt_dop_auto_fallback(FLAGS_mt_dop_auto_fallback); cfg.__set_ranger_service_type(FLAGS_ranger_service_type); cfg.__set_ranger_app_id(FLAGS_ranger_app_id); 
cfg.__set_authorization_provider(FLAGS_authorization_provider); diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh index 98fa122..329b866 100755 --- a/bin/run-all-tests.sh +++ b/bin/run-all-tests.sh @@ -169,11 +169,10 @@ TEST_RET_CODE=0 # Helper function to start Impala cluster. start_impala_cluster() { - # TODO: IMPALA-9812: remove --unlock_mt_dop when it is no longer needed. run-step "Starting Impala cluster" start-impala-cluster.log \ "${IMPALA_HOME}/bin/start-impala-cluster.py" \ --log_dir="${IMPALA_EE_TEST_LOGS_DIR}" \ - ${TEST_START_CLUSTER_ARGS} --impalad_args=--unlock_mt_dop=true + ${TEST_
[impala] 02/02: IMPALA-10216: make TestWriteErrorBlacklist deterministic
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit f07514add9956e676d99ae9cbca5855960a279f2 Author: Tim Armstrong AuthorDate: Tue Nov 24 12:20:04 2020 -0800 IMPALA-10216: make TestWriteErrorBlacklist deterministic There is a subtle bug in the test where it does a BufferPool::Pin() call followed immediately by a BufferPool::Unpin() call. This is meant to ensure that a new scratch range is allocated for the file, but does not guarantee that because the Pin() is asynchronous and there is a short-circuit case in buffer pool that cancels the Pin() if it the page is unpinned before the pin completes. We can force the pin to complete by requesting the actual buffer for the page (and verifying the data for good measure). Testing: I was never able to reproduce this race locally, but the fix is pretty safe and I looped the modified test for a while. Change-Id: I158dbc1ac60c8fefd53a138aa3e41cced9c4d674 Reviewed-on: http://gerrit.cloudera.org:8080/16782 Reviewed-by: Csaba Ringhofer Tested-by: Impala Public Jenkins --- be/src/runtime/bufferpool/buffer-pool-test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc index 37f21be..949ac14 100644 --- a/be/src/runtime/bufferpool/buffer-pool-test.cc +++ b/be/src/runtime/bufferpool/buffer-pool-test.cc @@ -1785,6 +1785,8 @@ void BufferPoolTest::TestWriteErrorBlacklist( DestroyAll(, [ERROR_QUERY], _new_pages); ASSERT_OK(PinAll(, [NO_ERROR_QUERY], [NO_ERROR_QUERY])); + // IMPALA-10216: Verify data to force Pin to complete before unpinning. + VerifyData(pages[NO_ERROR_QUERY], 0); UnpinAll(, [NO_ERROR_QUERY], [NO_ERROR_QUERY]); WaitForAllWrites([NO_ERROR_QUERY]); EXPECT_TRUE(FindPageInDir(pages[NO_ERROR_QUERY], good_dir) != NULL)
[impala] branch master updated (96decf5 -> f07514a)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 96decf5 IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h new c4e7977 IMPALA-10351,IMPALA-9812: enable mt_dop for DML by default new f07514a IMPALA-10216: make TestWriteErrorBlacklist deterministic The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/common/global-flags.cc | 9 +-- be/src/runtime/bufferpool/buffer-pool-test.cc | 2 + be/src/util/backend-gflag-util.cc | 4 -- bin/run-all-tests.sh | 3 +- common/thrift/BackendGflags.thrift | 4 +- .../java/org/apache/impala/common/RuntimeEnv.java | 7 -- .../java/org/apache/impala/planner/Planner.java| 18 - .../org/apache/impala/service/BackendConfig.java | 8 --- .../org/apache/impala/planner/PlannerTest.java | 16 ++--- .../queries/PlannerTest/mt-dop-validation.test | 80 -- .../queries/QueryTest/mt-dop-auto-fallback.test| 31 - tests/custom_cluster/test_mt_dop.py| 21 +- 12 files changed, 71 insertions(+), 132 deletions(-) delete mode 100644 testdata/workloads/functional-query/queries/QueryTest/mt-dop-auto-fallback.test
[impala] branch master updated: IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 96decf5 IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h 96decf5 is described below commit 96decf535ba9ae7a6b295012d2be85c3a1ae892c Author: Zoltan Borok-Nagy AuthorDate: Mon Nov 23 13:30:50 2020 +0100 IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h During Parquet file writing, a DCHECK checks if row group stats have copied the min/max string values into their internal buffers. This check is at the finalization of each page. The copying of the string values happened at the end of each row batch. Thus, if a row batch spans over multiple pages then the min/max string values don't get copied by the end of the page. Since the memory is attached to the row batch this isn't really an error. As a workaround this commit also copies the min/max string values at the end of the page if they haven't been copied yet. Testing * Added e2e test Change-Id: I4289bd743e951cc4c607d5a5ea75d27825a1c12b Reviewed-on: http://gerrit.cloudera.org:8080/16771 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/hdfs-parquet-table-writer.cc| 1 + .../queries/QueryTest/parquet-page-index.test | 21 + tests/query_test/test_parquet_stats.py | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc index d33578d..5203c3f 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc @@ -936,6 +936,7 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() { // Update row group statistics from page statistics. 
DCHECK(row_group_stats_base_ != nullptr); + RETURN_IF_ERROR(row_group_stats_base_->MaterializeStringValuesToInternalBuffers()); row_group_stats_base_->Merge(*page_stats_base_); // Add the size of the data page header diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test index d3cbd9f..f37a064 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test @@ -316,3 +316,24 @@ STRING, BIGINT aggregation(SUM, NumPages): 30 aggregation(SUM, NumStatsFilteredPages): 27 + QUERY +# IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h +# Impala could hit a DCHECK when the row batch spanned over multiple pages and +# they had the same min or max string values. +set parquet_page_row_count_limit=5; +create table lineitem_comment (s string) stored as parquet; +insert into lineitem_comment select l_comment from tpch_parquet.lineitem + order by l_comment + limit 100; +insert into lineitem_comment select * from lineitem_comment; +select count(*) from lineitem_comment; + RESULTS +200 + TYPES +BIGINT + + QUERY +drop table lineitem_comment; + RESULTS +'Table has been dropped.' + \ No newline at end of file diff --git a/tests/query_test/test_parquet_stats.py b/tests/query_test/test_parquet_stats.py index 319dd1a..28a2288 100644 --- a/tests/query_test/test_parquet_stats.py +++ b/tests/query_test/test_parquet_stats.py @@ -91,7 +91,7 @@ class TestParquetStats(ImpalaTestSuite): create_table_from_parquet(self.client, unique_database, 'customer_multiblock_page_index') -for batch_size in [1]: +for batch_size in [0, 1]: new_vector.get_value('exec_option')['batch_size'] = batch_size self.run_test_case('QueryTest/parquet-page-index', new_vector, unique_database) self.run_test_case('QueryTest/nested-types-parquet-page-index', new_vector,
[impala] branch master updated: IMPALA-10330: Bump toolchain build id for new Kudu
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 51d8f4f IMPALA-10330: Bump toolchain build id for new Kudu 51d8f4f is described below commit 51d8f4fb21d8314191bafb75c7d2f0910080cb4e Author: Tim Armstrong AuthorDate: Mon Nov 16 17:10:27 2020 -0800 IMPALA-10330: Bump toolchain build id for new Kudu Change-Id: I1d351720bb5322a7ae3c038256f5385372ba6c6b Reviewed-on: http://gerrit.cloudera.org:8080/16735 Tested-by: Impala Public Jenkins Reviewed-by: Joe McDonnell --- bin/impala-config.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index d727982..82488ed 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -68,7 +68,7 @@ fi # moving to a different build of the toolchain, e.g. when a version is bumped or a # compile option is changed. The build id can be found in the output of the toolchain # build jobs, it is constructed from the build number and toolchain git hash prefix. -export IMPALA_TOOLCHAIN_BUILD_ID=61-7ce63be783 +export IMPALA_TOOLCHAIN_BUILD_ID=62-ab817885e4 # Versions of toolchain dependencies. # --- export IMPALA_AVRO_VERSION=1.7.4-p5 @@ -652,7 +652,7 @@ fi # overall build type) and does not apply when using a local Kudu build. export USE_KUDU_DEBUG_BUILD=${USE_KUDU_DEBUG_BUILD-false} -export IMPALA_KUDU_VERSION=${IMPALA_KUDU_VERSION-"2f5605dfc"} +export IMPALA_KUDU_VERSION=${IMPALA_KUDU_VERSION-"6a7cadc7e"} export IMPALA_KUDU_HOME=${IMPALA_TOOLCHAIN_PACKAGES_HOME}/kudu-$IMPALA_KUDU_VERSION export IMPALA_KUDU_JAVA_HOME=\ ${IMPALA_TOOLCHAIN_PACKAGES_HOME}/kudu-${IMPALA_KUDU_VERSION}/java
[impala] branch master updated: IMPALA-10288: Implement DESCRIBE HISTORY for Iceberg tables
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new b66045c IMPALA-10288: Implement DESCRIBE HISTORY for Iceberg tables b66045c is described below commit b66045c8a5d48c268a4dfad967021ff9bcbdd937 Author: Gabor Kaszab AuthorDate: Thu Sep 17 15:43:55 2020 +0200 IMPALA-10288: Implement DESCRIBE HISTORY for Iceberg tables The DESCRIBE HISTORY works for Iceberg tables and displays the snapshot history of the table. An example output: DESCRIBE HISTORY iceberg_multi_snapshots; ++-+-+-+ | creation_time | snapshot_id | parent_id | is_current_ancestor | ++-+-+-+ | 2020-10-13 14:01:07.234000 | 4400379706200951771 | NULL| TRUE| | 2020-10-13 14:01:19.307000 | 4221472712544505868 | 4400379706200951771 | TRUE| ++-+-+-+ The purpose here was to have similar output with this new feature as what SparkSql returns for "SELECT * from tablename.history". See "History" section of https://iceberg.apache.org/spark/#inspecting-tables Testing: - iceberg-negative.test was extended to check that DESCRIBE HISTORY is not applicable for non-Iceberg tables. - iceberg-table-history.test: Covers basic usage of DESCRIBE HISTORY. Tests on tables created with Impala and also with Spark. 
Change-Id: I56a4b92c27e8e4a79109696cbae62735a00750e5 Reviewed-on: http://gerrit.cloudera.org:8080/16599 Reviewed-by: Zoltan Borok-Nagy Reviewed-by: wangsheng Tested-by: Impala Public Jenkins --- be/src/service/client-request-state.cc | 29 ++ be/src/service/frontend.cc | 6 ++ be/src/service/frontend.h | 5 ++ common/thrift/Frontend.thrift | 23 fe/src/main/cup/sql-parser.cup | 25 .../apache/impala/analysis/AnalysisContext.java| 11 +++- .../impala/analysis/DescribeHistoryStmt.java | 67 ++ .../java/org/apache/impala/service/Frontend.java | 46 +++ .../org/apache/impala/service/JniFrontend.java | 21 +++ .../org/apache/impala/analysis/ParserTest.java | 15 - testdata/data/README | 3 +- .../queries/QueryTest/iceberg-negative.test| 5 ++ .../queries/QueryTest/iceberg-table-history.test | 20 +++ tests/query_test/test_iceberg.py | 22 +++ 14 files changed, 283 insertions(+), 15 deletions(-) diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc index 7c5b023..763b7f5 100644 --- a/be/src/service/client-request-state.cc +++ b/be/src/service/client-request-state.cc @@ -30,6 +30,7 @@ #include "catalog/catalog-service-client-wrapper.h" #include "common/status.h" #include "exec/kudu-util.h" +#include "exprs/timezone_db.h" #include "kudu/rpc/rpc_controller.h" #include "rpc/rpc-mgr.inline.h" #include "runtime/coordinator.h" @@ -38,6 +39,8 @@ #include "runtime/query-driver.h" #include "runtime/row-batch.h" #include "runtime/runtime-state.h" +#include "runtime/timestamp-value.h" +#include "runtime/timestamp-value.inline.h" #include "scheduling/admission-control-client.h" #include "scheduling/scheduler.h" #include "service/frontend.h" @@ -423,6 +426,32 @@ Status ClientRequestState::ExecLocalCatalogOp( result_metadata_ = response.schema; return Status::OK(); } +case TCatalogOpType::DESCRIBE_HISTORY: { + // This operation is supported for Iceberg tables only. 
+ const TDescribeHistoryParams& params = catalog_op.describe_history_params; + TGetTableHistoryResult result; + RETURN_IF_ERROR(frontend_->GetTableHistory(params, &result)); + + request_result_set_.reset(new vector<TResultRow>); + request_result_set_->resize(result.result.size()); + for (int i = 0; i < result.result.size(); ++i) { +const TGetTableHistoryResultItem item = result.result[i]; +TResultRow& result_row = (*request_result_set_.get())[i]; +result_row.__isset.colVals = true; +result_row.colVals.resize(4); +const Timezone* local_tz = TimezoneDatabase::FindTimezone( +query_options().timezone); +TimestampValue tv = TimestampValue::FromUnixTimeMic
[impala] branch master updated: IMPALA-10334: test_stats_extrapolation output doesn't match on erasure coding build
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 6493f87 IMPALA-10334: test_stats_extrapolation output doesn't match on erasure coding build 6493f87 is described below commit 6493f8735731bb1d7beadf5e093ebd812d8ad8d2 Author: Qifan Chen AuthorDate: Fri Nov 20 13:32:11 2020 -0500 IMPALA-10334: test_stats_extrapolation output doesn't match on erasure coding build This patch skips test_stats_extrapolation for erasure code builds. The reason is that an extra erasure code information line can be included in the scan explain section when a hdfs table is erasure coded. This makes the explain output different between a normal build and an erasure code build. A new reason 'contain_full_explain' is added to SkipIfEC to facilitate this. Testing: Ran erasure coding version of the EE and CLUSTER tests. Ran core tests Change-Id: I16c11aa0a1ec2d4569c272d2454915041039f950 Reviewed-on: http://gerrit.cloudera.org:8080/16756 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- tests/common/skip.py | 2 ++ tests/metadata/test_stats_extrapolation.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/common/skip.py b/tests/common/skip.py index b9aa6c8..a340360 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -193,6 +193,8 @@ class SkipIfEC: "features relying on local read do not work.") oom = pytest.mark.skipif(IS_EC, reason="Probably broken by HDFS-13540.") fix_later = pytest.mark.skipif(IS_EC, reason="It should work but doesn't.") + contain_full_explain = pytest.mark.skipif(IS_EC, reason="Contain full explain output " + "for hdfs tables.") class SkipIfDockerizedCluster: diff --git a/tests/metadata/test_stats_extrapolation.py b/tests/metadata/test_stats_extrapolation.py index 4dc14ff..8de917d 100644 --- a/tests/metadata/test_stats_extrapolation.py +++ 
b/tests/metadata/test_stats_extrapolation.py @@ -17,6 +17,7 @@ from os import path from tests.common.impala_test_suite import ImpalaTestSuite +from tests.common.skip import SkipIfEC from tests.common.test_dimensions import ( create_exec_option_dimension, create_single_exec_option_dimension, @@ -38,6 +39,7 @@ class TestStatsExtrapolation(ImpalaTestSuite): cls.ImpalaTestMatrix.add_dimension( create_uncompressed_text_dimension(cls.get_workload())) + @SkipIfEC.contain_full_explain def test_stats_extrapolation(self, vector, unique_database): vector.get_value('exec_option')['num_nodes'] = 1 vector.get_value('exec_option')['explain_level'] = 2
[impala] 03/03: IMPALA-10346: Rename Iceberg test tables' name with specific cases
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 5c91ff27372adb3117cf48ce95ab7f7b1b9ee73a Author: skyyws AuthorDate: Fri Nov 20 16:43:30 2020 +0800 IMPALA-10346: Rename Iceberg test tables' name with specific cases We used some unrecognized table names in Iceberg related test cases, such as iceberg_test1/iceberg_test2 and so on, which resulted in poor readability. So we better rename these Iceberg test tables' name by specific cases. Testing: - Renamed tables' name in iceberg-create.test - Renamed tables' name in iceberg-alter.test Change-Id: Ifdaeaaeed69753222668342dcac852677fdd9ae5 Reviewed-on: http://gerrit.cloudera.org:8080/16753 Reviewed-by: Zoltan Borok-Nagy Tested-by: Impala Public Jenkins --- .../queries/QueryTest/iceberg-alter.test | 44 ++-- .../queries/QueryTest/iceberg-create.test | 80 +++--- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-alter.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-alter.test index 21557cb..b5c9827 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-alter.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-alter.test @@ -1,14 +1,14 @@ QUERY -CREATE TABLE iceberg_test1( +CREATE TABLE iceberg_hadoop_tables( level STRING ) STORED AS ICEBERG TBLPROPERTIES('iceberg.catalog'='hadoop.tables'); -ALTER TABLE iceberg_test1 ADD COLUMNS(event_time TIMESTAMP, register_time DATE); -ALTER TABLE iceberg_test1 ADD COLUMNS(message STRING, price DECIMAL(8,1)); -ALTER TABLE iceberg_test1 ADD COLUMNS(map_test MAP >, struct_test STRUCT ); -DESCRIBE iceberg_test1; +ALTER TABLE iceberg_hadoop_tables ADD COLUMNS(event_time TIMESTAMP, register_time DATE); +ALTER TABLE iceberg_hadoop_tables ADD COLUMNS(message STRING, price DECIMAL(8,1)); +ALTER TABLE iceberg_hadoop_tables ADD 
COLUMNS(map_test MAP >, struct_test STRUCT ); +DESCRIBE iceberg_hadoop_tables; RESULTS 'level','string','' 'event_time','timestamp','' @@ -21,16 +21,16 @@ DESCRIBE iceberg_test1; STRING,STRING,STRING QUERY -ALTER TABLE iceberg_test1 set TBLPROPERTIES('fake_key'='fake_value'); -DESCRIBE FORMATTED iceberg_test1; +ALTER TABLE iceberg_hadoop_tables set TBLPROPERTIES('fake_key'='fake_value'); +DESCRIBE FORMATTED iceberg_hadoop_tables; RESULTS: VERIFY_IS_SUBSET '','fake_key','fake_value ' TYPES string, string, string QUERY -ALTER TABLE iceberg_test1 set OWNER USER fake_user; -DESCRIBE FORMATTED iceberg_test1; +ALTER TABLE iceberg_hadoop_tables set OWNER USER fake_user; +DESCRIBE FORMATTED iceberg_hadoop_tables; RESULTS: VERIFY_IS_SUBSET 'OwnerType: ','USER','NULL' 'Owner: ','fake_user ','NULL' @@ -38,8 +38,8 @@ DESCRIBE FORMATTED iceberg_test1; string, string, string QUERY -ALTER TABLE iceberg_test1 set OWNER ROLE fake_role; -DESCRIBE FORMATTED iceberg_test1; +ALTER TABLE iceberg_hadoop_tables set OWNER ROLE fake_role; +DESCRIBE FORMATTED iceberg_hadoop_tables; RESULTS: VERIFY_IS_SUBSET 'OwnerType: ','ROLE','NULL' 'Owner: ','fake_role ','NULL' @@ -47,16 +47,16 @@ DESCRIBE FORMATTED iceberg_test1; string, string, string QUERY -CREATE TABLE iceberg_test2( +CREATE TABLE iceberg_hadoop_catalog( level STRING ) STORED AS ICEBERG TBLPROPERTIES('iceberg.catalog'='hadoop.catalog', 'iceberg.catalog_location'='/$DATABASE/hadoop_catalog_test'); -ALTER TABLE iceberg_test2 ADD COLUMNS(event_time TIMESTAMP, register_time DATE); -ALTER TABLE iceberg_test2 ADD COLUMNS(message STRING, price DECIMAL(8,1)); -ALTER TABLE iceberg_test2 ADD COLUMNS(map_test MAP >, struct_test STRUCT ); -DESCRIBE iceberg_test2; +ALTER TABLE iceberg_hadoop_catalog ADD COLUMNS(event_time TIMESTAMP, register_time DATE); +ALTER TABLE iceberg_hadoop_catalog ADD COLUMNS(message STRING, price DECIMAL(8,1)); +ALTER TABLE iceberg_hadoop_catalog ADD COLUMNS(map_test MAP >, struct_test STRUCT ); +DESCRIBE 
iceberg_hadoop_catalog; RESULTS 'level','string','' 'event_time','timestamp','' @@ -69,16 +69,16 @@ DESCRIBE iceberg_test2; STRING,STRING,STRING QUERY -ALTER TABLE iceberg_test2 set TBLPROPERTIES('test_key'='test_value'); -DESCRIBE FORMATTED iceberg_test2; +ALTER TABLE iceberg_hadoop_catalog set TBLPROPERTIES('test_key'='test_value'); +DESCRIBE FORMATTED iceberg_hadoop_catalog; RESULTS: VERIFY_IS_SUBSET '','test_key','test_value ' TYPES string, string, string QUERY -ALTER TABLE iceberg_test2 set OWNER USER fake_user; -DESCRIBE FORMATTED iceberg_test2; +ALTER TABLE iceberg_ha
[impala] 01/03: IMPALA-9121: try to avoid ASAN error in hdfs-util-test
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit ba509ad1809138844d52a8b5877880e5fc9cfe9a Author: Tim Armstrong AuthorDate: Wed Nov 18 17:18:17 2020 -0800 IMPALA-9121: try to avoid ASAN error in hdfs-util-test I couldn't discern the likely root cause of the ASAN error, but have a hunch that it's a background thread accessing some data structure that is being torn down as the process exits. The tests in this file are simple so there shouldn't really be that much that can go wrong, except for the stuff started by ExecEnv::Init(). I modified the test to only initialize the necessary configs in ExecEnv, not start up the whole thing. Hopefully that make the problem go away. Testing: Looped the test locally with ASAN. Change-Id: Ic7b42be0f8b5d6c6a31095f9d1a278fd82bd500c Reviewed-on: http://gerrit.cloudera.org:8080/16748 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/runtime/exec-env.cc| 5 - be/src/runtime/exec-env.h | 7 +++ be/src/util/hdfs-util-test.cc | 21 +++-- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc index 0dd78bc..48dd888 100644 --- a/be/src/runtime/exec-env.cc +++ b/be/src/runtime/exec-env.cc @@ -461,7 +461,11 @@ Status ExecEnv::Init() { } RETURN_IF_ERROR(admission_controller_->Init()); + RETURN_IF_ERROR(InitHadoopConfig()); + return Status::OK(); +} +Status ExecEnv::InitHadoopConfig() { // Get the fs.defaultFS value set in core-site.xml and assign it to configured_defaultFs TGetHadoopConfigRequest config_request; config_request.__set_name(DEFAULT_FS); @@ -472,7 +476,6 @@ Status ExecEnv::Init() { } else { default_fs_ = "hdfs://"; } - return Status::OK(); } diff --git a/be/src/runtime/exec-env.h b/be/src/runtime/exec-env.h index e96a070..c685613 100644 --- a/be/src/runtime/exec-env.h +++ b/be/src/runtime/exec-env.h @@ -27,6 
+27,7 @@ #include "common/global-types.h" #include "common/status.h" #include "runtime/client-cache-types.h" +#include "testutil/gtest-util.h" #include "util/hdfs-bulk-ops-defs.h" // For declaration of HdfsOpThreadPool #include "util/network-util.h" #include "util/spinlock.h" @@ -226,6 +227,9 @@ class ExecEnv { friend class TestEnv; friend class DataStreamTest; + // For access to InitHadoopConfig(). + FRIEND_TEST(HdfsUtilTest, CheckFilesystemsMatch); + static ExecEnv* exec_env_; bool is_fe_tests_ = false; @@ -272,6 +276,9 @@ class ExecEnv { /// this backend. Queries take up multiple slots only when mt_dop > 1. int64_t admission_slots_; + /// Initialize ExecEnv based on Hadoop config from frontend. + Status InitHadoopConfig(); + /// Choose a memory limit (returned in *bytes_limit) based on the --mem_limit flag and /// the memory available to the daemon process. Returns an error if the memory limit is /// invalid or another error is encountered that should prevent starting up the daemon. diff --git a/be/src/util/hdfs-util-test.cc b/be/src/util/hdfs-util-test.cc index cbeac16..748ab23 100644 --- a/be/src/util/hdfs-util-test.cc +++ b/be/src/util/hdfs-util-test.cc @@ -24,18 +24,13 @@ #include "runtime/exec-env.h" #include "service/fe-support.h" -using namespace impala; - -DECLARE_bool(enable_webserver); +namespace impala { TEST(HdfsUtilTest, CheckFilesystemsMatch) { - // We do not want to start the webserver. - FLAGS_enable_webserver = false; - ExecEnv* exec_env = new ExecEnv(); - - // We do this to retrieve the default FS from the frontend. - // It doesn't matter if initializing the ExecEnv fails. - discard_result(exec_env->Init()); + // We do this to retrieve the default FS from the frontend without starting the rest + // of the ExecEnv services. + ExecEnv exec_env; + ASSERT_OK(exec_env.InitHadoopConfig()); // Tests with both paths qualified. 
EXPECT_TRUE(FilesystemsMatch("s3a://dummybucket/temp_dir/temp_path", @@ -65,7 +60,7 @@ TEST(HdfsUtilTest, CheckFilesystemsMatch) { EXPECT_TRUE(FilesystemsMatch("tempdir/temppath", "tempdir2/temppath2")); // Tests with one path qualified and the other unqualified. - const char* default_fs = exec_env->default_fs().c_str(); + const char* default_fs = exec_env.default_fs().c_str(); EXPECT_TRUE(FilesystemsMatch(default_fs, "temp_dir/temp_path")); EXPECT_TRUE(FilesystemsMatch("temp_dir/temp_path", default_fs)); EXPECT_FALSE(FilesystemsMatch("badscheme://namenode/temp_dir/temp_path", @@ -75,9 +70,6 @@ TEST(HdfsUtilTest, CheckFilesystemsMatch) { } TEST(HdfsUtilTest, CheckGetBaseName) {
[impala] 02/03: IMPALA-10329 Change apt install retry times to 30
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 48113bcffc68bb1379b8f7f991ce851755c86b28 Author: zhaorenhai AuthorDate: Fri Nov 20 10:02:23 2020 +0800 IMPALA-10329 Change apt install retry times to 30 Change apt install retry times to 30 in bootstrap_system.sh, Because this always timeout recently. And add solution for waiting the apt's lock-frontend Change-Id: Id664dd66874ac65d6b78e630c974a6a563408147 Reviewed-on: http://gerrit.cloudera.org:8080/16751 Reviewed-by: Jim Apple Tested-by: Impala Public Jenkins --- bin/bootstrap_system.sh | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh index f6c33a3..5e994f2 100755 --- a/bin/bootstrap_system.sh +++ b/bin/bootstrap_system.sh @@ -196,7 +196,7 @@ function notindocker { # Note that yum has its own retries; see yum.conf(5). REAL_APT_GET=$(ubuntu which apt-get) function apt-get { - for ITER in $(seq 1 20); do + for ITER in $(seq 1 30); do echo "ATTEMPT: ${ITER}" if sudo -E "${REAL_APT_GET}" "$@" then @@ -209,6 +209,11 @@ function apt-get { } echo ">>> Installing build tools" +if [[ "$UBUNTU" == true ]]; then + while sudo fuser /var/lib/dpkg/lock-frontend; do +sleep 1 + done +fi ubuntu apt-get update ubuntu apt-get --yes install ccache curl gawk g++ gcc libffi-dev \ libkrb5-dev krb5-admin-server krb5-kdc krb5-user libsasl2-dev \
[impala] branch master updated (47dbfde -> 5c91ff2)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 47dbfde IMPALA-10249: Fix the flaky TestImpalaShell.test_queries_closed test. new ba509ad IMPALA-9121: try to avoid ASAN error in hdfs-util-test new 48113bc IMPALA-10329 Change apt install retry times to 30 new 5c91ff2 IMPALA-10346: Rename Iceberg test tables' name with specific cases The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/runtime/exec-env.cc | 5 +- be/src/runtime/exec-env.h | 7 ++ be/src/util/hdfs-util-test.cc | 21 ++ bin/bootstrap_system.sh| 7 +- .../queries/QueryTest/iceberg-alter.test | 44 ++-- .../queries/QueryTest/iceberg-create.test | 80 +++--- 6 files changed, 86 insertions(+), 78 deletions(-)
[impala] branch master updated (50b5894 -> 47dbfde)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 50b5894 IMPALA-7523: increase hbase assignment timeout new c9ccb61 IMPALA-10286: Disable metadata.test_catalogd_debug_actions on S3 new 47dbfde IMPALA-10249: Fix the flaky TestImpalaShell.test_queries_closed test. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: tests/common/skip.py | 2 ++ tests/metadata/test_catalogd_debug_actions.py | 2 ++ tests/shell/test_shell_commandline.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-)
[impala] 02/02: IMPALA-10249: Fix the flaky TestImpalaShell.test_queries_closed test.
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 47dbfde0b2cabd14452f18527e954aec5891224c Author: Andrew Sherman AuthorDate: Tue Nov 17 17:53:53 2020 -0800 IMPALA-10249: Fix the flaky TestImpalaShell.test_queries_closed test. This test for IMPALA-897 is testing that queries run by Impala Shell from a script file are closed correctly. This is tested by an assertion that there is one in-flight query during execution of a script containing several queries. The test then closes the shell and checks that there are no in-flight queries. This is the assertion which failed. Change this assertion to instead wait for the number of in-flight queries to be zero. This avoids whatever race was causing the flakiness. Change-Id: Ib0485097c34282523ed0df6faa143fee6f74676d Reviewed-on: http://gerrit.cloudera.org:8080/16743 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- tests/shell/test_shell_commandline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py index 4448d4e..8ad2f07 100644 --- a/tests/shell/test_shell_commandline.py +++ b/tests/shell/test_shell_commandline.py @@ -364,7 +364,7 @@ class TestImpalaShell(ImpalaTestSuite): sleep(5) assert 1 == impalad_service.get_num_in_flight_queries() assert p.get_result().rc == 0 -assert 0 == impalad_service.get_num_in_flight_queries() +assert impalad_service.wait_for_num_in_flight_queries(0) def test_cancellation(self, vector): """Test cancellation (Ctrl+C event). Run a query that sleeps 10ms per row so will run
[impala] 01/02: IMPALA-10286: Disable metadata.test_catalogd_debug_actions on S3
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit c9ccb61acb6d0060de10833b4b37df096baa2cd3 Author: Vihang Karajgaonkar AuthorDate: Wed Nov 18 10:34:14 2020 -0800 IMPALA-10286: Disable metadata.test_catalogd_debug_actions on S3 This patch disables metadata/test_catalogd_debug_actions test on S3 builds due to its flakiness. The root cause of this seems to be that listing time on S3 is variable and the test becomes flaky because it measures the time taken by refresh command after a certain debug action is set. Testing: 1. Ran the test on my local environment to make sure it compiles fine. Change-Id: I30bd10de468ad449c4a143a65cdcba97d9f0cd78 Reviewed-on: http://gerrit.cloudera.org:8080/16745 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- tests/common/skip.py | 2 ++ tests/metadata/test_catalogd_debug_actions.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/common/skip.py b/tests/common/skip.py index 37ace22..b9aa6c8 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -63,6 +63,8 @@ class SkipIfS3: reason="Flakiness on account of S3 eventual consistency.") iceberg = pytest.mark.skipif(IS_S3, reason="Currently Iceberg is only supported on HDFS.") + variable_listing_times = pytest.mark.skipif(IS_S3, + reason="Flakiness due to unpredictable listing times on S3.") class SkipIfABFS: diff --git a/tests/metadata/test_catalogd_debug_actions.py b/tests/metadata/test_catalogd_debug_actions.py index 01461fb..ade7e87 100644 --- a/tests/metadata/test_catalogd_debug_actions.py +++ b/tests/metadata/test_catalogd_debug_actions.py @@ -17,8 +17,10 @@ import pytest from tests.common.impala_test_suite import ImpalaTestSuite +from tests.common.skip import SkipIfS3 +@SkipIfS3.variable_listing_times class TestDebugActions(ImpalaTestSuite): @pytest.mark.execute_serially
[impala] branch master updated: IMPALA-7523: increase hbase assignment timeout
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 50b5894 IMPALA-7523: increase hbase assignment timeout 50b5894 is described below commit 50b589422ac41dd6920c93504d3eecdb842abd47 Author: Tim Armstrong AuthorDate: Tue Nov 17 12:45:35 2020 -0800 IMPALA-7523: increase hbase assignment timeout Try to avoid flakiness by doubling the timeout. Change-Id: I32cf06ddc03abfb8d5f2fdeb3e153cf353b71fb3 Reviewed-on: http://gerrit.cloudera.org:8080/16740 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../org/apache/impala/datagenerator/HBaseTestDataRegionAssignment.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/src/test/java/org/apache/impala/datagenerator/HBaseTestDataRegionAssignment.java b/fe/src/test/java/org/apache/impala/datagenerator/HBaseTestDataRegionAssignment.java index 0b13566..eb0e18f 100644 --- a/fe/src/test/java/org/apache/impala/datagenerator/HBaseTestDataRegionAssignment.java +++ b/fe/src/test/java/org/apache/impala/datagenerator/HBaseTestDataRegionAssignment.java @@ -58,7 +58,7 @@ public class HBaseTestDataRegionAssignment { private final List sortedRS; // sorted list of region server name private final String[] splitPoints = { "1", "3", "5", "7", "9"}; - private final static int REGION_MOVE_TIMEOUT_MILLIS = 6; + private final static int REGION_MOVE_TIMEOUT_MILLIS = 12; public HBaseTestDataRegionAssignment() throws IOException { conf = new Configuration();
[impala] branch master updated: IMPALA-10189: addendum: improve comment
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new f14ce75 IMPALA-10189: addendum: improve comment f14ce75 is described below commit f14ce75008f27dea7e57a6add5c918438b6953e6 Author: Tim Armstrong AuthorDate: Wed Oct 7 09:23:31 2020 -0700 IMPALA-10189: addendum: improve comment Change-Id: I81c77c26fe60812f28c14a93ea26a2a44ac2bba9 Reviewed-on: http://gerrit.cloudera.org:8080/16557 Reviewed-by: Impala Public Jenkins Tested-by: Tim Armstrong --- fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java index 97a8d69..20afadc 100644 --- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java +++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java @@ -1710,8 +1710,7 @@ public class CatalogOpExecutor { partBuilder = new HdfsPartition.Builder(part).dropPartitionStats(); } - // Remove the ROW_COUNT parameter if it has been set and set numRows to reflect - // the change. + // We need to update the partition if it has a ROW_COUNT parameter. if (part.getParameters().containsKey(StatsSetupConst.ROW_COUNT)) { if (partBuilder == null) { partBuilder = new HdfsPartition.Builder(part);
[impala] 01/04: IMPALA-10305 (part 2): Sync Kudu's FIPS compliant changes
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 4657f062f401a2da5e5f4aa3a3e3244f45dba8c2 Author: wzhou-code AuthorDate: Mon Nov 16 15:53:30 2020 -0800 IMPALA-10305 (part 2): Sync Kudu's FIPS compliant changes kudu-3210 added FIPS compliant changes for Kudu. In previous patch, we ported the following patches for kudu-3210 into Impala source tree: http://gerrit.cloudera.org:8080/16631/ http://gerrit.cloudera.org:8080/16657/ http://gerrit.cloudera.org:8080/16658/ http://gerrit.cloudera.org:8080/16659/ The last patch http://gerrit.cloudera.org:8080/16659/ fixed an OpenSSL race condition with work around by adding lock before verifying signature. But this issue could be fixed by redefining the thread ID callback without any additional locking. Kudu reverted commit f9f3189a6dbe0636d578d80b1d8e60cf7b2e6686 and added a new patch to redefine the thread ID callback. https://gerrit.cloudera.org/#/c/16730/ https://gerrit.cloudera.org/#/c/16731/ This patch syncs Kudu's code changes of above two patches. Testing: - Passed exhausive tests. 
Change-Id: I04b9d46b5d7228289565617b8d3cfbef9f3b5ba3 Reviewed-on: http://gerrit.cloudera.org:8080/16736 Reviewed-by: Tim Armstrong Reviewed-by: Attila Bukor Tested-by: Impala Public Jenkins --- be/src/kudu/security/CMakeLists.txt | 2 +- be/src/kudu/security/crypto.cc| 13 --- be/src/kudu/security/openssl_util.cc | 42 --- be/src/kudu/security/openssl_util.h | 33 +-- be/src/kudu/security/tls_context.cc | 10 - be/src/kudu/security/tls_handshake.cc | 18 +-- 6 files changed, 22 insertions(+), 96 deletions(-) diff --git a/be/src/kudu/security/CMakeLists.txt b/be/src/kudu/security/CMakeLists.txt index 9475e7d..22e7442 100644 --- a/be/src/kudu/security/CMakeLists.txt +++ b/be/src/kudu/security/CMakeLists.txt @@ -73,9 +73,9 @@ set(SECURITY_SRCS ca/cert_management.cc cert.cc crypto.cc + kerberos_util.cc gssapi.cc init.cc - kerberos_util.cc openssl_util.cc ${PORTED_X509_CHECK_HOST_CC} security_flags.cc diff --git a/be/src/kudu/security/crypto.cc b/be/src/kudu/security/crypto.cc index 5e6fd16..234d193 100644 --- a/be/src/kudu/security/crypto.cc +++ b/be/src/kudu/security/crypto.cc @@ -18,9 +18,6 @@ #include "kudu/security/crypto.h" #include -#if OPENSSL_VERSION_NUMBER < 0x1010L -#include -#endif #include #include @@ -70,10 +67,6 @@ int DerWritePublicKey(BIO* bio, EVP_PKEY* key) { return i2d_RSA_PUBKEY_bio(bio, rsa.get()); } -#if OPENSSL_VERSION_NUMBER < 0x1010L -OpenSSLMutex mutex; -#endif - } // anonymous namespace template<> struct SslTypeTraits { @@ -143,9 +136,6 @@ Status PublicKey::VerifySignature(DigestType digest, const EVP_MD* md = GetMessageDigest(digest); auto md_ctx = ssl_make_unique(EVP_MD_CTX_create()); -#if OPENSSL_VERSION_NUMBER < 0x1010L - std::unique_lock l(mutex); -#endif OPENSSL_RET_NOT_OK(EVP_DigestVerifyInit(md_ctx.get(), nullptr, md, nullptr, GetRawData()), "error initializing verification digest"); OPENSSL_RET_NOT_OK(EVP_DigestVerifyUpdate(md_ctx.get(), data.data(), data.size()), @@ -238,9 +228,6 @@ Status PrivateKey::MakeSignature(DigestType digest, 
const EVP_MD* md = GetMessageDigest(digest); auto md_ctx = ssl_make_unique(EVP_MD_CTX_create()); -#if OPENSSL_VERSION_NUMBER < 0x1010L - std::unique_lock l(mutex); -#endif OPENSSL_RET_NOT_OK(EVP_DigestSignInit(md_ctx.get(), nullptr, md, nullptr, GetRawData()), "error initializing signing digest"); OPENSSL_RET_NOT_OK(EVP_DigestSignUpdate(md_ctx.get(), data.data(), data.size()), diff --git a/be/src/kudu/security/openssl_util.cc b/be/src/kudu/security/openssl_util.cc index 7198db3..3d9544c 100644 --- a/be/src/kudu/security/openssl_util.cc +++ b/be/src/kudu/security/openssl_util.cc @@ -28,10 +28,8 @@ #include #include -#include #include -#include "kudu/gutil/macros.h" #include "kudu/gutil/strings/split.h" #include "kudu/gutil/strings/strip.h" #include "kudu/gutil/strings/substitute.h" @@ -39,7 +37,6 @@ #include "kudu/util/debug/leakcheck_disabler.h" #endif #include "kudu/util/errno.h" -#include "kudu/util/flag_tags.h" #include "kudu/util/flags.h" #if OPENSSL_VERSION_NUMBER < 0x1010L #include "kudu/util/mutex.h" @@ -47,15 +44,9 @@ #include "kudu/util/scoped_cleanup.h" #include "kudu/util/status.h" #include "kudu/util/subprocess.h" - -DEFINE_bool(openssl_defensive_locking, -false, -"If enable
[impala] 02/04: IMPALA-10318: default_transactional_type shouldn't affect Iceberg tables
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 26fc6795ec5a77496f6b5482383223fc8dfdaac0 Author: Zoltan Borok-Nagy AuthorDate: Wed Nov 18 17:35:05 2020 +0100 IMPALA-10318: default_transactional_type shouldn't affect Iceberg tables Query option 'default_transactional_type' shouldn't affect Iceberg tables. Also, Iceberg tables shouldn't allow setting transactional properties. Testing: * Added e2e tests Change-Id: I86d1ac82ecd01a7455a0881a9e84aeb193dd5385 Reviewed-on: http://gerrit.cloudera.org:8080/16742 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../main/java/org/apache/impala/analysis/TableDef.java | 8 .../queries/QueryTest/iceberg-create.test | 17 + .../queries/QueryTest/iceberg-negative.test | 16 3 files changed, 41 insertions(+) diff --git a/fe/src/main/java/org/apache/impala/analysis/TableDef.java b/fe/src/main/java/org/apache/impala/analysis/TableDef.java index addfa8f..139be5a 100644 --- a/fe/src/main/java/org/apache/impala/analysis/TableDef.java +++ b/fe/src/main/java/org/apache/impala/analysis/TableDef.java @@ -782,6 +782,14 @@ class TableDef { return; } +if (options_.fileFormat == THdfsFileFormat.ICEBERG) { + if (AcidUtils.isTransactionalTable(options_.tblProperties)) { +throw new AnalysisException( +"Iceberg tables cannot have Hive ACID table properties."); + } + return; +} + AcidUtils.setTransactionalProperties(options_.tblProperties, analyzer.getQueryOptions().getDefault_transactional_type()); } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test index fd06875..e8af62d 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test @@ -339,3 +339,20 @@ DROP TABLE iceberg_test10; 
RESULTS 'Table has been dropped.' + QUERY +# Default transactional property doesn't affect Iceberg tables. +set default_transactional_type=insert_only; +create table iceberg_default_trans (i int) +stored as iceberg +tblproperties('iceberg.catalog'='hadoop.tables'); + RESULTS +'Table has been created.' + + QUERY +describe formatted iceberg_default_trans + RESULTS: VERIFY_IS_NOT_IN +'','transactional ','true ' +'','transactional_properties','insert_only ' + TYPES +STRING, STRING, STRING + diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test index 6f3dda5..cc65d26 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test @@ -195,3 +195,19 @@ ALTER TABLE iceberg_table_hadoop_catalog REPLACE COLUMNS(level INT, register_tim CATCH UnsupportedOperationException: Unsupported ALTER TABLE operation for Iceberg tables: REPLACE_COLUMNS + QUERY +CREATE TABLE iceberg_transactional(i int) +STORED AS ICEBERG +tblproperties('iceberg.catalog'='hadoop.tables', 'transactional'='true'); + CATCH +Iceberg tables cannot have Hive ACID table properties. + + QUERY +CREATE TABLE iceberg_insert_only(i int) +STORED AS ICEBERG +tblproperties('iceberg.catalog'='hadoop.catalog', +'iceberg.catalog_location'='/$EXTERNAL_WAREHOUSE_DIR/specified_location', +'transactional'='true', 'transactional_properties'='insert_only'); + CATCH +Iceberg tables cannot have Hive ACID table properties. +
[impala] 03/04: IMPALA-10340: Cannot set up KDC from scratch
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 63da9ccf0c64188a6cdd01ef8d88cfbd0c81fc9a Author: Doroszlai, Attila AuthorDate: Wed Nov 18 15:23:54 2020 +0100 IMPALA-10340: Cannot set up KDC from scratch IMPALA-9361 added experimental support for setting up a KDC for the mini cluster. However, the experimental-kerberos-setup.sh script fails with the error: KRB5_CONFIG: unbound variable The problem is that impala-config.sh unsets KDC-related variables if the cluster is not kerberized or does not exist. However, kerberized cluster cannot be created without installing KDC first. This change updates impala-config.sh to allow setting the KDC-related variables if IMPALA_KERBERIZE is set to 'true' and the cluster does not exist yet. Testing: * successfully set up KDC via experimental-kerberos-setup.sh after building Impala locally * verified that impala-config.sh sets/unsets KDC-related variables depending on cluster existence and config * started Kerberized and non-Kerberized mini clusters (after regenerating configs) Change-Id: Icaed2ad2fb1e1b60951a5a4138a9386588bc6972 Reviewed-on: http://gerrit.cloudera.org:8080/16744 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- bin/impala-config.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index d855b33..e7adfdc 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -738,7 +738,9 @@ echo "IMPALA_ICEBERG_VERSION = $IMPALA_ICEBERG_VERSION" # kerberized cluster is created, it will have to be sourced again # *after* the cluster is created in order to pick up these settings. export MINIKDC_ENV="${IMPALA_HOME}/testdata/bin/minikdc_env.sh" -if "${CLUSTER_DIR}/admin" is_kerberized; then +if "${CLUSTER_DIR}/admin" is_kerberized || + ( ! 
"${CLUSTER_DIR}/admin" cluster_exists && [[ "$IMPALA_KERBERIZE" == "true" ]] ); then + . "${MINIKDC_ENV}" echo " *** This cluster is kerberized ***" echo "KRB5_KTNAME= $KRB5_KTNAME"
[impala] branch master updated (1c72c5a -> b81ff2d)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 1c72c5a IMPALA-10234: Add support for cookie authentication to impala-shell new 4657f06 IMPALA-10305 (part 2): Sync Kudu's FIPS compliant changes new 26fc679 IMPALA-10318: default_transactional_type shouldn't affect Iceberg tables new 63da9cc IMPALA-10340: Cannot set up KDC from scratch new b81ff2d IMPALA-10276: thread-safe access to RuntimeProfile::counter_map_ The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/kudu/security/CMakeLists.txt| 2 +- be/src/kudu/security/crypto.cc | 13 --- be/src/kudu/security/openssl_util.cc | 42 +- be/src/kudu/security/openssl_util.h| 33 ++--- be/src/kudu/security/tls_context.cc| 10 -- be/src/kudu/security/tls_handshake.cc | 18 +- be/src/util/runtime-profile.cc | 39 ++-- be/src/util/runtime-profile.h | 29 --- bin/impala-config.sh | 4 ++- .../java/org/apache/impala/analysis/TableDef.java | 8 + .../queries/QueryTest/iceberg-create.test | 17 + .../queries/QueryTest/iceberg-negative.test| 16 + 12 files changed, 100 insertions(+), 131 deletions(-)
[impala] 04/04: IMPALA-10276: thread-safe access to RuntimeProfile::counter_map_
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit b81ff2d25f53586914695b95c77a9d9e31a2a51a Author: Tim Armstrong AuthorDate: Tue Nov 17 10:32:10 2020 -0800 IMPALA-10276: thread-safe access to RuntimeProfile::counter_map_ The bug was that 'counter_map_' can be mutated concurrent with total_time_counter() or inactive_timer() being called. This is fixed by storing a pointer directly to those counters and bypassing 'counter_map_'. This is then thread-safe and also has low overhead (adding lock acquisitions might have some perf impact, since total_time_counter() is called throughout query execution). Change-Id: Ic21a13acf9c7c326a27334e61ce3729f1e3cab42 Reviewed-on: http://gerrit.cloudera.org:8080/16739 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/util/runtime-profile.cc | 39 --- be/src/util/runtime-profile.h | 29 ++--- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/be/src/util/runtime-profile.cc b/be/src/util/runtime-profile.cc index 3d0e1f7..32e2e75 100644 --- a/be/src/util/runtime-profile.cc +++ b/be/src/util/runtime-profile.cc @@ -162,8 +162,20 @@ const char* ProfileEntryPrototype::SignificanceDescription( } } -RuntimeProfileBase::RuntimeProfileBase(ObjectPool* pool, const string& name) - : pool_(pool), name_(name) {} +RuntimeProfileBase::RuntimeProfileBase(ObjectPool* pool, const string& name, +Counter* total_time_counter, Counter* inactive_timer) + : pool_(pool), +name_(name), +total_time_counter_(total_time_counter), +inactive_timer_(inactive_timer) { + DCHECK(total_time_counter != nullptr); + DCHECK(inactive_timer != nullptr); + set& root_counters = child_counter_map_[ROOT_COUNTER]; + counter_map_[TOTAL_TIME_COUNTER_NAME] = total_time_counter; + root_counters.emplace(TOTAL_TIME_COUNTER_NAME); + counter_map_[INACTIVE_TIME_COUNTER_NAME] = inactive_timer; + 
root_counters.emplace(INACTIVE_TIME_COUNTER_NAME); +} RuntimeProfileBase::~RuntimeProfileBase() {} @@ -172,13 +184,8 @@ RuntimeProfile* RuntimeProfile::Create(ObjectPool* pool, const string& name) { } RuntimeProfile::RuntimeProfile(ObjectPool* pool, const string& name) - : RuntimeProfileBase(pool, name) { - set& root_counters = child_counter_map_[ROOT_COUNTER]; - counter_map_[TOTAL_TIME_COUNTER_NAME] = _total_time_; - root_counters.emplace(TOTAL_TIME_COUNTER_NAME); - counter_map_[INACTIVE_TIME_COUNTER_NAME] = _timer_; - root_counters.emplace(INACTIVE_TIME_COUNTER_NAME); -} + : RuntimeProfileBase( +pool, name, _counter_total_time_, _inactive_timer_) {} RuntimeProfile::~RuntimeProfile() { DCHECK(!has_active_periodic_counters_); @@ -2151,18 +2158,12 @@ void RuntimeProfile::EventSequence::ToJson(Document& document, Value* value) { AggregatedRuntimeProfile::AggregatedRuntimeProfile( ObjectPool* pool, const string& name, int num_input_profiles, bool is_root) - : RuntimeProfileBase(pool, name), num_input_profiles_(num_input_profiles) { + : RuntimeProfileBase(pool, name, +pool->Add(new AveragedCounter(TUnit::TIME_NS, num_input_profiles)), +pool->Add(new AveragedCounter(TUnit::TIME_NS, num_input_profiles))), +num_input_profiles_(num_input_profiles) { DCHECK_GE(num_input_profiles, 0); if (is_root) input_profile_names_.resize(num_input_profiles); - set& root_counters = child_counter_map_[ROOT_COUNTER]; - Counter* total_time_counter = - pool->Add(new AveragedCounter(TUnit::TIME_NS, num_input_profiles)); - Counter* inactive_timer = - pool->Add(new AveragedCounter(TUnit::TIME_NS, num_input_profiles)); - counter_map_[TOTAL_TIME_COUNTER_NAME] = total_time_counter; - root_counters.emplace(TOTAL_TIME_COUNTER_NAME); - counter_map_[INACTIVE_TIME_COUNTER_NAME] = inactive_timer; - root_counters.emplace(INACTIVE_TIME_COUNTER_NAME); } AggregatedRuntimeProfile* AggregatedRuntimeProfile::Create( diff --git a/be/src/util/runtime-profile.h b/be/src/util/runtime-profile.h index 
905c1ce..6c061b6 100644 --- a/be/src/util/runtime-profile.h +++ b/be/src/util/runtime-profile.h @@ -176,19 +176,9 @@ class RuntimeProfileBase { const TRuntimeProfileNodeMetadata& metadata() const { return metadata_; } /// Returns the counter for the total elapsed time. - Counter* total_time_counter() const { -auto it = counter_map_.find(TOTAL_TIME_COUNTER_NAME); -DCHECK(it != counter_map_.end()); -return it->second; - } - + Counter* total_time_counter() const { return total_time_counter_; } /// Returns the counter for the inactive time. - Counter* inactive_timer() const { -auto it = counter_map_.find(INACTIVE_TIME_COUNTER_NAME); -DCHECK(it != counter_map_.end()); -return it->
[impala] 01/03: IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated rows
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit fa525dfdf72f6f612821a14e683cd7f16d2c423a Author: Abhishek Rawat AuthorDate: Wed Nov 11 07:58:11 2020 -0800 IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated rows 'COMPUTE STATS TABLESAMPLE' uses a child query with following function 'ROUND(COUNT(*) / )' for computing the row count. The 'ROUND()' fn returns the row count as a DECIMAL type. The 'CatalogOpExecutor' (CatalogOpExecutor::SetTableStats) expects the row count as a BIGINT type. Due to this data type mismatch the table stats (Extrap #Rows) doesn't get set. Adding an explicit CAST to BIGINT for the ROUND function results in the table stats (Extrap #Rows) getting set properly. Fixed both 'custom_cluster/test_stats_extrapolation.py' and 'metadata/test_stats_extrapolation.py' so that they can catch issues like this, where table stats are not set when using 'COMPUTE STATS TABLESAMPLE'. Testing: - Ran core tests. Change-Id: I88a0a777c2be9cc18b3ff293cf1c06fb499ca052 Reviewed-on: http://gerrit.cloudera.org:8080/16712 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- .../org/apache/impala/analysis/ComputeStatsStmt.java| 8 +--- tests/common/impala_test_suite.py | 2 +- tests/custom_cluster/test_stats_extrapolation.py| 9 + tests/metadata/test_stats_extrapolation.py | 17 - 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java index 6bb2b17..906a972 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java @@ -322,7 +322,7 @@ public class ComputeStatsStmt extends StatementBase { * * 2. 
COMPUTE STATS with TABLESAMPLE * 2.1 Row counts: - * SELECT ROUND(COUNT(*) / ) + * SELECT CAST(ROUND(COUNT(*) / ) AS BIGINT) * FROM tbl TABLESAMPLE SYSTEM() REPEATABLE () * * 2.1 Column stats: @@ -544,8 +544,10 @@ public class ComputeStatsStmt extends StatementBase { StringBuilder tableStatsQueryBuilder = new StringBuilder("SELECT "); String countSql = "COUNT(*)"; if (isSampling()) { - // Extrapolate the count based on the effective sampling rate. - countSql = String.format("ROUND(COUNT(*) / %.10f)", effectiveSamplePerc_); + // Extrapolate the count based on the effective sampling rate. Add an explicit CAST + // to BIGINT, which is the expected data type for row count. + countSql = String.format("CAST(ROUND(COUNT(*) / %.10f) AS BIGINT)", +effectiveSamplePerc_); } List tableStatsSelectList = Lists.newArrayList(countSql); // Add group by columns for incremental stats or with extrapolation disabled. diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index 5d1a1fa..e5fcc70 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -930,7 +930,7 @@ class ImpalaTestSuite(BaseTestSuite): """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other, False otherwise. 
'diff_perc' must be a float in [0,1].""" if a == b: return True # Avoid division by 0 -assert abs(a - b) / float(max(a,b)) <= diff_perc +assert abs(a - b) / float(max(abs(a), abs(b))) <= diff_perc def _get_table_location(self, table_name, vector): """ Returns the HDFS location of the table """ diff --git a/tests/custom_cluster/test_stats_extrapolation.py b/tests/custom_cluster/test_stats_extrapolation.py index cd1accd..9b21921 100644 --- a/tests/custom_cluster/test_stats_extrapolation.py +++ b/tests/custom_cluster/test_stats_extrapolation.py @@ -48,8 +48,17 @@ class TestStatsExtrapolation(CustomClusterTestSuite): # Test COMPUTE STATS TABLESAMPLE part_test_tbl = unique_database + ".alltypes" self.clone_table("functional.alltypes", part_test_tbl, True, vector) +# Since our test tables are small, set the minimum sample size to 0 to make sure +# we exercise the sampling code paths. +self.client.execute("set COMPUTE_STATS_MIN_SAMPLE_SIZE=0") self.client.execute( "compute stats {0} tablesample system (13)".format(part_test_tbl)) +# Check that table stats were set. +table_stats = self.client.execute("show table stats {0}".format(part_test_tbl)) +col_names = [fs.name.upper() for fs in table_stats.schema.fieldSchemas] +extrap_rows_idx = col_names.index("EXTRAP #ROWS") +
[impala] 01/03: IMPALA-10310: Fix couldn't skip rows in parquet file on NextRowGroup
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 1fd5e4279c75a7cb3e51e737d9ca7c36435412dc Author: guojingfeng AuthorDate: Thu Nov 5 03:30:33 2020 + IMPALA-10310: Fix couldn't skip rows in parquet file on NextRowGroup In practice we recommend that hdfs block size should align with parquet row group size. But in fact in some compute engines like Spark, the default parquet row group size is 128MB, and if the ETL user doesn't change the default property Spark will generate row groups that are smaller than the hdfs block size. The result is that a single hdfs block may contain multiple parquet row groups. In the planner stage, the length of an Impala-generated scan range may be bigger than the row group size, thus a single scan range contains multiple row groups. In the current parquet scanner, when moving to the next row group, some internal state in the parquet column readers needs to be reset, e.g.: num_buffered_values_, column chunk metadata, and the internal state of the column chunk readers. But the current_row_range_ offset is not reset currently, which will cause errors "Couldn't skip rows in file hdfs://xxx" as IMPALA-10310 points out. This patch simply resets current_row_range_ to 0 when moving into the next row group in the parquet column readers. Fixes the bug IMPALA-10310. Testing: * Add e2e test for parquet multi blocks per file and multi pages per block * Ran all core tests offline. * Manually tested all cases encountered in my production environment. 
Change-Id: I964695cd53f5d5fdb6485a85cd82e7a72ca6092c Reviewed-on: http://gerrit.cloudera.org:8080/16697 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/parquet-column-readers.cc | 1 + testdata/data/README | 15 +++ testdata/data/customer_multiblock_page_index.parquet | Bin 0 -> 451607 bytes .../queries/QueryTest/parquet-page-index.test | 16 tests/query_test/test_parquet_stats.py | 2 ++ 5 files changed, 34 insertions(+) diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc index b00ba51..b3030d7 100644 --- a/be/src/exec/parquet/parquet-column-readers.cc +++ b/be/src/exec/parquet/parquet-column-readers.cc @@ -1225,6 +1225,7 @@ void BaseScalarColumnReader::ResetPageFiltering() { candidate_page_idx_ = -1; current_row_ = -1; levels_readahead_ = false; + current_row_range_ = 0; } Status BaseScalarColumnReader::StartPageFiltering() { diff --git a/testdata/data/README b/testdata/data/README index 3954d4b..5d24b22 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -592,3 +592,18 @@ int64_t r = random(); if (r % 2 + r % 3 + r % 5 == 0) return true; Also modified HdfsParquetTableWriter::BaseColumnWriter::Flush to randomly invalidate the offset index: if (r ... ) location.offset = -1; + +customer_multiblock_page_index.parquet +Parquet file that contains multiple blocks in a single file Needed to test IMPALA-10310. +In order to generate this file, execute the following instructions: +// use 1.11.0 to generate page index +1. export HIVE_AUX_JARS_PATH=/path/parquet-hadoop-1.11.0.jar +// in hive shell +2. SET parquet.block.size=8192; // use little block size +3. SET parquet.page.row.count.limit=10; // little page row count generate multi pages +4. 
CREATE TABLE customer_multiblock_page_index_6 + STORED AS PARQUET + TBLPROPERTIES('parquet.compression'='SNAPPY') + AS SELECT * FROM tpcds.customer + WHERE c_current_cdemo_sk IS NOT NULL ORDER BY c_current_cdemo_sk LIMIT 2000; +generated file will contain multiple blocks, multiple pages per block. diff --git a/testdata/data/customer_multiblock_page_index.parquet b/testdata/data/customer_multiblock_page_index.parquet new file mode 100644 index 000..21fa9a5 Binary files /dev/null and b/testdata/data/customer_multiblock_page_index.parquet differ diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test index 7c44e21..d3cbd9f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test @@ -300,3 +300,19 @@ BIGINT RUNTIME_PROFILE aggregation(SUM, NumStatsFilteredPages): 0 + QUERY +# Query table with multi blocks in a single file +select c_birth_country,count(distinct c_customer_id) from customer_multiblock_page_index +where c_current_cdemo_sk < 100 group by c_birth_country; + RESULTS +'SLOVAKIA',1 +'BRUNEI DARUSSALAM',1 +'BURKINA FASO',1 +'SIERRA LEONE',1 +'PORTUGAL',1 + TYPES +STRING, BIGINT + RUNTIME_PROFILE
[impala] 02/03: IMPALA-6861: Fix OpenSSL initialization
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 746c968ac372938d64298372a0790ec18135c889 Author: Thomas Tauber-Marshall AuthorDate: Tue Nov 3 14:22:08 2020 -0800 IMPALA-6861: Fix OpenSSL initialization Impalads currently initialize OpenSSL twice: once when initializing Thrift and once when initializing KRPC. The initialization is theoretically idempotent but not thread-safe, so its better to clean this up. This patch disables the Thrift version as its older (last updated in 2015) and the KRPC version contains logic specific to more recent versions of OpenSSL. The catalogd and statestored also now use the KRPC version instead of the Thrift version. It also improves debuggability by adding some additional startup logging. Testing: - Passed existing SSL tests. Change-Id: I35b1362d40c8a12082cc8b531a38b4a485bac0e7 Reviewed-on: http://gerrit.cloudera.org:8080/16704 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/rpc/authentication.cc | 6 +- be/src/rpc/authentication.h | 10 +- be/src/rpc/thrift-server.cc | 5 + 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/be/src/rpc/authentication.cc b/be/src/rpc/authentication.cc index 1b3f82c..1d7acca 100644 --- a/be/src/rpc/authentication.cc +++ b/be/src/rpc/authentication.cc @@ -43,6 +43,7 @@ #include "kudu/rpc/sasl_common.h" #include "kudu/security/gssapi.h" #include "kudu/security/init.h" +#include "kudu/security/openssl_util.h" #include "rpc/auth-provider.h" #include "rpc/authentication-util.h" #include "rpc/thrift-server.h" @@ -1112,7 +1113,10 @@ AuthManager::AuthManager() {} AuthManager::~AuthManager() {} Status AuthManager::Init() { - ssl_socket_factory_.reset(new TSSLSocketFactory(TLSv1_0)); + // Tell Thrift not to initialize SSL for us, as we use Kudu's SSL initializtion. 
+ TSSLSocketFactory::setManualOpenSSLInitialization(true); + kudu::security::InitializeOpenSSL(); + LOG(INFO) << "Initialized " << OPENSSL_VERSION_TEXT; bool use_ldap = false; diff --git a/be/src/rpc/authentication.h b/be/src/rpc/authentication.h index 96d091b..f48fdcd 100644 --- a/be/src/rpc/authentication.h +++ b/be/src/rpc/authentication.h @@ -52,7 +52,7 @@ class AuthManager { ~AuthManager(); /// Set up internal and external AuthProvider classes. This also initializes SSL (via - /// the creation of ssl_socket_factory_). + /// kudu::security::InitializeOpenSSL()). Status Init(); /// Returns the authentication provider to use for "external" communication @@ -81,14 +81,6 @@ class AuthManager { boost::scoped_ptr internal_auth_provider_; boost::scoped_ptr external_auth_provider_; - /// A thrift SSL socket factory must be created and live the lifetime of the process to - /// ensure that the thrift OpenSSL initialization code runs at Init(), and is not - /// unregistered (which thrift will do when the refcount of TSSLSocketFactory objects - /// reach 0), see IMPALA-4933. For simplicity, and because Kudu will expect SSL to be - /// initialized, this will be created regardless of whether or not SSL credentials are - /// specified. This factory isn't otherwise used. - boost::scoped_ptr ssl_socket_factory_; - /// Used to authenticate usernames and passwords to LDAP. 
std::unique_ptr ldap_; }; diff --git a/be/src/rpc/thrift-server.cc b/be/src/rpc/thrift-server.cc index abc3fcf..7d5a8c1 100644 --- a/be/src/rpc/thrift-server.cc +++ b/be/src/rpc/thrift-server.cc @@ -310,6 +310,11 @@ class ImpalaSslSocketFactory : public TSSLSocketFactory { : TSSLSocketFactory(version), password_(password) {} void ciphers(const string& enable) override { +if (ctx_.get() == nullptr) { + throw new TSSLException("ImpalaSslSocketFactory was not properly initialized."); +} +LOG(INFO) << "Enabling the following ciphers for the ImpalaSslSocketFactory: " + << enable; SCOPED_OPENSSL_NO_PENDING_ERRORS; TSSLSocketFactory::ciphers(enable);
[impala] branch master updated (5e38f32 -> 5a00a4c)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 5e38f32 IMPALA-10315: Update 'ICEBERG' InputFormat/OutputFormat/SerDe from latest Iceberg new 1fd5e42 IMPALA-10310: Fix couldn't skip rows in parquet file on NextRowGroup new 746c968 IMPALA-6861: Fix OpenSSL initialization new 5a00a4c IMPALA-10320: Specify expression selectivity for BoolLiteral. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/parquet/parquet-column-readers.cc | 1 + be/src/rpc/authentication.cc | 6 +- be/src/rpc/authentication.h| 10 +- be/src/rpc/thrift-server.cc| 5 + .../java/org/apache/impala/analysis/BoolLiteral.java | 3 +++ .../java/org/apache/impala/analysis/NullLiteral.java | 1 + .../apache/impala/analysis/ExprCardinalityTest.java| 9 +++-- testdata/data/README | 15 +++ testdata/data/customer_multiblock_page_index.parquet | Bin 0 -> 451607 bytes .../queries/QueryTest/parquet-page-index.test | 16 tests/query_test/test_parquet_stats.py | 2 ++ 11 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 testdata/data/customer_multiblock_page_index.parquet
[impala] 03/03: IMPALA-10320: Specify expression selectivity for BoolLiteral.
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 5a00a4c06f8ec40a8867dcbc036cf5bb47b8a3be Author: Shant Hovsepian AuthorDate: Thu Nov 12 01:56:52 2020 -0500 IMPALA-10320: Specify expression selectivity for BoolLiteral. Setting the selectivity of true and false literals to 1 and 0 respectively. Also setting a NullLiteral's selectivity to 0. Testing: - New tests in ExprCardinalityTest Change-Id: I42c96cd685f22d8634509d9f488f2a1f82b8 Reviewed-on: http://gerrit.cloudera.org:8080/16714 Tested-by: Impala Public Jenkins Reviewed-by: Tim Armstrong --- fe/src/main/java/org/apache/impala/analysis/BoolLiteral.java | 3 +++ fe/src/main/java/org/apache/impala/analysis/NullLiteral.java | 1 + .../java/org/apache/impala/analysis/ExprCardinalityTest.java | 9 +++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/BoolLiteral.java b/fe/src/main/java/org/apache/impala/analysis/BoolLiteral.java index 79ccc22..3b06e05 100644 --- a/fe/src/main/java/org/apache/impala/analysis/BoolLiteral.java +++ b/fe/src/main/java/org/apache/impala/analysis/BoolLiteral.java @@ -30,6 +30,7 @@ public class BoolLiteral extends LiteralExpr { public BoolLiteral(boolean value) { this.value_ = value; +this.selectivity_ = value ? 
1 : 0; type_ = Type.BOOLEAN; } @@ -37,8 +38,10 @@ public class BoolLiteral extends LiteralExpr { type_ = Type.BOOLEAN; if (value.toLowerCase().equals("true")) { this.value_ = true; + this.selectivity_ = 1; } else if (value.toLowerCase().equals("false")) { this.value_ = false; + this.selectivity_ = 0; } else { throw new AnalysisException("invalid BOOLEAN literal: " + value); } diff --git a/fe/src/main/java/org/apache/impala/analysis/NullLiteral.java b/fe/src/main/java/org/apache/impala/analysis/NullLiteral.java index 65ceefe..9a2f1e3 100644 --- a/fe/src/main/java/org/apache/impala/analysis/NullLiteral.java +++ b/fe/src/main/java/org/apache/impala/analysis/NullLiteral.java @@ -28,6 +28,7 @@ public class NullLiteral extends LiteralExpr { public NullLiteral() { type_ = Type.NULL; +this.selectivity_ = 0; } /** diff --git a/fe/src/test/java/org/apache/impala/analysis/ExprCardinalityTest.java b/fe/src/test/java/org/apache/impala/analysis/ExprCardinalityTest.java index deae557..0e517a5 100644 --- a/fe/src/test/java/org/apache/impala/analysis/ExprCardinalityTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/ExprCardinalityTest.java @@ -238,8 +238,9 @@ public class ExprCardinalityTest { // Note that the constant NULL has an NDV = 1, but // Null-only columns have an NDV=0... // See IMPALA-8058 -verifySelectExpr("alltypes", "NULL", 1, -1); -verifySelectExpr("alltypes", "true", 1, -1); +verifySelectExpr("alltypes", "NULL", 1, 0); +verifySelectExpr("alltypes", "true", 1, 1); +verifySelectExpr("alltypes", "false", 1, 0); } // Expression selectivity @@ -540,6 +541,10 @@ public class ExprCardinalityTest { 3, 1.0/2 + 1.0/10 - 1.0/2 * 1.0/10); // Chain of OR rewritten to IN verifySelectExpr("alltypes", "int_col = 10 or int_col = 20", 3, 2.0/10); +// Or with literals +verifySelectExpr("alltypes", "int_col = 10 or true", 3, 1.0); +verifySelectExpr("alltypes", "int_col = 10 or false", 3, 0.1); +verifySelectExpr("alltypes", "int_col = 10 or null", 3, 0.1); } /**
[impala] branch master updated (9d6bf35 -> 8243a97)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 9d6bf35 IMPALA-10145,IMPALA-10299: Bump impala-shell thrift version to 0.11.0-p4 new 6694d14 IMPALA-10305: Sync Kudu's FIPS compliant changes new 8243a97 Bump up CDP_BUILD_NUMBER to 6912987 The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/kudu/security/CMakeLists.txt| 2 +- be/src/kudu/security/ca/cert_management-test.cc| 3 +- be/src/kudu/security/crypto.cc | 13 be/src/kudu/security/openssl_util.cc | 80 +- be/src/kudu/security/openssl_util.h| 33 ++--- be/src/kudu/security/tls_context.cc| 20 +- be/src/kudu/security/tls_handshake.cc | 24 +-- be/src/kudu/security/token-test.cc | 2 +- be/src/kudu/util/flags.cc | 22 ++ be/src/kudu/util/flags.h | 2 + be/src/kudu/util/test_util.cc | 55 ++- be/src/kudu/util/test_util.h | 4 ++ bin/impala-config.sh | 24 --- .../queries/QueryTest/create-database.test | 18 +++-- .../queries/QueryTest/describe-db.test | 16 +++-- .../queries/QueryTest/describe-hive-db.test| 10 ++- tests/custom_cluster/test_event_processing.py | 1 + 17 files changed, 235 insertions(+), 94 deletions(-)
[impala] 02/02: Bump up CDP_BUILD_NUMBER to 6912987
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 8243a97ec2ff698e1cf605928d48055160111a09 Author: Zoltan Borok-Nagy AuthorDate: Mon Nov 9 17:19:50 2020 +0100 Bump up CDP_BUILD_NUMBER to 6912987 This change bumps up the CDP_BUILD_NUMBER to 6912987. The new CDP build includes Iceberg artifacts. The new Hive version has a few bugs that cause existing tests to fail. Unfortunately we can't expect them to be fixed soon in CDP Hive, so I adjusted the tests and added some TODO comments. Change-Id: Ide03d6b86043e72753485ff3d4056e0a1bb5c36f Reviewed-on: http://gerrit.cloudera.org:8080/16701 Reviewed-by: Tim Armstrong Tested-by: Impala Public Jenkins --- bin/impala-config.sh | 24 +- .../queries/QueryTest/create-database.test | 18 +++- .../queries/QueryTest/describe-db.test | 16 +++ .../queries/QueryTest/describe-hive-db.test| 10 +++-- tests/custom_cluster/test_event_processing.py | 1 + 5 files changed, 48 insertions(+), 21 deletions(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index faee6d3..d855b33 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -172,18 +172,22 @@ fi : ${IMPALA_TOOLCHAIN_HOST:=native-toolchain.s3.amazonaws.com} export IMPALA_TOOLCHAIN_HOST -export CDP_BUILD_NUMBER=4493826 +#TODO: On next CDP_BUILD_NUMBER bump check if the following issues are resolved +# and adjust existing tests that mentions them: +# * HIVE-23995 +# * HIVE-24175 +export CDP_BUILD_NUMBER=6912987 export CDP_MAVEN_REPOSITORY=\ "https://${IMPALA_TOOLCHAIN_HOST}/build/cdp_components/${CDP_BUILD_NUMBER}/maven; -export CDP_AVRO_JAVA_VERSION=1.8.2.7.2.1.0-287 -export CDP_HADOOP_VERSION=3.1.1.7.2.1.0-287 -export CDP_HBASE_VERSION=2.2.5.7.2.1.0-287 -export CDP_HIVE_VERSION=3.1.3000.7.2.1.0-287 -export CDP_KNOX_VERSION=1.3.0.7.2.1.0-287 -export CDP_OZONE_VERSION=0.6.0.7.2.1.0-287 -export CDP_PARQUET_VERSION=1.10.99.7.2.1.0-287 -export 
CDP_RANGER_VERSION=2.0.0.7.2.1.0-287 -export CDP_TEZ_VERSION=0.9.1.7.2.1.0-287 +export CDP_AVRO_JAVA_VERSION=1.8.2.7.2.7.0-30 +export CDP_HADOOP_VERSION=3.1.1.7.2.7.0-30 +export CDP_HBASE_VERSION=2.2.6.7.2.7.0-30 +export CDP_HIVE_VERSION=3.1.3000.7.2.7.0-30 +export CDP_KNOX_VERSION=1.3.0.7.2.7.0-30 +export CDP_OZONE_VERSION=1.0.0.7.2.7.0-30 +export CDP_PARQUET_VERSION=1.10.99.7.2.7.0-30 +export CDP_RANGER_VERSION=2.1.0.7.2.7.0-30 +export CDP_TEZ_VERSION=0.9.1.7.2.7.0-30 export ARCH_NAME=$(uname -p) diff --git a/testdata/workloads/functional-query/queries/QueryTest/create-database.test b/testdata/workloads/functional-query/queries/QueryTest/create-database.test index e617a16..2ac56d4 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/create-database.test +++ b/testdata/workloads/functional-query/queries/QueryTest/create-database.test @@ -17,6 +17,7 @@ STRING, STRING describe database $DATABASE_2 RESULTS '$DATABASE_2','$NAMENODE/$EXTERNAL_WAREHOUSE_DIR/$DATABASE_2.db','For testing' +'managedlocation:','$NAMENODE/$MANAGED_WAREHOUSE_DIR/$DATABASE_2.db','' TYPES string, string, string @@ -25,6 +26,7 @@ string, string, string describe database extended $DATABASE_2 RESULTS '$DATABASE_2','$NAMENODE/$EXTERNAL_WAREHOUSE_DIR/$DATABASE_2.db','For testing' +'managedlocation:','$NAMENODE/$MANAGED_WAREHOUSE_DIR/$DATABASE_2.db','' 'Owner: ','','' '','$USER','USER' TYPES @@ -68,6 +70,7 @@ create database if not exists $DATABASE_loc comment "For testing" describe database $DATABASE_loc RESULTS '$DATABASE_loc','$NAMENODE/$EXTERNAL_WAREHOUSE_DIR/specified_location','For testing' +'managedlocation:','$NAMENODE/$MANAGED_WAREHOUSE_DIR/$DATABASE_loc.db','' TYPES string, string, string @@ -76,6 +79,7 @@ string, string, string describe database extended $DATABASE_loc RESULTS '$DATABASE_loc','$NAMENODE/$EXTERNAL_WAREHOUSE_DIR/specified_location','For testing' +'managedlocation:','$NAMENODE/$MANAGED_WAREHOUSE_DIR/$DATABASE_loc.db','' 'Owner: ','','' '','$USER','USER' TYPES @@ 
-164,6 +168,10 @@ show databases like '$DATABASE_restrict' QUERY # Test CREATE DATABASE ... MANAGEDLOCATION +# +# TODO: Currently Hive ignores the specified managedlocation due to HIVE-24175, +# so we are using the default locations in the checks. Restore the specified +# location once HIVE-24175 is resolved. create database if not exists $DATABASE_loc comment "For testing" managedlocation '$NAMENODE/$MANAGED_WAREHOUSE_DIR/specified_managedlocation' RESULTS @@ -175,7 +183,7 @@ create database if not exists $DATABASE_loc comment "For testing" describe database $DATABASE_loc RESULTS '$DATABASE_loc','$NAMENODE/$EXTERNAL_WAREHOUSE_DIR/$DATABASE_loc.db','For test
[impala] branch master updated: IMPALA-10295: fix analytic limit pushdown with no predicates
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new c3b5cf8 IMPALA-10295: fix analytic limit pushdown with no predicates c3b5cf8 is described below commit c3b5cf8b4c807fa4136b64addb1e7e8be8aaf6c5 Author: Tim Armstrong AuthorDate: Thu Oct 8 17:53:33 2020 -0700 IMPALA-10295: fix analytic limit pushdown with no predicates This handles the first case where analytic limit pushdown could be applied incorrectly: when there are no predicates applied to the output of the analytic. If no rows are filtered out between the pre-analytic sort and the place where the top-N will be inserted, and the order matches exactly, we can push down the limit safely because the limit below the analytic will filter exactly the same rows as the limit above the analytic would. We add a helper to check if the sort order matches exactly and then handle the case with no select node correctly. We leave the other cases where there is a special predicate to be handled in the next patch of the series, as the logic there is a bit more subtle. Tests: Added regression planner and query tests that demonstrate the problem. Ran core tests. 
Change-Id: I254e85edd5ea6b6e76d20cbdf27fd88059a98a21 Reviewed-on: http://gerrit.cloudera.org:8080/16663 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../apache/impala/planner/AnalyticEvalNode.java| 77 ++--- .../PlannerTest/limit-pushdown-analytic.test | 183 +++-- .../tpch/queries/limit-pushdown-analytic.test | 17 ++ 3 files changed, 238 insertions(+), 39 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java index efc2273..458f228 100644 --- a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java +++ b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java @@ -20,33 +20,33 @@ package org.apache.impala.planner; import java.util.ArrayList; import java.util.List; -import org.apache.impala.common.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.impala.analysis.AnalyticExpr; import org.apache.impala.analysis.AnalyticWindow; import org.apache.impala.analysis.Analyzer; import org.apache.impala.analysis.BinaryPredicate; import org.apache.impala.analysis.BoolLiteral; import org.apache.impala.analysis.CompoundPredicate; +import org.apache.impala.analysis.CompoundPredicate.Operator; import org.apache.impala.analysis.Expr; import org.apache.impala.analysis.ExprSubstitutionMap; import org.apache.impala.analysis.FunctionCallExpr; import org.apache.impala.analysis.IsNullPredicate; +import org.apache.impala.analysis.NumericLiteral; import org.apache.impala.analysis.OrderByElement; import org.apache.impala.analysis.SlotDescriptor; import org.apache.impala.analysis.SlotRef; +import org.apache.impala.analysis.SortInfo; import org.apache.impala.analysis.TupleDescriptor; import org.apache.impala.analysis.TupleId; -import org.apache.impala.analysis.CompoundPredicate.Operator; -import org.apache.impala.analysis.NumericLiteral; -import org.apache.impala.analysis.SortInfo; +import 
org.apache.impala.common.Pair; import org.apache.impala.thrift.TAnalyticNode; import org.apache.impala.thrift.TExplainLevel; import org.apache.impala.thrift.TPlanNode; import org.apache.impala.thrift.TPlanNodeType; import org.apache.impala.thrift.TQueryOptions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.common.base.Joiner; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; @@ -420,31 +420,21 @@ public class AnalyticEvalNode extends PlanNode { return false; } -if (sortExprs.size() > 0 && pbExprs.size() > sortExprs.size()) return false; - Preconditions.checkArgument(analyticSortSortExprs.size() >= pbExprs.size()); // Check if pby exprs are a prefix of the top level sort exprs +// TODO: also check if subsequent expressions match. Need to check ASC and NULLS FIRST +// compatibility more explicitly in the case. if (sortExprs.size() == 0) { sortExprsForPartitioning.addAll(pbExprs); } else { - for (int i = 0; i < pbExprs.size(); i++) { -Expr pbExpr = pbExprs.get(i); -Expr sortExpr = sortExprs.get(i); -if (!(pbExpr instanceof SlotRef && sortExpr instanceof SlotRef)) return false; - -if (!((SlotRef) pbExpr).equals(((SlotRef) sortExpr))) { - // pby exprs are not a prefix of the top level sort exprs - return false; -} else { - // get the corresponding sort
[impala] branch master updated: IMPALA-10216: add logging to help debug flaky test
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new 227e43f IMPALA-10216: add logging to help debug flaky test 227e43f is described below commit 227e43f48147c8725100ddc05521bae07ee9becd Author: Tim Armstrong AuthorDate: Wed Oct 21 11:53:28 2020 -0700 IMPALA-10216: add logging to help debug flaky test This commit adds additional info to the assertions to help debug it if it reoccurs. Change-Id: I09984dd3cea686808115ca4cb8c88d24271d8cc1 Reviewed-on: http://gerrit.cloudera.org:8080/16620 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/runtime/bufferpool/buffer-pool-test.cc | 53 ++- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc index 4a9e7e8..37f21be 100644 --- a/be/src/runtime/bufferpool/buffer-pool-test.cc +++ b/be/src/runtime/bufferpool/buffer-pool-test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include "common/names.h" +using boost::algorithm::join; using boost::filesystem::directory_iterator; using std::mt19937; using std::uniform_int_distribution; @@ -362,6 +364,27 @@ class BufferPoolTest : public ::testing::Test { static string TmpFilePath(PageHandle* page) { return page->page_->write_handle->TmpFilePath(); } + + // Return a comma-separated string with the paths of the temporary file backing the + // pages. + static string TmpFilePaths(vector& pages) { +vector paths; +for (PageHandle& page : pages) { + paths.push_back(TmpFilePath()); +} +return join(paths, ","); + } + + // Return a string with the name of the directory and a comma-separated list of scratch + // files under the specified scratch directory. 
+ string DumpScratchDir(const string& tmp_dir_path) { +string scratch_dir_path = tmp_dir_path + SCRATCH_SUFFIX; +vector entries; +EXPECT_OK( +FileSystemUtil::Directory::GetEntryNames(scratch_dir_path, )); +return "Directory " + scratch_dir_path + ": " + join(entries, ","); + } + // Check that the file backing the page has dir as a prefix of its path. static bool PageInDir(PageHandle* page, const string& dir) { return TmpFilePath(page).find(dir) == 0; @@ -1712,7 +1735,8 @@ void BufferPoolTest::TestWriteErrorBlacklist( const string& good_dir = tmp_dirs[1]; // Delete one file from first scratch dir for first query to trigger an error. PageHandle* error_page = FindPageInDir(pages[ERROR_QUERY], error_dir); - ASSERT_TRUE(error_page != NULL) << "Expected a tmp file in dir " << error_dir; + ASSERT_TRUE(error_page != NULL) + << TmpFilePaths(pages[ERROR_QUERY]) << " not in " << DumpScratchDir(error_dir); const string& error_file_path = TmpFilePath(error_page); for (int i = 0; i < INITIAL_QUERIES; ++i) { ASSERT_OK(PinAll(, [i], [i])); @@ -1749,19 +1773,24 @@ void BufferPoolTest::TestWriteErrorBlacklist( , [ERROR_QUERY], TEST_BUFFER_LEN, MEM_PER_QUERY, _new_pages); UnpinAll(, [ERROR_QUERY], _new_pages); WaitForAllWrites([ERROR_QUERY]); - EXPECT_TRUE(FindPageInDir(error_new_pages, good_dir) != NULL); - EXPECT_TRUE(FindPageInDir(error_new_pages, error_dir) == NULL); + EXPECT_TRUE(FindPageInDir(error_new_pages, good_dir) != NULL) + << TmpFilePaths(error_new_pages) << " not in " << DumpScratchDir(good_dir); + EXPECT_TRUE(FindPageInDir(error_new_pages, error_dir) == NULL) + << TmpFilePaths(error_new_pages) << " in " << DumpScratchDir(error_dir); for (PageHandle& error_new_page : error_new_pages) { LOG(INFO) << "Newly created page backed by file " << TmpFilePath(_new_page); -EXPECT_TRUE(PageInDir(_new_page, good_dir)); +EXPECT_TRUE(PageInDir(_new_page, good_dir)) +<< TmpFilePath(_new_page) << " not in " << DumpScratchDir(good_dir); } DestroyAll(, [ERROR_QUERY], _new_pages); 
ASSERT_OK(PinAll(, [NO_ERROR_QUERY], [NO_ERROR_QUERY])); UnpinAll(, [NO_ERROR_QUERY], [NO_ERROR_QUERY]); WaitForAllWrites([NO_ERROR_QUERY]); - EXPECT_TRUE(FindPageInDir(pages[NO_ERROR_QUERY], good_dir) != NULL); - EXPECT_TRUE(FindPageInDir(pages[NO_ERROR_QUERY], error_dir) != NULL); + EXPECT_TRUE(FindPageInDir(pages[NO_ERROR_QUERY], good_dir) != NULL) + << TmpFilePaths(pages[NO_ERROR_QUERY]) << " not in " << DumpScratchDir(good_dir); + EXPECT_TRUE(FindPageInDir(pages[NO_ERROR_QUERY],
[impala] branch master updated (9384a18 -> 61a020d)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 9384a18 IMPALA-10257: Relax check for page filtering new 15c3b13 IMPALA-10219: Expose DEBUG_ACTION query option in catalog new 61a020d IMPALA-10007: Impala development environment does not support Ubuntu 20.04 The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/catalog-op-executor.cc | 1 + be/src/util/debug-util.cc | 4 + bin/bootstrap_system.sh| 2 +- bin/bootstrap_toolchain.py | 2 + common/thrift/CatalogService.thrift| 6 + .../impala/catalog/CatalogServiceCatalog.java | 12 +- .../apache/impala/catalog/FileMetadataLoader.java | 21 +++- .../java/org/apache/impala/catalog/HdfsTable.java | 74 +++ .../org/apache/impala/catalog/IcebergTable.java| 2 +- .../impala/catalog/ParallelFileMetadataLoader.java | 10 +- .../org/apache/impala/common/FileSystemUtil.java | 13 +- .../apache/impala/service/CatalogOpExecutor.java | 26 ++-- .../java/org/apache/impala/service/Frontend.java | 10 ++ .../java/org/apache/impala/util/DebugUtils.java| 137 + .../events/MetastoreEventsProcessorTest.java | 2 +- .../org/apache/impala/util/DebugUtilsTest.java | 67 ++ tests/common/impala_test_suite.py | 11 ++ tests/metadata/test_catalogd_debug_actions.py | 50 18 files changed, 392 insertions(+), 58 deletions(-) create mode 100644 fe/src/main/java/org/apache/impala/util/DebugUtils.java create mode 100644 fe/src/test/java/org/apache/impala/util/DebugUtilsTest.java create mode 100644 tests/metadata/test_catalogd_debug_actions.py
[impala] 02/02: IMPALA-10007: Impala development environment does not support Ubuntu 20.04
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 61a020d0f89ef1d6e736495aa2a98f7693dc7f39 Author: Qifan Chen AuthorDate: Fri Jul 24 16:51:25 2020 -0400 IMPALA-10007: Impala development environment does not support Ubuntu 20.04 This is a minor amendment to a previously merged change with ChangeId I4f592f60881fd8f34e2bf393a76f5a921505010a, to address additional review comments. In particular, the original commit referred to Ubuntu 20.4 whereas it should have used Ubuntu 20.04. Change-Id: I7db302b4f1d57ec9aa2100d7589d5e814db75947 Reviewed-on: http://gerrit.cloudera.org:8080/16241 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- bin/bootstrap_system.sh| 2 +- bin/bootstrap_toolchain.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh index bd448e0..f6c33a3 100755 --- a/bin/bootstrap_system.sh +++ b/bin/bootstrap_system.sh @@ -244,7 +244,7 @@ if [[ "$UBUNTU" == true ]]; then fi fi -# Ubuntu 18.04 or 20.04 install OpenJDK 11 and configure it as the default Java version. +# Ubuntu 18.04 and 20.04 install OpenJDK 11 and configure it as the default Java version. # Impala is currently tested with OpenJDK 8, so configure that version as the default. if [[ $ARCH_NAME == 'aarch64' ]]; then ubuntu20 sudo update-java-alternatives -s java-1.8.0-openjdk-arm64 diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index 71e81d6..494b97c 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -71,6 +71,8 @@ from string import Template # Maps return values from 'lsb_release -irs' to the corresponding OS labels for both the # toolchain and the CDP components. +# For Ubuntu20.04, the toolchain and CDP components to be mapped to are still 18.04 +# based, due to the unavailability of 20.04 parts on EC2. 
OsMapping = namedtuple('OsMapping', ['lsb_release', 'toolchain', 'cdh']) OS_MAPPING = [ OsMapping("centos5", "ec2-package-centos-5", None),
[impala] 01/02: IMPALA-10219: Expose DEBUG_ACTION query option in catalog
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 15c3b13e9730479e096275d974000ae9fe8fbb83 Author: Vihang Karajgaonkar AuthorDate: Sat Oct 3 17:01:35 2020 -0700 IMPALA-10219: Expose DEBUG_ACTION query option in catalog This patch enables DEBUG_ACTION in the catalog service's java code. Specifically, DEBUG_ACTION query option is now exposed to TResetMetadataRequest and TExecDdlRequest so that we can inject delays while executing refresh or ddl statements. For example, 1. To inject a delay of 100ms per HDFS list operation during refresh statement set the following query option: set debug_action=catalogd_refresh_hdfs_listing_delay:SLEEP@100; 2. To inject a delay of 100ms in alter table recover partitions statement: set debug_action=catalogd_table_recover_delay:SLEEP@100; 3. To inject a delay of 100ms in compute stats statement set debug_action=catalogd_update_stats_delay:SLEEP@100; Note that this option only adds the delay during the update_stats phase of the compute stats execution. Testing: 1. Added a test which sets the query option and makes sure that command takes more time than without query option. 2. Added unit tests for the debugAction implementation logic.
Change-Id: Ia7196b1ce76415a5faf3fa8575a26d22b2bf50b1 Reviewed-on: http://gerrit.cloudera.org:8080/16548 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/catalog-op-executor.cc | 1 + be/src/util/debug-util.cc | 4 + common/thrift/CatalogService.thrift| 6 + .../impala/catalog/CatalogServiceCatalog.java | 12 +- .../apache/impala/catalog/FileMetadataLoader.java | 21 +++- .../java/org/apache/impala/catalog/HdfsTable.java | 74 +++ .../org/apache/impala/catalog/IcebergTable.java| 2 +- .../impala/catalog/ParallelFileMetadataLoader.java | 10 +- .../org/apache/impala/common/FileSystemUtil.java | 13 +- .../apache/impala/service/CatalogOpExecutor.java | 26 ++-- .../java/org/apache/impala/service/Frontend.java | 10 ++ .../java/org/apache/impala/util/DebugUtils.java| 137 + .../events/MetastoreEventsProcessorTest.java | 2 +- .../org/apache/impala/util/DebugUtilsTest.java | 67 ++ tests/common/impala_test_suite.py | 11 ++ tests/metadata/test_catalogd_debug_actions.py | 50 16 files changed, 389 insertions(+), 57 deletions(-) diff --git a/be/src/exec/catalog-op-executor.cc b/be/src/exec/catalog-op-executor.cc index f22dfad..0ffd708 100644 --- a/be/src/exec/catalog-op-executor.cc +++ b/be/src/exec/catalog-op-executor.cc @@ -132,6 +132,7 @@ Status CatalogOpExecutor::ExecComputeStats( TDdlExecRequest& update_stats_req = catalog_op_req.ddl_params; update_stats_req.__set_ddl_type(TDdlType::ALTER_TABLE); update_stats_req.__set_sync_ddl(compute_stats_request.sync_ddl); + update_stats_req.__set_debug_action(compute_stats_request.ddl_params.debug_action); const TComputeStatsParams& compute_stats_params = compute_stats_request.ddl_params.compute_stats_params; diff --git a/be/src/util/debug-util.cc b/be/src/util/debug-util.cc index e176852..af10ddd 100644 --- a/be/src/util/debug-util.cc +++ b/be/src/util/debug-util.cc @@ -352,6 +352,10 @@ static bool ParseProbability(const string& prob_str, bool* should_execute) { return true; } +/// The catalog java code also 
implements a equivalent method for processing the debug +/// actions in the Java code. See DebugUtils.java for more details. Any changes to the +/// implementation logic here like adding a new type of action, should make changes in +/// the DebugUtils.java too. Status DebugActionImpl( const string& debug_action, const char* label, const std::vector& args) { const DebugActionTokens& action_list = TokenizeDebugActions(debug_action); diff --git a/common/thrift/CatalogService.thrift b/common/thrift/CatalogService.thrift index 5acdb7f..ad316b9 100644 --- a/common/thrift/CatalogService.thrift +++ b/common/thrift/CatalogService.thrift @@ -156,6 +156,9 @@ struct TDdlExecRequest { // Parameters for replaying an exported testcase. 25: optional JniCatalog.TCopyTestCaseReq copy_test_case_params + + // Passes the debug actions to catalogd if the query option is set. + 26: optional string debug_action } // Response from executing a TDdlExecRequest @@ -256,6 +259,9 @@ struct TResetMetadataRequest { // If set, refreshes partition objects which are modified externally. // Applicable only when refreshing the table. 9: optional bool refresh_updated_hms_partitions + + // debug_action is set from the query_option when available. + 10: opti
[impala] branch master updated: IMPALA-10225: bump impyla version to 0.17a1
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new b8a2b75 IMPALA-10225: bump impyla version to 0.17a1 b8a2b75 is described below commit b8a2b754669eb7f8d164e8112e594ac413e436ef Author: Tim Armstrong AuthorDate: Wed Oct 7 15:22:58 2020 -0700 IMPALA-10225: bump impyla version to 0.17a1 Update a couple of tests with the new improved error messages. Change-Id: I70a0e883275f3c29e2b01fd5bab7725857c8a1ed Reviewed-on: http://gerrit.cloudera.org:8080/16562 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- .../java/org/apache/impala/customcluster/LdapImpylaHttpTest.java| 6 -- infra/python/deps/compiled-requirements.txt | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fe/src/test/java/org/apache/impala/customcluster/LdapImpylaHttpTest.java b/fe/src/test/java/org/apache/impala/customcluster/LdapImpylaHttpTest.java index b551ec8..904b0d0 100644 --- a/fe/src/test/java/org/apache/impala/customcluster/LdapImpylaHttpTest.java +++ b/fe/src/test/java/org/apache/impala/customcluster/LdapImpylaHttpTest.java @@ -96,10 +96,12 @@ public class LdapImpylaHttpTest { RunShellCommand.Run(validCmd, /*shouldSucceed*/ true, testUser_, ""); // 2. Invalid username password combination. Should fail. String[] invalidCmd = buildCommand("foo", "bar", null); -RunShellCommand.Run(invalidCmd, /*shouldSucceed*/ false, "", "EOFError"); +RunShellCommand.Run( +invalidCmd, /*shouldSucceed*/ false, "", "HTTP code 401: Unauthorized"); // 3. Without username and password. Should fail. 
String[] noAuthCmd = {"impala-python", helper_, "--query", query_}; -RunShellCommand.Run(noAuthCmd, /*shouldSucceed*/ false, "", "EOFError"); +RunShellCommand.Run( +noAuthCmd, /*shouldSucceed*/ false, "", "HTTP code 401: Unauthorized"); } private String[] buildCommand(String user, String password, String httpPath) { diff --git a/infra/python/deps/compiled-requirements.txt b/infra/python/deps/compiled-requirements.txt index 061e286..0914bc3 100644 --- a/infra/python/deps/compiled-requirements.txt +++ b/infra/python/deps/compiled-requirements.txt @@ -19,7 +19,7 @@ # after the toolchain is bootstrapped. Installed after requirements.txt argparse == 1.4.0 -impyla == 0.16.2 +impyla == 0.17a1 bitarray == 1.2.1 sasl == 0.2.1 six == 1.14.0
[impala] branch master updated (c9f8d25 -> 5b720a4)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from c9f8d25 IMPALA-3335: Allow single-node optimization with joins new 6bb3b88 IMPALA-9180 (part 1): Remove legacy ImpalaInternalService new a0a25a6 IMPALA-10193: Limit the memory usage for the whole test cluster new 5b720a4 IMPALA-10164: Supporting HadoopCatalog for Iceberg table The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/generated-sources/gen-cpp/CMakeLists.txt| 1 - be/src/benchmarks/expr-benchmark.cc| 7 +- be/src/exprs/expr-test.cc | 4 +- be/src/exprs/utility-functions-ir.cc | 4 +- be/src/rpc/impala-service-pool.cc | 16 ++- be/src/rpc/impala-service-pool.h | 10 ++ be/src/rpc/rpc-mgr.cc | 7 + be/src/rpc/rpc-mgr.h | 3 + be/src/rpc/thrift-server-test.cc | 26 be/src/runtime/backend-client.h| 46 -- be/src/runtime/client-cache-types.h| 8 -- be/src/runtime/coordinator-backend-state.cc| 1 - be/src/runtime/data-stream-test.cc | 1 - be/src/runtime/exec-env.cc | 30 ++-- be/src/runtime/exec-env.h | 19 ++- be/src/runtime/fragment-instance-state.cc | 1 - be/src/runtime/fragment-instance-state.h | 1 - be/src/runtime/initial-reservations.cc | 4 +- be/src/runtime/query-exec-mgr.cc | 3 +- be/src/runtime/query-state.cc | 8 +- be/src/runtime/runtime-filter-bank.cc | 12 +- be/src/runtime/test-env.cc | 4 +- be/src/scheduling/scheduler-test-util.h| 1 - be/src/service/CMakeLists.txt | 1 - be/src/service/client-request-state.cc | 3 +- be/src/service/control-service.cc | 3 +- be/src/service/impala-internal-service.cc | 46 -- be/src/service/impala-internal-service.h | 40 -- be/src/service/impala-server.cc| 66 ++--- be/src/service/impala-server.h | 17 +-- be/src/service/impalad-main.cc | 6 +- 
be/src/service/session-expiry-test.cc | 1 - be/src/testutil/in-process-servers.cc | 32 ++--- be/src/testutil/in-process-servers.h | 7 +- be/src/util/debug-util.cc | 4 +- bin/generate_minidump_collection_testdata.py | 1 - bin/impala-config.sh | 3 + bin/start-impala-cluster.py| 6 +- common/thrift/CatalogObjects.thrift| 6 + common/thrift/ImpalaInternalService.thrift | 10 +- .../apache/impala/analysis/CreateTableStmt.java| 42 -- .../org/apache/impala/analysis/ToSqlUtils.java | 11 +- .../org/apache/impala/catalog/FeIcebergTable.java | 52 ++- .../org/apache/impala/catalog/IcebergTable.java| 32 - .../impala/catalog/local/LocalIcebergTable.java| 28 +++- .../org/apache/impala/planner/IcebergScanNode.java | 4 +- .../apache/impala/service/CatalogOpExecutor.java | 41 -- .../impala/service/IcebergCatalogOpExecutor.java | 43 +- .../java/org/apache/impala/util/IcebergUtil.java | 125 +++-- .../common/etc/hadoop/conf/yarn-site.xml.py| 3 +- ...2da0-b562-4310-9001-06f9b6b0f9ae-0.parquet} | Bin 1162 -> 1162 bytes ...aefa-65fc-4698-8f26-b155fc965cf6-0.parquet} | Bin 1162 -> 1162 bytes ...b016-05e1-43fc-b4a0-0e0df52a5035-0.parquet} | Bin 1162 -> 1162 bytes ...92523-c3b9-401d-b429-363c245dbe9c-0.parquet | Bin 0 -> 1161 bytes ...70cf-10a1-4e49-86dc-b094fe739aa6-0.parquet} | Bin 1162 -> 1162 bytes ...f86fa-286f-4cd3-8337-98685c48176d-0.parquet | Bin 0 -> 1161 bytes ...2bbc-46a2-4040-a4a8-7488447de3b6-0.parquet} | Bin 1162 -> 1162 bytes ...a250-ed1c-4868-bbf1-f2aad65fa80c-0.parquet} | Bin 1162 -> 1162 bytes ...7823-ded1-4a12-9e03-4027cd43966a-0.parquet} | Bin 1169 -> 1169 bytes ...d7a4-245f-44d5-8a59-ed511854c8f8-0.parquet} | Bin 1169 -> 1169 bytes ...5490-91f7-47bd-a3b6-e86caa7fe47d-0.parquet} | Bin 1169 -> 1169 bytes ...5fcf-4346-421f-b2ef-1f9d55fb4c84-0.parquet} | Bin 1169 -> 1169 bytes ...64ed-7a99-4f43-ada7-225c92f6a993-0.parquet} | Bin 1169 -> 1169 bytes ...c862-3d63-42cb-8041-0a0b14b8ca13-0.parquet} | Bin 1169 -> 1169 bytes ...8e68-c862-4248-b3e5
[impala] 03/03: IMPALA-10164: Supporting HadoopCatalog for Iceberg table
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 5b720a4d18cc2f2ade54ab223663521a3822343f Author: skyyws AuthorDate: Fri Sep 11 13:40:38 2020 +0800 IMPALA-10164: Supporting HadoopCatalog for Iceberg table This patch mainly realizes creating Iceberg table by HadoopCatalog. We only supported the HadoopTables API before this patch, but now we can use HadoopCatalog to create Iceberg table. When creating managed table, we can use SQL like this: CREATE TABLE default.iceberg_test ( level string, event_time timestamp, message string, ) STORED AS ICEBERG TBLPROPERTIES ('iceberg.catalog'='hadoop.catalog', 'iceberg.catalog_location'='hdfs://test-warehouse/iceberg_test'); We supported two values ('hadoop.catalog', 'hadoop.tables') for 'iceberg.catalog' now. If you don't specify this property in your SQL, default catalog type is 'hadoop.catalog'. As for external Iceberg table, you can use SQL like this: CREATE EXTERNAL TABLE default.iceberg_test_external STORED AS ICEBERG TBLPROPERTIES ('iceberg.catalog'='hadoop.catalog', 'iceberg.catalog_location'='hdfs://test-warehouse/iceberg_test', 'iceberg.table_identifier'='default.iceberg_test'); We cannot set table location for both managed and external Iceberg table with 'hadoop.catalog', and 'SHOW CREATE TABLE' will not display table location yet. We need to use 'DESCRIBE FORMATTED/EXTENDED' to get this location info. 'iceberg.catalog_location' is necessary for 'hadoop.catalog' table, which is used to store Iceberg table metadata and data, and we use this location to load table metadata from Iceberg. 'iceberg.table_identifier' is used for Iceberg TableIdentifier. If this property has not been specified in SQL, Impala will use database and table name to load Iceberg table, which is 'default.iceberg_test_external' in above SQL.
This property value is splitted by '.', you can alse set this value like this: 'org.my_db.my_tbl'. And this property is valid for both managed and external table. Testing: - Create table tests in functional_schema_template.sql - Iceberg table create test in test_iceberg.py - Iceberg table query test in test_scanners.py - Iceberg table show create table test in test_show_create_table.py Change-Id: Ic1893c50a633ca22d4bca6726c9937b026f5d5ef Reviewed-on: http://gerrit.cloudera.org:8080/16446 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- common/thrift/CatalogObjects.thrift| 6 + .../apache/impala/analysis/CreateTableStmt.java| 42 -- .../org/apache/impala/analysis/ToSqlUtils.java | 11 +- .../org/apache/impala/catalog/FeIcebergTable.java | 52 ++- .../org/apache/impala/catalog/IcebergTable.java| 32 - .../impala/catalog/local/LocalIcebergTable.java| 28 +++- .../org/apache/impala/planner/IcebergScanNode.java | 4 +- .../apache/impala/service/CatalogOpExecutor.java | 41 -- .../impala/service/IcebergCatalogOpExecutor.java | 43 +- .../java/org/apache/impala/util/IcebergUtil.java | 125 +++-- ...02da0-b562-4310-9001-06f9b6b0f9ae-0.parquet | Bin 0 -> 1162 bytes ...3aefa-65fc-4698-8f26-b155fc965cf6-0.parquet | Bin 0 -> 1162 bytes ...4b016-05e1-43fc-b4a0-0e0df52a5035-0.parquet | Bin 0 -> 1162 bytes ...92523-c3b9-401d-b429-363c245dbe9c-0.parquet | Bin 0 -> 1161 bytes ...370cf-10a1-4e49-86dc-b094fe739aa6-0.parquet | Bin 0 -> 1162 bytes ...f86fa-286f-4cd3-8337-98685c48176d-0.parquet | Bin 0 -> 1161 bytes ...d2bbc-46a2-4040-a4a8-7488447de3b6-0.parquet | Bin 0 -> 1162 bytes ...da250-ed1c-4868-bbf1-f2aad65fa80c-0.parquet | Bin 0 -> 1162 bytes ...77823-ded1-4a12-9e03-4027cd43966a-0.parquet | Bin 0 -> 1169 bytes ...8d7a4-245f-44d5-8a59-ed511854c8f8-0.parquet | Bin 0 -> 1169 bytes ...d5490-91f7-47bd-a3b6-e86caa7fe47d-0.parquet | Bin 0 -> 1169 bytes ...f5fcf-4346-421f-b2ef-1f9d55fb4c84-0.parquet | Bin 0 -> 1169 bytes ...c64ed-7a99-4f43-ada7-225c92f6a993-0.parquet | Bin 
0 -> 1169 bytes ...2c862-3d63-42cb-8041-0a0b14b8ca13-0.parquet | Bin 0 -> 1169 bytes ...88e68-c862-4248-b3e5-84228a3ec39d-0.parquet | Bin 0 -> 1190 bytes ...31dc0-b7eb-424d-9edb-dd2cedc59784-0.parquet | Bin 0 -> 1190 bytes ...1e5f3-cfa7-4190-bb30-0db1d53202fd-0.parquet | Bin 0 -> 1190 bytes ...b52b1-dc5b-4417-81b7-8e9fd992280b-0.parquet | Bin 0 -> 1190 bytes ...6ba9a-9387-4c38-bab8-a0598c400fde-0.parquet | Bin 0 -> 1190 bytes ...283a3-b39f-4273-984b-cf7faf39dd9d-0.parquet | Bin 0 -> 1190 bytes .../2c2fa00b-eb20-460a-835b-d69b32560e21-m0.avro | Bin 0 -> 5599 bytes ...465-1-
[impala] 01/03: IMPALA-9180 (part 1): Remove legacy ImpalaInternalService
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 6bb3b88d05f89fb7a1a54f302b4d329cbf4f69ec Author: wzhou-code AuthorDate: Tue Aug 4 17:03:46 2020 -0700 IMPALA-9180 (part 1): Remove legacy ImpalaInternalService The legacy Thrift based Impala internal service has been deprecated and can be removed now. This patch removes ImpalaInternalService. All infrastructure around it is cleaned up, except one place for flag be_port. StatestoreSubscriber::subscriber_id consists of be_port, but we cannot change the format of subscriber_id now. This remaining be_port issue will be fixed in a succeeding patch (part 4). TQueryCtx.coord_address is changed to TQueryCtx.coord_hostname since the port in TQueryCtx.coord_address is set as be_port and is unused now. Also rename TQueryCtx.coord_krpc_address to TQueryCtx.coord_ip_address. Testing: - Passed the exhaustive test. - Passed Quasar-L0 test. 
Change-Id: I5fa83c8009590124dded4783f77ef70fa30119e6 Reviewed-on: http://gerrit.cloudera.org:8080/16291 Reviewed-by: Thomas Tauber-Marshall Tested-by: Impala Public Jenkins --- be/generated-sources/gen-cpp/CMakeLists.txt | 1 - be/src/benchmarks/expr-benchmark.cc | 7 ++- be/src/exprs/expr-test.cc | 4 +- be/src/exprs/utility-functions-ir.cc | 4 +- be/src/rpc/impala-service-pool.cc | 16 +-- be/src/rpc/impala-service-pool.h | 10 be/src/rpc/rpc-mgr.cc | 7 +++ be/src/rpc/rpc-mgr.h | 3 ++ be/src/rpc/thrift-server-test.cc | 26 --- be/src/runtime/backend-client.h | 46 --- be/src/runtime/client-cache-types.h | 8 be/src/runtime/coordinator-backend-state.cc | 1 - be/src/runtime/data-stream-test.cc| 1 - be/src/runtime/exec-env.cc| 30 +--- be/src/runtime/exec-env.h | 19 be/src/runtime/fragment-instance-state.cc | 1 - be/src/runtime/fragment-instance-state.h | 1 - be/src/runtime/initial-reservations.cc| 4 +- be/src/runtime/query-exec-mgr.cc | 3 +- be/src/runtime/query-state.cc | 8 ++-- be/src/runtime/runtime-filter-bank.cc | 12 ++--- be/src/runtime/test-env.cc| 4 +- be/src/scheduling/scheduler-test-util.h | 1 - be/src/service/CMakeLists.txt | 1 - be/src/service/client-request-state.cc| 3 +- be/src/service/control-service.cc | 3 +- be/src/service/impala-internal-service.cc | 46 --- be/src/service/impala-internal-service.h | 40 be/src/service/impala-server.cc | 66 ++- be/src/service/impala-server.h| 17 ++- be/src/service/impalad-main.cc| 6 +-- be/src/service/session-expiry-test.cc | 1 - be/src/testutil/in-process-servers.cc | 32 + be/src/testutil/in-process-servers.h | 7 ++- be/src/util/debug-util.cc | 4 +- bin/generate_minidump_collection_testdata.py | 1 - common/thrift/ImpalaInternalService.thrift| 10 ++-- tests/custom_cluster/test_blacklist.py| 8 ++-- tests/custom_cluster/test_process_failures.py | 2 +- tests/custom_cluster/test_query_retries.py| 2 +- tests/custom_cluster/test_restart_services.py | 11 ++--- tests/webserver/test_web_pages.py | 4 +- 42 files changed, 130 
insertions(+), 351 deletions(-) diff --git a/be/generated-sources/gen-cpp/CMakeLists.txt b/be/generated-sources/gen-cpp/CMakeLists.txt index 56093f4..271dcb7 100644 --- a/be/generated-sources/gen-cpp/CMakeLists.txt +++ b/be/generated-sources/gen-cpp/CMakeLists.txt @@ -30,7 +30,6 @@ set(SRC_FILES CatalogService_types.cpp CatalogInternalService_constants.cpp CatalogInternalService_types.cpp - ImpalaInternalService.cpp ImpalaInternalService_constants.cpp ImpalaInternalService_types.cpp ImpalaService.cpp diff --git a/be/src/benchmarks/expr-benchmark.cc b/be/src/benchmarks/expr-benchmark.cc index be42114..689295f 100644 --- a/be/src/benchmarks/expr-benchmark.cc +++ b/be/src/benchmarks/expr-benchmark.cc @@ -40,10 +40,8 @@ #include "gen-cpp/Types_types.h" #include "gen-cpp/ImpalaService.h" #include "gen-cpp/ImpalaService_types.h" -#include "gen-cpp/ImpalaInternalService.h" #include "gen-cpp/Frontend_types.h" #include "gen-cpp/ImpalaService.h" -#include "gen-cpp/ImpalaInternalService.h" #include "gen-cpp/Frontend_types.h" #include "rpc/thrift-server.h" #include "codegen/llvm-codegen.h" @@ -81,8 +79,9 @@ cl
[impala] 02/03: IMPALA-10193: Limit the memory usage for the whole test cluster
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a0a25a61c302d864315daa7f09827b37a37419d5 Author: fifteencai AuthorDate: Wed Sep 30 13:03:08 2020 +0800 IMPALA-10193: Limit the memory usage for the whole test cluster This patch introduces a new approach of limiting the memory usage for both mini-cluster and CDH cluster. Without this limit, clusters are prone to getting killed when running in docker containers with a lower mem limit than the host's memory size. i.e. The mini-cluster may be running in a container with 32GB limited by CGROUPS, while the host machine has 128GB. Under this circumstance, if the container is started with '-privileged' command argument, both mini and CDH clusters compute their mem_limit according to 128GB rather than 32GB. They will be killed when attempting to apply for extra resource. Currently, the mem-limit estimating algorithms for Impalad and Node Manager are different: for Impalad: mem_limit = 0.7 * sys_mem / cluster_size (default is 3) for Node Manager: 1. Leave aside 24GB, then fit the rest into the thresholds below. 2. The bare limit is 4GB and maximum limit 48GB As a hedge against over-consumption, we - Added a new environment variable IMPALA_CLUSTER_MAX_MEM_GB - Modified the algorithm in 'bin/start-impala-cluster.py', making it take IMPALA_CLUSTER_MAX_MEM_GB rather than sys_mem into account. - Modified the logic in 'testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py' Similarly, substituting IMPALA_CLUSTER_MAX_MEM_GB for sys_mem. Testing: this patch worked in a 32GB docker container running on a 128GB host machine. All 1188 unit tests passed. 
Change-Id: I8537fd748e279d5a0e689872aeb4dbfd0c84dc93 Reviewed-on: http://gerrit.cloudera.org:8080/16522 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- bin/impala-config.sh| 3 +++ bin/start-impala-cluster.py | 6 -- .../cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index e0998c1..5d9b8a6 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -112,6 +112,9 @@ unset IMPALA_LLVM_URL export IMPALA_LLVM_ASAN_VERSION=5.0.1-p3 unset IMPALA_LLVM_ASAN_URL +# Maximum memory available for mini-cluster and CDH cluster +export IMPALA_CLUSTER_MAX_MEM_GB + # LLVM stores some files in subdirectories that are named after what # version it thinks it is. We might think it is 5.0.1-p1, based on a # patch we have applied, but LLVM thinks its version is 5.0.1. diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py index c708ce4..452700c 100755 --- a/bin/start-impala-cluster.py +++ b/bin/start-impala-cluster.py @@ -430,7 +430,7 @@ def build_kerberos_args(daemon): def compute_impalad_mem_limit(cluster_size): # Set mem_limit of each impalad to the smaller of 12GB or - # 1/cluster_size (typically 1/3) of 70% of system memory. + # 1/cluster_size (typically 1/3) of 70% of available memory. # # The default memory limit for an impalad is 80% of the total system memory. On a # mini-cluster with 3 impalads that means 240%. Since having an impalad be OOM killed @@ -442,7 +442,9 @@ def compute_impalad_mem_limit(cluster_size): # memory choice here to max out at 12GB. This should be sufficient for tests. # # Beware that ASAN builds use more memory than regular builds. 
- mem_limit = int(0.7 * psutil.virtual_memory().total / cluster_size) + physical_mem_gb = psutil.virtual_memory().total / 1024 / 1024 / 1024 + available_mem = int(os.getenv("IMPALA_CLUSTER_MAX_MEM_GB", str(physical_mem_gb))) + mem_limit = int(0.7 * available_mem * 1024 * 1024 * 1024 / cluster_size) return min(12 * 1024 * 1024 * 1024, mem_limit) class MiniClusterOperations(object): diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py index 0987925..b286da4 100644 --- a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py +++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py @@ -33,11 +33,12 @@ def _get_system_ram_mb(): def _get_yarn_nm_ram_mb(): sys_ram = _get_system_ram_mb() + available_ram_gb = int(os.getenv("IMPALA_CLUSTER_MAX_MEM_GB", str(sys_ram / 1024))) # Fit into the following envelope: # - need 4GB at a bare minimum # - leave at least 24G for other services # - don't need more than 48G - ret = min(max(sys_ram - 24 * 1024, 4096), 48 * 10
[impala] branch master updated (aeeff53 -> 28181cb)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from aeeff53 IMPALA-10196: Remove LlvmCodeGen::CastPtrToLlvmPtr new a973c77 Fix extra semicolon in LocalIcebergTable new 28181cb IMPALA-9930 (part 1): Initial refactor for admission control service The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/rpc/sidecar-util.h | 94 ++ be/src/runtime/coordinator-backend-state.cc| 37 +++-- be/src/runtime/coordinator.cc | 18 ++--- be/src/runtime/query-driver.cc | 10 +-- be/src/runtime/query-driver.h | 2 +- be/src/scheduling/CMakeLists.txt | 2 + .../admission-control-client.cc} | 14 ++-- be/src/scheduling/admission-control-client.h | 56 + be/src/scheduling/admission-controller.cc | 22 +++-- be/src/scheduling/admission-controller.h | 1 + .../scheduling/local-admission-control-client.cc | 53 be/src/scheduling/local-admission-control-client.h | 56 + be/src/scheduling/scheduler.cc | 35 be/src/scheduling/scheduler.h | 4 +- be/src/service/CMakeLists.txt | 1 - be/src/service/client-request-state.cc | 19 +++-- be/src/service/client-request-state.h | 16 ++-- be/src/service/control-service.cc | 15 +--- be/src/service/impala-server.cc| 3 +- be/src/service/impala-server.h | 3 +- be/src/service/query-driver-map.h | 44 -- be/src/util/CMakeLists.txt | 1 + .../sharded-query-map-util.cc} | 44 +++--- be/src/util/sharded-query-map-util.h | 82 +-- common/thrift/ImpalaService.thrift | 2 +- .../impala/catalog/local/LocalIcebergTable.java| 2 +- tests/custom_cluster/test_admission_controller.py | 6 +- tests/custom_cluster/test_query_retries.py | 4 +- tests/custom_cluster/test_restart_services.py | 2 +- tests/query_test/test_observability.py | 6 +- 30 files changed, 452 
insertions(+), 202 deletions(-) create mode 100644 be/src/rpc/sidecar-util.h copy be/src/{util/uid-util.cc => scheduling/admission-control-client.cc} (75%) create mode 100644 be/src/scheduling/admission-control-client.h create mode 100644 be/src/scheduling/local-admission-control-client.cc create mode 100644 be/src/scheduling/local-admission-control-client.h delete mode 100644 be/src/service/query-driver-map.h rename be/src/{service/query-driver-map.cc => util/sharded-query-map-util.cc} (53%)
[impala] 02/02: IMPALA-9930 (part 1): Initial refactor for admission control service
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 28181cbe6c59cc2f3da2246fd907d6bc69a5f7d2 Author: Thomas Tauber-Marshall AuthorDate: Wed Sep 2 12:22:30 2020 -0700 IMPALA-9930 (part 1): Initial refactor for admission control service This patch contains the following refactors that are needed for the admission control service, in order to make the main patch easier to review: - Adds a new class AdmissionControlClient which will be used to abstract the logic for submitting queries to either a local or remote admission controller out from ClientRequestState/Coordinator. Currently only local submission is supported. - SubmitForAdmission now takes a BackendId representing the coordinator instead of assuming that the local impalad will be the coordinator. - The CRS_BEFORE_ADMISSION debug action is moved into SubmitForAdmission() so that it will be executed on whichever daemon is performing admission control rather than always on the coordinator (needed for TestAdmissionController.test_cancellation). - ShardedQueryMap is extended to allow keys to be either TUniqueId or UniqueIdPB and Add(), Get(), and Delete() convenience functions are added. - Some utils related to serializing Thrift objects into sidecars are added. Testing: - Passed a run of existing core tests. 
Change-Id: I7974a979cf05ed569f31e1ab20694e29fd3e4508 Reviewed-on: http://gerrit.cloudera.org:8080/16411 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/rpc/sidecar-util.h | 94 ++ be/src/runtime/coordinator-backend-state.cc| 37 +++-- be/src/runtime/coordinator.cc | 18 ++--- be/src/runtime/query-driver.cc | 10 +-- be/src/runtime/query-driver.h | 2 +- be/src/scheduling/CMakeLists.txt | 2 + be/src/scheduling/admission-control-client.cc | 30 +++ be/src/scheduling/admission-control-client.h | 56 + be/src/scheduling/admission-controller.cc | 22 +++-- be/src/scheduling/admission-controller.h | 1 + .../scheduling/local-admission-control-client.cc | 53 be/src/scheduling/local-admission-control-client.h | 56 + be/src/scheduling/scheduler.cc | 35 be/src/scheduling/scheduler.h | 4 +- be/src/service/CMakeLists.txt | 1 - be/src/service/client-request-state.cc | 19 +++-- be/src/service/client-request-state.h | 16 ++-- be/src/service/control-service.cc | 15 +--- be/src/service/impala-server.cc| 3 +- be/src/service/impala-server.h | 3 +- be/src/service/query-driver-map.h | 44 -- be/src/util/CMakeLists.txt | 1 + .../sharded-query-map-util.cc} | 44 +++--- be/src/util/sharded-query-map-util.h | 82 +-- common/thrift/ImpalaService.thrift | 2 +- tests/custom_cluster/test_admission_controller.py | 6 +- tests/custom_cluster/test_query_retries.py | 4 +- tests/custom_cluster/test_restart_services.py | 2 +- tests/query_test/test_observability.py | 6 +- 29 files changed, 475 insertions(+), 193 deletions(-) diff --git a/be/src/rpc/sidecar-util.h b/be/src/rpc/sidecar-util.h new file mode 100644 index 000..47b15f6 --- /dev/null +++ b/be/src/rpc/sidecar-util.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "common/status.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/rpc_sidecar.h" +#include "kudu/util/faststring.h" +#include "rpc/thrift-util.h" +#include "util/kudu-status-util.h" + +DEC
[impala] 01/02: Fix extra semicolon in LocalIcebergTable
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a973c776f98273978256163a8db792d44275341b Author: Tim Armstrong AuthorDate: Tue Sep 29 09:17:42 2020 -0700 Fix extra semicolon in LocalIcebergTable This caused errors in my Eclipse setup. Change-Id: Iccc7d5fb967f66c8194678be0d7e074f8638a630 Reviewed-on: http://gerrit.cloudera.org:8080/16519 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java index 117a621..388e3a3 100644 --- a/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java @@ -36,7 +36,7 @@ import org.apache.impala.catalog.HdfsPartition.FileDescriptor; import org.apache.impala.catalog.TableLoadingException; import org.apache.impala.thrift.THdfsPartition; import org.apache.impala.thrift.THdfsTable; -import org.apache.impala.thrift.TIcebergFileFormat;; +import org.apache.impala.thrift.TIcebergFileFormat; import org.apache.impala.thrift.TTableDescriptor; import org.apache.impala.thrift.TTableType; import org.apache.impala.util.IcebergUtil;
[impala] branch master updated: IMPALA-10196: Remove LlvmCodeGen::CastPtrToLlvmPtr
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git The following commit(s) were added to refs/heads/master by this push: new aeeff53 IMPALA-10196: Remove LlvmCodeGen::CastPtrToLlvmPtr aeeff53 is described below commit aeeff53e884a67ee7f5980654a1d394c6e3e34ac Author: Daniel Becker AuthorDate: Tue Sep 22 09:31:34 2020 +0200 IMPALA-10196: Remove LlvmCodeGen::CastPtrToLlvmPtr LlvmCodeGen::CastPtrToLlvmPtr embeds a pointer that points to data in the current process's memory into codegen'd IR code. Our long term goal is to share the codegen'd IR among processes working on the same fragment, which is not possible if the IR contains pointers pointing to data of a specific process. A step in making the IR independent of the process generating it is removing LlvmCodeGen::CastPtrToLlvmPtr. Change-Id: I046a06fbf23629a90cc2cca164176a89e557c7c4 Reviewed-on: http://gerrit.cloudera.org:8080/16517 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/codegen/gen_ir_descriptions.py | 22 be/src/codegen/llvm-codegen-test.cc | 94 --- be/src/codegen/llvm-codegen.cc| 21 ++-- be/src/codegen/llvm-codegen.h | 8 --- be/src/exec/text-converter.cc | 6 +-- be/src/exprs/scalar-expr-ir.cc| 65 be/src/exprs/scalar-expr.cc | 33 be/src/exprs/scalar-expr.h| 5 -- be/src/exprs/scalar-fn-call.cc| 10 +--- testdata/llvm/test-loop.cc| 14 -- 10 files changed, 62 insertions(+), 216 deletions(-) diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py index b069e9c..83db8bb 100755 --- a/be/src/codegen/gen_ir_descriptions.py +++ b/be/src/codegen/gen_ir_descriptions.py @@ -75,28 +75,6 @@ ir_functions = [ "_Z14TimestampValEqRKN10impala_udf12TimestampValES2_"], ["CODEGEN_ANYVAL_TIMESTAMP_VALUE_EQ", "_Z16TimestampValueEqRKN10impala_udf12TimestampValERKN6impala14TimestampValueE"], - ["SCALAR_EXPR_GET_BOOLEAN_VAL", - 
"_ZN6impala10ScalarExpr24GetBooleanValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_TINYINT_VAL", - "_ZN6impala10ScalarExpr24GetTinyIntValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_SMALLINT_VAL", - "_ZN6impala10ScalarExpr25GetSmallIntValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_INT_VAL", - "_ZN6impala10ScalarExpr20GetIntValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_BIGINT_VAL", - "_ZN6impala10ScalarExpr23GetBigIntValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_FLOAT_VAL", - "_ZN6impala10ScalarExpr22GetFloatValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_DOUBLE_VAL", - "_ZN6impala10ScalarExpr23GetDoubleValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_STRING_VAL", - "_ZN6impala10ScalarExpr23GetStringValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_TIMESTAMP_VAL", - "_ZN6impala10ScalarExpr26GetTimestampValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_DECIMAL_VAL", - "_ZN6impala10ScalarExpr24GetDecimalValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], - ["SCALAR_EXPR_GET_DATE_VAL", - "_ZN6impala10ScalarExpr21GetDateValInterpretedEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], ["HASH_CRC", "IrCrcHash"], ["HASH_MURMUR", "IrMurmurHash"], ["PHJ_PROCESS_BUILD_BATCH", diff --git a/be/src/codegen/llvm-codegen-test.cc b/be/src/codegen/llvm-codegen-test.cc index 1ec4ca1..bd5a247 100644 --- a/be/src/codegen/llvm-codegen-test.cc +++ b/be/src/codegen/llvm-codegen-test.cc @@ -173,35 +173,33 @@ TEST_F(LlvmCodeGenTest, BadIRFile) { codegen->Close(); } -// IR for the generated linner loop -// define void @JittedInnerLoop() { +// IR for the generated linner loop: +// define void @JittedInnerLoop(i64* %counter) { // entry: -// call void 
@DebugTrace(i8* inttoptr (i64 18970856 to i8*)) -// %0 = load i64* inttoptr (i64 140735197627800 to i64*) -// %1 = add i64 %0, -// store i64 %1, i64* inttoptr (i64 140735197627800 to i64*) +// %0 = call i32 (i8*, ...) @printf( +// i8* getelementptr inbounds ([19 x i8], [19 x i8]* @0, i32 0, i32 0)) +// %1 = load i64, i64* %counter +// %2 = add i64 %1, 1 +// store i64 %2, i64* %counter
[impala] branch master updated (ee9904b -> 0e96ee8)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from ee9904b IMPALA-10175: Extend error msg when CAST(FORMAT) fails for DATE new e53d649 IMPALA-9664: Support hive replication new 0e96ee8 IMPALA-9923: Load ORC files as full ACID only in workload 'functional-query' The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/service/client-request-state.cc | 1 + common/thrift/CatalogService.thrift| 5 +- .../org/apache/impala/compat/MetastoreShim.java| 90 -- .../java/org/apache/impala/catalog/HdfsTable.java | 4 +- .../catalog/HiveStorageDescriptorFactory.java | 3 + .../catalog/events/MetastoreEventsProcessor.java | 2 +- .../apache/impala/service/CatalogOpExecutor.java | 310 ++--- .../java/org/apache/impala/util/AcidUtils.java | 41 +++ .../java/org/apache/impala/util/MetaStoreUtil.java | 61 ++-- .../events/MetastoreEventsProcessorTest.java | 238 +--- .../SynchronousHMSEventProcessorForTests.java | 32 +++ fe/src/test/resources/hive-site.xml.py | 6 + testdata/bin/generate-schema-statements.py | 12 +- .../queries/PlannerTest/resource-requirements.test | 24 +- tests/custom_cluster/test_event_processing.py | 180 +++- 15 files changed, 809 insertions(+), 200 deletions(-)
[impala] 02/02: IMPALA-9923: Load ORC files as full ACID only in workload 'functional-query'
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 0e96ee8e99359efe413aec29c1cad03859d268a1 Author: Zoltan Borok-Nagy AuthorDate: Mon Sep 28 13:09:57 2020 +0200 IMPALA-9923: Load ORC files as full ACID only in workload 'functional-query' HIVE-24145 still causes data load failures quite frequently. The failure usually occurs during TPC-DS loading. I modified generate-schema-statements.py to only load ORC tables as full ACID in the 'functional-query' workload. Since this workload contains the ACID-specific tests, we should still have enough coverage for ORC/ACID testing. Testing * Ran exhaustive tests successfully Change-Id: I0c81aedd3be314819dc4bc5bebec17bad3d03b10 Reviewed-on: http://gerrit.cloudera.org:8080/16511 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- testdata/bin/generate-schema-statements.py | 12 --- .../queries/PlannerTest/resource-requirements.test | 24 +++--- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 441fd36..a69fe9e 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -549,7 +549,7 @@ def eval_section(section_str): def generate_statements(output_name, test_vectors, sections, schema_include_constraints, schema_exclude_constraints, -schema_only_constraints): +schema_only_constraints, convert_orc_to_full_acid): # TODO: This method has become very unwieldy. It has to be re-factored sooner than # later. # Parquet statements to be executed separately by Impala @@ -656,7 +656,8 @@ def generate_statements(output_name, test_vectors, sections, tblproperties = parse_table_properties(create_file_format, table_properties) # ORC tables are full ACID by default. 
- if (HIVE_MAJOR_VERSION == 3 and + if (convert_orc_to_full_acid and + HIVE_MAJOR_VERSION == 3 and create_file_format == 'orc' and 'transactional' not in tblproperties): tblproperties['transactional'] = 'true' @@ -816,6 +817,10 @@ if __name__ == "__main__": test_vectors =\ [TableFormatInfo.create_from_string(dataset, tf) for tf in table_formats] + # Hack to resolve IMPALA-9923. + # TODO: Try to remove it once we have HIVE-24145 in the dev environment. + convert_orc_to_full_acid = options.workload == 'functional-query' + target_dataset = test_vectors[0].dataset print 'Target Dataset: ' + target_dataset dataset_load_dir = os.path.join(SQL_OUTPUT_DIR, target_dataset) @@ -847,4 +852,5 @@ if __name__ == "__main__": parse_table_constraints(constraints_file) sections = parse_schema_template_file(schema_template_file) generate_statements('%s-%s' % (options.workload, options.exploration_strategy), - test_vectors, sections, include_constraints, exclude_constraints, only_constraints) + test_vectors, sections, include_constraints, exclude_constraints, only_constraints, + convert_orc_to_full_acid) diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test index 1ea06fb..9d97d5d 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test @@ -1465,22 +1465,22 @@ select * from tpch_orc_def.lineitem 3 PLAN Max Per-Host Resource Reservation: Memory=8.00MB Threads=2 -Per-Host Resource Estimates: Memory=24MB +Per-Host Resource Estimates: Memory=88MB Analyzed query: SELECT * FROM tpch_orc_def.lineitem F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 -| Per-Host Resources: mem-estimate=24.00MB mem-reservation=8.00MB thread-reservation=2 +| Per-Host Resources: mem-estimate=88.00MB mem-reservation=8.00MB thread-reservation=2 
PLAN-ROOT SINK | output exprs: tpch_orc_def.lineitem.l_orderkey, tpch_orc_def.lineitem.l_partkey, tpch_orc_def.lineitem.l_suppkey, tpch_orc_def.lineitem.l_linenumber, tpch_orc_def.lineitem.l_quantity, tpch_orc_def.lineitem.l_extendedprice, tpch_orc_def.lineitem.l_discount, tpch_orc_def.lineitem.l_tax, tpch_orc_def.lineitem.l_returnflag, tpch_orc_def.lineitem.l_linestatus, tpch_orc_def.lineitem.l_shipdate, tpch_orc_def.lineitem.l_commitdate, tpch_orc_def.lineitem.l_receiptdate, tpch_orc_def.lineitem.l_ [...] | mem-estimate=0B mem-reservation=0B thread-reservation=0 | 00:SCAN HDFS [tpch_orc_def.lineitem] - HDFS partitions=1/1 files=12 size=142.90MB + HDFS partitions=1/1 files=1 size=142.84MB