[impala] branch master updated (eb85c6e -> 08367e9)

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


from eb85c6e  IMPALA-9793: Impala quickstart cluster with docker-compose
 new f4584dd  IMPALA-10404: Update docs to reflect RLE_DICTIONARY support
 new 08367e9  IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/shared/impala_common.xml  |  5 +--
 .../topics/impala_parquet_dictionary_filtering.xml |  8 +++--
 .../apache/impala/analysis/CreateTableStmt.java| 25 ++-
 .../queries/QueryTest/iceberg-create.test  | 37 ++
 .../queries/QueryTest/iceberg-negative.test| 18 +++
 .../queries/QueryTest/show-create-table.test   | 19 +++
 6 files changed, 106 insertions(+), 6 deletions(-)



[impala] 02/02: IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 08367e91f04508b54f77b56e0d211dd167b0116f
Author: Zoltan Borok-Nagy 
AuthorDate: Mon Jan 25 16:09:59 2021 +0100

IMPALA-10452: CREATE Iceberg tables with old PARTITIONED BY syntax

For convenience, this patch adds support for the old-style
CREATE TABLE ... PARTITIONED BY ... syntax for Iceberg tables.

With this patch, users can write the following:

CREATE TABLE ice_t (i int)
PARTITIONED BY (p int)
STORED AS ICEBERG;

This is equivalent to:

CREATE TABLE ice_t (i int, p int)
PARTITION BY SPEC (p IDENTITY)
STORED AS ICEBERG;

Note that the old-style CREATE TABLE statement creates
IDENTITY-partitioned tables. For other partition transforms,
users must use the new, more generic syntax.

Hive also supports the old PARTITIONED BY syntax with the same
behavior.
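
As an illustration only (plain Python, not Impala code; the function and
variable names below are hypothetical), here is a minimal sketch of the
rewrite described above:

# Hypothetical sketch of the old-style PARTITIONED BY rewrite for Iceberg
# tables; mirrors the behaviour described above, not Impala source code.
def rewrite_old_style_partitioning(columns, partition_columns):
    # Each old-style partition column becomes an IDENTITY-transform field
    # in the Iceberg partition spec.
    partition_spec = [(name, "IDENTITY") for name, _type in partition_columns]
    # The partition columns are appended to the regular column list.
    all_columns = columns + partition_columns
    return all_columns, partition_spec

# For the example above: CREATE TABLE ice_t (i int) PARTITIONED BY (p int)
cols, spec = rewrite_old_style_partitioning([("i", "int")], [("p", "int")])
# cols == [("i", "int"), ("p", "int")], spec == [("p", "IDENTITY")]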

Testing:
 * added e2e tests

Change-Id: I789876c161bc0987820955aa9ae01414e0dcb45d
Reviewed-on: http://gerrit.cloudera.org:8080/16979
Reviewed-by: Impala Public Jenkins 
Tested-by: Impala Public Jenkins 
---
 .../apache/impala/analysis/CreateTableStmt.java| 25 ++-
 .../queries/QueryTest/iceberg-create.test  | 37 ++
 .../queries/QueryTest/iceberg-negative.test| 18 +++
 .../queries/QueryTest/show-create-table.test   | 19 +++
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
index 5e89724..1477536 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
@@ -37,6 +37,7 @@ import org.apache.impala.service.BackendConfig;
 import org.apache.impala.thrift.TCreateTableParams;
 import org.apache.impala.thrift.THdfsFileFormat;
 import org.apache.impala.thrift.TIcebergCatalog;
+import org.apache.impala.thrift.TIcebergPartitionTransformType;
 import org.apache.impala.thrift.TSortingOrder;
 import org.apache.impala.thrift.TTableName;
 import org.apache.impala.util.AvroSchemaConverter;
@@ -271,8 +272,8 @@ public class CreateTableStmt extends StatementBase {
 }
 
 if (getFileFormat() == THdfsFileFormat.ICEBERG) {
-  analyzeIcebergFormat(analyzer);
   analyzeIcebergColumns();
+  analyzeIcebergFormat(analyzer);
 } else {
  List<IcebergPartitionSpec> iceSpec = tableDef_.getIcebergPartitionSpecs();
   if (iceSpec != null && !iceSpec.isEmpty()) {
@@ -703,10 +704,32 @@ public class CreateTableStmt extends StatementBase {
* Iceberg field
*/
   private void analyzeIcebergColumns() {
+if (!getPartitionColumnDefs().isEmpty()) {
+  createIcebergPartitionSpecFromPartitionColumns();
+}
 for (ColumnDef def : getColumnDefs()) {
   if (!def.isNullabilitySet()) {
 def.setNullable(true);
   }
 }
   }
+
+  /**
+   * Creates Iceberg partition spec from partition columns. Needed to support old-style
+   * CREATE TABLE .. PARTITIONED BY (<cols>) syntax. In this case the column list in
+   * 'cols' is appended to the table-level columns, but also Iceberg-level IDENTITY
+   * partitions are created from this list.
+   */
+  private void createIcebergPartitionSpecFromPartitionColumns() {
+Preconditions.checkState(!getPartitionColumnDefs().isEmpty());
+Preconditions.checkState(getIcebergPartitionSpecs().isEmpty());
+List<IcebergPartitionField> partFields = new ArrayList<>();
+for (ColumnDef colDef : getPartitionColumnDefs()) {
+  partFields.add(new IcebergPartitionField(colDef.getColName(),
+  new IcebergPartitionTransform(TIcebergPartitionTransformType.IDENTITY)));
+}
+getIcebergPartitionSpecs().add(new IcebergPartitionSpec(partFields));
+getColumnDefs().addAll(getPartitionColumnDefs());
+getPartitionColumnDefs().clear();
+  }
 }
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test
index 4b7daf9..eba73d8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-create.test
@@ -445,3 +445,40 @@ DESCRIBE iceberg_nullable_test;
---- TYPES
 STRING,STRING,STRING,STRING
 
+---- QUERY
+CREATE TABLE iceberg_old_style_partitions (
+  register_time DATE,
+  message STRING,
+  price DECIMAL(8,1),
+  map_test MAP >,
+  struct_test STRUCT 
+)
+PARTITIONED BY (
+  level STRING,
+  event_id INT
+)
+STORED AS ICEBERG;
+---- RESULTS
+'Table has been created.'
+
+---- QUERY
+DESCRIBE iceberg_old_style_partitions;
+---- RESULTS
+'register_time','date','','true'
+'message','string','','true'

[impala] 01/02: IMPALA-10404: Update docs to reflect RLE_DICTIONARY support

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f4584dd2763edda9a24e4466b2d0a8f4bb065437
Author: Tim Armstrong 
AuthorDate: Mon Jan 25 14:41:53 2021 -0800

IMPALA-10404: Update docs to reflect RLE_DICTIONARY support

Fix references to PLAIN_DICTIONARY to reflect that
RLE_DICTIONARY is supported too.

Change-Id: Iee98abfd760396cf43302c9077c6165eb3623335
Reviewed-on: http://gerrit.cloudera.org:8080/16982
Reviewed-by: Tim Armstrong 
Tested-by: Impala Public Jenkins 
---
 docs/shared/impala_common.xml   | 5 +++--
 docs/topics/impala_parquet_dictionary_filtering.xml | 8 +---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 6b0e812..f18ca00 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -3175,8 +3175,9 @@ flight_num:   INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301
 
   
 Impala can query Parquet files that use the PLAIN,
-PLAIN_DICTIONARY, BIT_PACKED, and RLE
-encodings. Currently, Impala does not support RLE_DICTIONARY encoding.
+PLAIN_DICTIONARY, BIT_PACKED, RLE
+and RLE_DICTIONARY encodings. RLE_DICTIONARY is supported
+only in  and up.
 When creating files outside of Impala for use by Impala, make sure to use one of the
 supported encodings. In particular, for MapReduce jobs,
 parquet.writer.version must not be defined (especially as
diff --git a/docs/topics/impala_parquet_dictionary_filtering.xml b/docs/topics/impala_parquet_dictionary_filtering.xml
index 3460f2a..2d68b4d 100644
--- a/docs/topics/impala_parquet_dictionary_filtering.xml
+++ b/docs/topics/impala_parquet_dictionary_filtering.xml
@@ -58,7 +58,8 @@ under the License.
 
   If the encoding_stats is in the Parquet file, dictionary filtering
   uses it to determine if there are only dictionary encoded pages (i.e. there are no
-  data pages with an encoding other than PLAIN_DICTIONARY).
+  data pages with an encoding other than RLE_DICTIONARY or
+  PLAIN_DICTIONARY).
 
 
 
@@ -66,11 +67,12 @@ under the License.
   The column is purely dictionary encoded if both of the conditions satisfy:
   
 
-  PLAIN_DICTIONARY is present.
+  PLAIN_DICTIONARY or RLE_DICTIONARY is present.
 
 
 
-  Only PLAIN_DICTIONARY, RLE, or BIT_PACKED encodings are listed.
+  Only PLAIN_DICTIONARY, RLE_DICTIONARY,
+  RLE, or BIT_PACKED encodings are listed.
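
For illustration, a rough Python sketch of the "purely dictionary encoded"
condition described above (hypothetical names; not Impala's actual
implementation):

# Hypothetical sketch of the dictionary-filtering eligibility check.
DICTIONARY_ENCODINGS = {"PLAIN_DICTIONARY", "RLE_DICTIONARY"}
ALLOWED_ENCODINGS = DICTIONARY_ENCODINGS | {"RLE", "BIT_PACKED"}

def is_purely_dictionary_encoded(listed_encodings):
    """listed_encodings: encodings listed for a column in the file metadata."""
    encodings = set(listed_encodings)
    has_dictionary = bool(encodings & DICTIONARY_ENCODINGS)
    only_allowed = encodings <= ALLOWED_ENCODINGS
    return has_dictionary and only_allowed

print(is_purely_dictionary_encoded({"PLAIN_DICTIONARY", "RLE"}))    # True
print(is_purely_dictionary_encoded({"PLAIN_DICTIONARY", "PLAIN"}))  # False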
 
   
 



[impala] branch master updated (e8720b4 -> eb85c6e)

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


from e8720b4  IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions
 new 3b763b5  IMPALA-10447: Add a newline when exporting shell output to a file.
 new eb85c6e  IMPALA-9793: Impala quickstart cluster with docker-compose

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docker/CMakeLists.txt  |   36 +-
 docker/README.md   |  144 ++-
 docs/build-doc.sh => docker/docker-build.sh|   29 +-
 docker/impala_base/Dockerfile  |   10 +
 docker/quickstart-kudu-minimal.yml |  128 ++
 .../quickstart-load-data.yml   |   34 +-
 docker/quickstart.yml  |  104 ++
 docker/quickstart_client/Dockerfile|   70 ++
 docker/quickstart_client/data-load-entrypoint.sh   |   86 ++
 .../quickstart_client/load_tpcds_kudu.sql  |  316 +++--
 docker/quickstart_client/load_tpcds_parquet.sql| 1248 
 docker/quickstart_conf/hive-site.xml   |   74 ++
 docker/quickstart_hms/Dockerfile   |   67 ++
 docker/quickstart_hms/hms-entrypoint.sh|   68 ++
 shell/shell_output.py  |1 +
 tests/shell/test_shell_commandline.py  |   28 +
 16 files changed, 2250 insertions(+), 193 deletions(-)
 copy docs/build-doc.sh => docker/docker-build.sh (54%)
 create mode 100644 docker/quickstart-kudu-minimal.yml
 copy be/src/catalog/CMakeLists.txt => docker/quickstart-load-data.yml (58%)
 create mode 100644 docker/quickstart.yml
 create mode 100644 docker/quickstart_client/Dockerfile
 create mode 100755 docker/quickstart_client/data-load-entrypoint.sh
 copy testdata/datasets/tpcds/tpcds_kudu_template.sql => docker/quickstart_client/load_tpcds_kudu.sql (68%)
 create mode 100644 docker/quickstart_client/load_tpcds_parquet.sql
 create mode 100644 docker/quickstart_conf/hive-site.xml
 create mode 100644 docker/quickstart_hms/Dockerfile
 create mode 100755 docker/quickstart_hms/hms-entrypoint.sh



[impala] 01/02: IMPALA-10447: Add a newline when exporting shell output to a file.

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3b763b5c3235ebdb445ad0a4ae1bf79385e8df02
Author: Andrew Sherman 
AuthorDate: Mon Jan 18 19:10:11 2021 -0800

IMPALA-10447: Add a newline when exporting shell output to a file.

Impala shell outputs a batch of rows using OutputStream. Inside
OutputStream, output to a file is handled slightly differently from
output that is written to stdout. When writing to stdout we use print()
(which appends a newline) while when writing to a file we use write()
(which adds nothing). This difference was introduced in IMPALA-3343, so
this bug may be a regression introduced then. To ensure that output is
the same in either case we need to add a newline after writing each
batch of rows to a file.
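
A minimal sketch of the difference (plain Python with hypothetical file and
variable names, not the actual shell_output.py code):

import sys

# A formatted batch of rows, as the shell would produce for one fetch.
formatted_data = "1;foo\n2;bar"

# stdout path: print() appends a trailing newline after each batch.
print(formatted_data, file=sys.stdout)

# file path: write() appends nothing, so consecutive batches would run
# together unless a newline is written explicitly (the fix in this patch).
with open("/tmp/impala_shell_output_example.txt", "wb") as out_file:
    out_file.write(formatted_data.encode('utf-8'))
    out_file.write(b'\n')  # the extra newline added by IMPALA-10447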

TESTING:
Added a new test for this case.

Change-Id: I078a06c54e0834bc1f898626afbfff4ded579fa9
Reviewed-on: http://gerrit.cloudera.org:8080/16966
Reviewed-by: Impala Public Jenkins 
Tested-by: Impala Public Jenkins 
---
 shell/shell_output.py |  1 +
 tests/shell/test_shell_commandline.py | 28 
 2 files changed, 29 insertions(+)

diff --git a/shell/shell_output.py b/shell/shell_output.py
index 31d91a0..bfb418c 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -117,6 +117,7 @@ class OutputStream(object):
   # Note that instances of this class do not persist, so it's fine to
  # close the file handle after each write.
  out_file.write(formatted_data.encode('utf-8'))  # file opened in binary mode
+  out_file.write(b'\n')
   except IOError as err:
 file_err_msg = "Error opening file %s: %s" % (self.filename, str(err))
  print('{0} (falling back to stderr)'.format(file_err_msg), file=sys.stderr)
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 666162f..30cd85d 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -105,6 +105,16 @@ def populated_table(empty_table, request):
   return fq_table_name
 
 
+@pytest.yield_fixture
+def tmp_file():
+  """
+  Test fixture which manages a temporary file
+  """
+  _, tmp_file = tempfile.mkstemp()
+  yield tmp_file
+  os.remove(tmp_file)
+
+
 class TestImpalaShell(ImpalaTestSuite):
   """A set of sanity tests for the Impala shell commandline parameters.
 
@@ -1071,3 +1081,21 @@ class TestImpalaShell(ImpalaTestSuite):
 expected_result = """anonymous\tanonymous\n"""
 assert result.stdout == expected_result
 assert result.stderr == ""
+
+  def test_output_file(self, vector, tmp_file):
+"""Test that writing output to a file using '--output_file' produces the 
same output
+as is written to stdout."""
+row_count = 6000  # Should be > 2048 to tickle IMPALA-10447.
+query = "select * from tpcds.item order by i_item_sk limit %d" % row_count
+# Run the query normally and keep the stdout.
+output = run_impala_shell_cmd(vector, ['-q', query, '-B', 
'--output_delimiter=;'])
+assert "Fetched %d row(s)" % row_count in output.stderr
+rows_from_stdout = output.stdout.strip().split('\n')
+# Run the query with output sent to a file using '--output_file'.
+result = run_impala_shell_cmd(vector, ['-q', query, '-B', 
'--output_delimiter=;',
+   '--output_file=%s' % tmp_file])
+assert "Fetched %d row(s)" % row_count in result.stderr
+# Check that the output from the file is the same as that written to 
stdout.
+with open(tmp_file, "r") as f:
+  rows_from_file = [line.rstrip() for line in f]
+  assert rows_from_stdout == rows_from_file



[impala] 02/02: IMPALA-9793: Impala quickstart cluster with docker-compose

2021-01-26 Thread tarmstrong
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit eb85c6eeca77c748a01d304625ea5d608a3e12c0
Author: Tim Armstrong 
AuthorDate: Sat May 9 21:27:41 2020 -0700

IMPALA-9793: Impala quickstart cluster with docker-compose

What works:
* A single node cluster can be started up with docker-compose
* HMS data is stored in Derby database in a docker volume
* Filesystem data is stored in a shared docker volume, using the
  localfs support in the Hadoop client.
* A Kudu cluster with a single master can be optionally added on
  to the Impala cluster.
* TPC-DS data can be loaded automatically by a data loading container.

We need to set up a docker network called quickstart-network,
purely because docker-compose insists on generating network names
with underscores. These names become part of the FQDN and cause
problems for Java's URL parsing, which rejects such technically
invalid domain names.

How to run:

Instructions for running the quickstart cluster are in
docker/README.md.

How to build containers:

  ./buildall.sh -release -noclean -notests -ninja
  ninja quickstart_hms_image quickstart_client_image docker_images

How to upload containers to dockerhub:

  IMPALA_QUICKSTART_IMAGE_PREFIX=timgarmstrong/
  for i in impalad_coord_exec impalad_coordinator statestored \
   impalad_executor catalogd impala_quickstart_client \
   impala_quickstart_hms
  do
docker tag $i ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i
docker push ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i
  done

I pushed containers built from commit f260cce22, which
was branched from 6cb7cecacf on master.

Misc other stuff:
* Added more metadata to all images.

TODO:
* Test and instructions to run against Kudu quickstart
* Upload latest version of containers before merging.

Change-Id: Ifc0b862af40a368381ada7ec2a355fe4b0aa778c
Reviewed-on: http://gerrit.cloudera.org:8080/15966
Reviewed-by: Impala Public Jenkins 
Tested-by: Impala Public Jenkins 
---
 docker/CMakeLists.txt|   36 +-
 docker/README.md |  144 ++-
 docker/docker-build.sh   |   39 +
 docker/impala_base/Dockerfile|   10 +
 docker/quickstart-kudu-minimal.yml   |  128 +++
 docker/quickstart-load-data.yml  |   38 +
 docker/quickstart.yml|  104 ++
 docker/quickstart_client/Dockerfile  |   70 ++
 docker/quickstart_client/data-load-entrypoint.sh |   86 ++
 docker/quickstart_client/load_tpcds_kudu.sql |  877 +++
 docker/quickstart_client/load_tpcds_parquet.sql  | 1248 ++
 docker/quickstart_conf/hive-site.xml |   74 ++
 docker/quickstart_hms/Dockerfile |   67 ++
 docker/quickstart_hms/hms-entrypoint.sh  |   68 ++
 14 files changed, 2985 insertions(+), 4 deletions(-)

diff --git a/docker/CMakeLists.txt b/docker/CMakeLists.txt
index 60fd8c2..7fe085b 100644
--- a/docker/CMakeLists.txt
+++ b/docker/CMakeLists.txt
@@ -19,6 +19,8 @@ set(IMPALA_BASE_BUILD_CONTEXT_DIR
   ${CMAKE_SOURCE_DIR}/docker/build_context
 )
 
+set(DOCKER_BUILD ${CMAKE_SOURCE_DIR}/docker/docker-build.sh)
+
 find_program(LSB_RELEASE_EXEC lsb_release)
 execute_process(COMMAND ${LSB_RELEASE_EXEC} -is
   OUTPUT_VARIABLE LSB_RELEASE_ID
@@ -38,6 +40,7 @@ else()
 endif()
MESSAGE(STATUS "Picked docker base image based on host OS: ${DISTRO_BASE_IMAGE}")
 
+
 if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED")
   # Add a target to build a base docker image for 'build_type'. 'build_context_args' are
   # passed to the setup_build_context.py script.
@@ -58,7 +61,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED")
   # sent to the docker daemon. This allows the Dockerfile built to copy all necessary
   # dependencies.
   COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/impala_base/ . |
-  docker build -t impala_base_${build_type}
+  ${DOCKER_BUILD} -t impala_base_${build_type}
   --build-arg BASE_IMAGE=${DISTRO_BASE_IMAGE} -
   WORKING_DIRECTORY ${IMPALA_BASE_BUILD_CONTEXT_DIR}/${build_type}
   DEPENDS impala_base_build_context_${build_type} ${CMAKE_SOURCE_DIR}/docker/impala_base/Dockerfile
@@ -88,7 +91,7 @@ if (NOT ${DISTRO_BASE_IMAGE} STREQUAL "UNSUPPORTED")
   # build context used for the base image is used for each daemon image. This allows
   # each daemon image to only copy in the dependencies it requires.
   COMMAND tar cvh . -C ${CMAKE_SOURCE_DIR}/docker/${daemon_name}/ . |
-  docker build --build-arg BASE_IMAGE=impala_base_${build_type}
+  ${DOCKER_BUILD}