[hive] branch master updated: HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)

2022-06-13 Thread kgyrtkirk
This is an automated email from the ASF dual-hosted git repository.

kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
 new c4f6eb2f914 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
c4f6eb2f914 is described below

commit c4f6eb2f91478152c89070a7455df9a1b8980c75
Author: okumin 
AuthorDate: Tue Jun 14 00:26:36 2022 +0900

HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
---
 .../udf/generic/GenericUDAFMkCollectionEvaluator.java  | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
index b05023dd37a..cffc7f76510 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
@@ -95,11 +95,27 @@ public class GenericUDAFMkCollectionEvaluator extends GenericUDAFEvaluator
         throw new RuntimeException("Buffer type unknown");
       }
     }
+
+    private void reset() {
+      if (bufferType == BufferType.LIST) {
+        container.clear();
+      } else if (bufferType == BufferType.SET) {
+        // Don't reuse a container because HashSet#clear can be very slow. The operation takes O(N)
+        // and N is the capacity of the internal hash table. The internal capacity grows based on
+        // the number of elements and it never shrinks. Thus, HashSet#clear takes O(N) every time
+        // once a skewed key appears.
+        // In order to avoid too much resizing in average cases, we set the initial capacity to the
+        // number of elements of the previous aggregation.
+        container = new LinkedHashSet<>(container.size());
+      } else {
+        throw new RuntimeException("Buffer type unknown");
+      }
+    }
   }
 
   @Override
   public void reset(AggregationBuffer agg) throws HiveException {
-    ((MkArrayAggregationBuffer) agg).container.clear();
+    ((MkArrayAggregationBuffer) agg).reset();
   }
 
   @Override
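
Editorial note, not part of the commit: the fix avoids LinkedHashSet#clear because clearing walks the set's entire internal table, whose capacity never shrinks, so after one huge (skewed) group every later reset would still cost O(max capacity). A minimal standalone sketch of the pattern, with hypothetical names rather than the actual evaluator internals:

    import java.util.LinkedHashSet;
    import java.util.Set;

    // Sketch of the buffer-reset pattern adopted above: re-allocating a
    // set sized to the previous group's cardinality keeps reset cheap,
    // and the sizing hint limits rehashing when groups are similar.
    final class CollectSetBuffer {
      private Set<Object> container = new LinkedHashSet<>();

      void add(Object value) {
        container.add(value);
      }

      void reset() {
        // sized by the previous group's element count; no O(capacity) clear
        container = new LinkedHashSet<>(container.size());
      }
    }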



[hive] branch master updated: HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354)

2022-06-13 Thread pvary
This is an automated email from the ASF dual-hosted git repository.

pvary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
 new 76d4abe402a HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354)
76d4abe402a is described below

commit 76d4abe402abdebb6534e9db3f4209cce8b0d4e6
Author: pvary 
AuthorDate: Mon Jun 13 17:20:53 2022 +0200

HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354)
---
 .../mr/hive/vector/HiveVectorizedReader.java   |  14 +-
 .../iceberg/mr/mapreduce/IcebergInputFormat.java   | 148 ++---
 .../apache/iceberg/orc/VectorizedReadUtils.java|  14 +-
 3 files changed, 77 insertions(+), 99 deletions(-)

diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java
index 19fa0f06506..00b9b3c73f0 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java
@@ -51,7 +51,6 @@ import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.CloseableIterator;
-import org.apache.iceberg.io.InputFile;
 import org.apache.iceberg.mr.mapred.MapredIcebergInputFormat;
 import org.apache.iceberg.orc.VectorizedReadUtils;
 import org.apache.iceberg.parquet.ParquetSchemaUtil;
@@ -72,11 +71,10 @@ public class HiveVectorizedReader {
 
   }
 
-  public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant,
+  public static <D> CloseableIterable<D> reader(Path path, FileScanTask task, Map<Integer, ?> idToConstant,
       TaskAttemptContext context) {
     // Tweaks on jobConf here are relevant for this task only, so we need to copy it first as context's conf is reused..
-    JobConf job = new JobConf((JobConf) context.getConfiguration());
-    Path path = new Path(inputFile.location());
+    JobConf job = new JobConf(context.getConfiguration());
     FileFormat format = task.file().format();
     Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
 
@@ -131,7 +129,7 @@ public class HiveVectorizedReader {
 
       switch (format) {
         case ORC:
-          recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
+          recordReader = orcRecordReader(job, reporter, task, path, start, length, readColumnIds, fileId);
           break;
 
         case PARQUET:
@@ -144,12 +142,12 @@ public class HiveVectorizedReader {
       return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
 
     } catch (IOException ioe) {
-      throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
+      throw new RuntimeException("Error creating vectorized record reader for " + path, ioe);
     }
   }
 
   private static RecordReader<NullWritable, VectorizedRowBatch> orcRecordReader(JobConf job, Reporter reporter,
-      FileScanTask task, InputFile inputFile, Path path, long start, long length, List<Integer> readColumnIds,
+      FileScanTask task, Path path, long start, long length, List<Integer> readColumnIds,
       SyntheticFileId fileId) throws IOException {
     RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
 
@@ -159,7 +157,7 @@ public class HiveVectorizedReader {
 
     // Metadata information has to be passed along in the OrcSplit. Without specifying this, the vectorized
     // reader will assume that the ORC file ends at the task's start + length, and might fail reading the tail..
-    ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(inputFile, fileId, job);
+    ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(path, fileId, job);
     OrcTail orcTail = VectorizedReadUtils.deserializeToOrcTail(serializedOrcTail);
 
     VectorizedReadUtils.handleIcebergProjection(task, job,
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index 7bbd03a09ed..7617c6b17e9 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -31,6 +31,7 @@ import java.util.function.BiFunction;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import 
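
Editorial note on the change above, not part of the commit: the refactor stops materializing an Iceberg InputFile (which can trigger a per-file FileSystem initialization) before the vectorized reader actually needs the bytes, and hands the Hadoop Path around instead. A rough, self-contained sketch of that idea; the helper names are hypothetical:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.iceberg.FileScanTask;

    final class PathFirstSetup {
      // Derive the Path straight from the scan task's file location, a
      // cheap string operation; no FileSystem is initialized here.
      static Path pathFor(FileScanTask task) {
        return new Path(task.file().path().toString());
      }

      // The context's Configuration is reused across tasks, so copy it
      // before applying per-task tweaks (mirrors the comment in the diff).
      static JobConf jobConfFor(TaskAttemptContext context) {
        return new JobConf(context.getConfiguration());
      }
    }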

[hive] branch master updated: HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich)

2022-06-13 Thread kgyrtkirk
This is an automated email from the ASF dual-hosted git repository.

kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
 new 0099b14aa6a HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich)
0099b14aa6a is described below

commit 0099b14aa6a50d4470b057e93a95a7391b74add7
Author: Josh Soref <2119212+jso...@users.noreply.github.com>
AuthorDate: Mon Jun 13 11:05:41 2022 -0400

HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich)
---
 .github/actions/spelling/README.md|  17 ++
 .github/actions/spelling/advice.md|  25 ++
 .github/actions/spelling/allow.txt|   0
 .github/actions/spelling/excludes.txt |  57 +
 .github/actions/spelling/expect.txt   | 449 ++
 .github/actions/spelling/only.txt |   1 +
 .github/actions/spelling/patterns.txt |  38 +++
 .github/actions/spelling/reject.txt   |   7 +
 .github/workflows/spelling.yml|  69 ++
 9 files changed, 663 insertions(+)

diff --git a/.github/actions/spelling/README.md b/.github/actions/spelling/README.md
new file mode 100644
index 000..749294b33fb
--- /dev/null
+++ b/.github/actions/spelling/README.md
@@ -0,0 +1,17 @@
+# check-spelling/check-spelling configuration
+
+File | Purpose | Format | Info
+-|-|-|-
+[allow.txt](allow.txt) | Add words to the dictionary | one word per line (only letters and `'`s allowed) | [allow](https://github.com/check-spelling/check-spelling/wiki/Configuration#allow)
+[reject.txt](reject.txt) | Remove words from the dictionary (after allow) | grep pattern matching whole dictionary words | [reject](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-reject)
+[excludes.txt](excludes.txt) | Files to ignore entirely | perl regular expression | [excludes](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-excludes)
+[only.txt](only.txt) | Only check matching files (applied after excludes) | perl regular expression | [only](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-only)
+[patterns.txt](patterns.txt) | Patterns to ignore from checked lines | perl regular expression (order matters, first match wins) | [patterns](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-patterns)
+[expect.txt](expect.txt) | Expected words that aren't in the dictionary | one word per line (sorted, alphabetically) | [expect](https://github.com/check-spelling/check-spelling/wiki/Configuration#expect)
+[advice.md](advice.md) | Supplement for GitHub comment when unrecognized words are found | GitHub Markdown | [advice](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-advice)
+
+Note: you can replace any of these files with a directory by the same name (minus the suffix)
+and then include multiple files inside that directory (with that suffix) to merge multiple files together.
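
Editorial aside, not part of the commit: as a purely hypothetical patterns.txt entry, a line such as

    \b[0-9a-f]{7,40}\b

would keep the checker from flagging long lowercase hex runs (commit SHAs, digests) inside otherwise-checked lines, since per the table above each entry is a Perl 5 regular expression applied per line, with earlier entries taking precedence.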
diff --git a/.github/actions/spelling/advice.md b/.github/actions/spelling/advice.md
new file mode 100644
index 000..c83423a8ef6
--- /dev/null
+++ b/.github/actions/spelling/advice.md
@@ -0,0 +1,25 @@
+ 
+If the flagged items do not appear to be text
+
+If items relate to a ...
+* well-formed pattern.
+
+  If you can write a [pattern](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-patterns) that would match it,
+  try adding it to the `patterns.txt` file.
+
+  Patterns are Perl 5 Regular Expressions - you can [test](
+https://www.regexplanet.com/advanced/perl/) yours before committing to verify it will match your lines.
+
+  Note that patterns can't match multiline strings.
+
+* binary file.
+
+  Please add a file path to the `excludes.txt` file matching the containing file.
+
+  File paths are Perl 5 Regular Expressions - you can [test](
+https://www.regexplanet.com/advanced/perl/) yours before committing to verify it will match your files.
+
+  `^` refers to the file's path from the root of the repository, so `^README\.md$` would exclude [README.md](
+../tree/HEAD/README.md) (on whichever branch you're using).
+
+
diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
new file mode 100644
index 000..e69de29bb2d
diff --git a/.github/actions/spelling/excludes.txt b/.github/actions/spelling/excludes.txt
new file mode 100644
index 000..f15a0e0c9ca
--- /dev/null
+++ b/.github/actions/spelling/excludes.txt
@@ -0,0 +1,57 @@
+# See https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-excludes
+(?:^|/)(?i)COPYRIGHT
+(?:^|/)(?i)LICEN[CS]E
+(?:^|/)package(?:-lock|)\.json$
+(?:^|/)vendor/
+ignore$
+LICENSE
+\.avi$
+\.avro$
+\.bz2$
+\.deflate$
+\.eot$
+\.gif$
+\.gz$
+\.ico$
+\.jar$
+\.jceks$
+\.jks$
+\.jpe?g$
+\.jpeg$
+\.jpg$
+\.keep$

[hive] branch master updated: HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage)

2022-06-13 Thread dkuzmenko
This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
 new 83b4a0887cc HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage)
83b4a0887cc is described below

commit 83b4a0887cc1c081af64f353d3e66d3d977c861d
Author: Denys Kuzmenko 
AuthorDate: Mon Jun 13 09:14:09 2022 +0200

HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage)

Closes #3235
---
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java| 60 ++
 .../hadoop/hive/ql/lockmgr/TestDbTxnManager2.java  | 54 +++
 2 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 45684df17e6..5ade4dabb80 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -2846,36 +2846,32 @@ public class AcidUtils {
   }
 
   private static boolean needsLock(Entity entity, boolean isExternalEnabled) {
-    switch (entity.getType()) {
-    case TABLE:
-      return isLockableTable(entity.getTable(), isExternalEnabled);
-    case PARTITION:
-      return isLockableTable(entity.getPartition().getTable(), isExternalEnabled);
-    default:
-      return true;
-    }
+    return needsLock(entity, isExternalEnabled, false);
   }
 
-  private static Table getTable(WriteEntity we) {
-    Table t = we.getTable();
-    if (t == null) {
-      throw new IllegalStateException("No table info for " + we);
+  private static boolean needsLock(Entity entity, boolean isExternalEnabled, boolean isLocklessReads) {
+    switch (entity.getType()) {
+      case TABLE:
+        return isLockableTable(entity.getTable(), isExternalEnabled, isLocklessReads);
+      case PARTITION:
+        return isLockableTable(entity.getPartition().getTable(), isExternalEnabled, isLocklessReads);
+      default:
+        return true;
     }
-    return t;
   }
 
-  private static boolean isLockableTable(Table t, boolean isExternalEnabled) {
+  private static boolean isLockableTable(Table t, boolean isExternalEnabled, boolean isLocklessReads) {
     if (t.isTemporary()) {
       return false;
     }
     switch (t.getTableType()) {
-    case MANAGED_TABLE:
-    case MATERIALIZED_VIEW:
-      return true;
-    case EXTERNAL_TABLE:
-      return isExternalEnabled;
-    default:
-      return false;
+      case MANAGED_TABLE:
+      case MATERIALIZED_VIEW:
+        return !(isLocklessReads && isTransactionalTable(t));
+      case EXTERNAL_TABLE:
+        return isExternalEnabled;
+      default:
+        return false;
     }
   }
 
@@ -2890,8 +2886,10 @@ public class AcidUtils {
       Context.Operation operation, HiveConf conf) {
 
     List<LockComponent> lockComponents = new ArrayList<>();
+    boolean isLocklessReadsEnabled = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_ACID_LOCKLESS_READS_ENABLED);
     boolean skipReadLock = !conf.getBoolVar(ConfVars.HIVE_TXN_READ_LOCKS);
     boolean skipNonAcidReadLock = !conf.getBoolVar(ConfVars.HIVE_TXN_NONACID_READ_LOCKS);
+
     boolean sharedWrite = !conf.getBoolVar(HiveConf.ConfVars.TXN_WRITE_X_LOCK);
     boolean isExternalEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_TXN_EXT_LOCKING_ENABLED);
     boolean isMerge = operation == Context.Operation.MERGE;
@@ -2902,7 +2900,7 @@ public class AcidUtils {
       .filter(input -> !input.isDummy()
           && input.needsLock()
           && !input.isUpdateOrDelete()
-          && AcidUtils.needsLock(input, isExternalEnabled)
+          && AcidUtils.needsLock(input, isExternalEnabled, isLocklessReadsEnabled)
          && !skipReadLock)
       .collect(Collectors.toList());
@@ -2961,9 +2959,8 @@ public class AcidUtils {
     // overwrite) then we need a shared.  If it's update or delete then we
     // need a SHARED_WRITE.
     for (WriteEntity output : outputs) {
-      LOG.debug("output is null " + (output == null));
-      if (output.getType() == Entity.Type.DFS_DIR || output.getType() == Entity.Type.LOCAL_DIR || !AcidUtils
-          .needsLock(output, isExternalEnabled)) {
+      if (output.getType() == Entity.Type.DFS_DIR || output.getType() == Entity.Type.LOCAL_DIR
+          || !AcidUtils.needsLock(output, isExternalEnabled)) {
         // We don't lock files or directories. We also skip locking temp tables.
         continue;
       }
@@ -3015,7 +3012,8 @@ public class AcidUtils {
       case INSERT_OVERWRITE:
         assert t != null;
         if (AcidUtils.isTransactionalTable(t)) {
-          if (conf.getBoolVar(HiveConf.ConfVars.TXN_OVERWRITE_X_LOCK) && !sharedWrite) {
+          if (conf.getBoolVar(HiveConf.ConfVars.TXN_OVERWRITE_X_LOCK) && !sharedWrite
+              && !isLocklessReadsEnabled) {
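
Editorial note, not from the commit: per the isLockableTable change above, with HIVE_ACID_LOCKLESS_READS_ENABLED on, reads of transactional managed tables and materialized views no longer produce SHARED_READ lock components; full ACID readers already work against a snapshot of valid write IDs. A simplified, self-contained sketch of the decision, where TableKind stands in for Hive's TableType and this is not the actual AcidUtils API:

    // Simplified model of the post-patch read-lock decision.
    enum TableKind { MANAGED, MATERIALIZED_VIEW, EXTERNAL, OTHER }

    final class LockDecision {
      static boolean needsReadLock(TableKind kind, boolean transactional,
          boolean externalLockingEnabled, boolean locklessReadsEnabled) {
        switch (kind) {
          case MANAGED:
          case MATERIALIZED_VIEW:
            // new behavior: fully ACID tables skip READ locks when
            // lockless reads are enabled
            return !(locklessReadsEnabled && transactional);
          case EXTERNAL:
            // external tables keep following the existing config switch
            return externalLockingEnabled;
          default:
            return false;
        }
      }
    }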