[hive] branch master updated: HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git The following commit(s) were added to refs/heads/master by this push: new c4f6eb2f914 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich) c4f6eb2f914 is described below commit c4f6eb2f91478152c89070a7455df9a1b8980c75 Author: okumin AuthorDate: Tue Jun 14 00:26:36 2022 +0900 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich) --- .../udf/generic/GenericUDAFMkCollectionEvaluator.java | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java index b05023dd37a..cffc7f76510 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java @@ -95,11 +95,27 @@ public class GenericUDAFMkCollectionEvaluator extends GenericUDAFEvaluator throw new RuntimeException("Buffer type unknown"); } } + +private void reset() { + if (bufferType == BufferType.LIST) { +container.clear(); + } else if (bufferType == BufferType.SET) { +// Don't reuse a container because HashSet#clear can be very slow. The operation takes O(N) +// and N is the capacity of the internal hash table. The internal capacity grows based on +// the number of elements and it never shrinks. Thus, HashSet#clear takes O(N) every time +// once a skewed key appears. +// In order to avoid too many resizing in average cases, we set the initial capacity to the +// number of elements of the previous aggregation. +container = new LinkedHashSet<>(container.size()); + } else { +throw new RuntimeException("Buffer type unknown"); + } +} } @Override public void reset(AggregationBuffer agg) throws HiveException { -((MkArrayAggregationBuffer) agg).container.clear(); +((MkArrayAggregationBuffer) agg).reset(); } @Override
[hive] branch master updated: HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354)
This is an automated email from the ASF dual-hosted git repository. pvary pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git The following commit(s) were added to refs/heads/master by this push: new 76d4abe402a HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354) 76d4abe402a is described below commit 76d4abe402abdebb6534e9db3f4209cce8b0d4e6 Author: pvary AuthorDate: Mon Jun 13 17:20:53 2022 +0200 HIVE-26307: Avoid FS init in FileIO::newInputFile in vectorized Iceberg reads (Peter Vary reviewed by Adam Szita) (#3354) --- .../mr/hive/vector/HiveVectorizedReader.java | 14 +- .../iceberg/mr/mapreduce/IcebergInputFormat.java | 148 ++--- .../apache/iceberg/orc/VectorizedReadUtils.java| 14 +- 3 files changed, 77 insertions(+), 99 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java index 19fa0f06506..00b9b3c73f0 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java @@ -51,7 +51,6 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mr.mapred.MapredIcebergInputFormat; import org.apache.iceberg.orc.VectorizedReadUtils; import org.apache.iceberg.parquet.ParquetSchemaUtil; @@ -72,11 +71,10 @@ public class HiveVectorizedReader { } - public static CloseableIterable reader(InputFile inputFile, FileScanTask task, Map idToConstant, + public static CloseableIterable reader(Path path, FileScanTask task, Map idToConstant, TaskAttemptContext context) { // Tweaks on jobConf here are relevant for this task only, so we need to copy it first as context's conf is reused.. -JobConf job = new JobConf((JobConf) context.getConfiguration()); -Path path = new Path(inputFile.location()); +JobConf job = new JobConf(context.getConfiguration()); FileFormat format = task.file().format(); Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter(); @@ -131,7 +129,7 @@ public class HiveVectorizedReader { switch (format) { case ORC: - recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId); + recordReader = orcRecordReader(job, reporter, task, path, start, length, readColumnIds, fileId); break; case PARQUET: @@ -144,12 +142,12 @@ public class HiveVectorizedReader { return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues); } catch (IOException ioe) { - throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe); + throw new RuntimeException("Error creating vectorized record reader for " + path, ioe); } } private static RecordReader orcRecordReader(JobConf job, Reporter reporter, - FileScanTask task, InputFile inputFile, Path path, long start, long length, List readColumnIds, + FileScanTask task, Path path, long start, long length, List readColumnIds, SyntheticFileId fileId) throws IOException { RecordReader recordReader = null; @@ -159,7 +157,7 @@ public class HiveVectorizedReader { // Metadata information has to be passed along in the OrcSplit. Without specifying this, the vectorized // reader will assume that the ORC file ends at the task's start + length, and might fail reading the tail.. -ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(inputFile, fileId, job); +ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(path, fileId, job); OrcTail orcTail = VectorizedReadUtils.deserializeToOrcTail(serializedOrcTail); VectorizedReadUtils.handleIcebergProjection(task, job, diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java index 7bbd03a09ed..7617c6b17e9 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java @@ -31,6 +31,7 @@ import java.util.function.BiFunction; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import
[hive] branch master updated: HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich)
This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git The following commit(s) were added to refs/heads/master by this push: new 0099b14aa6a HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich) 0099b14aa6a is described below commit 0099b14aa6a50d4470b057e93a95a7391b74add7 Author: Josh Soref <2119212+jso...@users.noreply.github.com> AuthorDate: Mon Jun 13 11:05:41 2022 -0400 HIVE-25733: Add check-spelling/check-spelling (#2809) (Josh Soref reviewed by Zoltan Haindrich) --- .github/actions/spelling/README.md| 17 ++ .github/actions/spelling/advice.md| 25 ++ .github/actions/spelling/allow.txt| 0 .github/actions/spelling/excludes.txt | 57 + .github/actions/spelling/expect.txt | 449 ++ .github/actions/spelling/only.txt | 1 + .github/actions/spelling/patterns.txt | 38 +++ .github/actions/spelling/reject.txt | 7 + .github/workflows/spelling.yml| 69 ++ 9 files changed, 663 insertions(+) diff --git a/.github/actions/spelling/README.md b/.github/actions/spelling/README.md new file mode 100644 index 000..749294b33fb --- /dev/null +++ b/.github/actions/spelling/README.md @@ -0,0 +1,17 @@ +# check-spelling/check-spelling configuration + +File | Purpose | Format | Info +-|-|-|- + +[allow.txt](allow.txt) | Add words to the dictionary | one word per line (only letters and `'`s allowed) | [allow](https://github.com/check-spelling/check-spelling/wiki/Configuration#allow) +[reject.txt](reject.txt) | Remove words from the dictionary (after allow) | grep pattern matching whole dictionary words | [reject](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-reject) +[excludes.txt](excludes.txt) | Files to ignore entirely | perl regular expression | [excludes](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-excludes) +[only.txt](only.txt) | Only check matching files (applied after excludes) | perl regular expression | [only](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-only) +[patterns.txt](patterns.txt) | Patterns to ignore from checked lines | perl regular expression (order matters, first match wins) | [patterns](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-patterns) +[expect.txt](expect.txt) | Expected words that aren't in the dictionary | one word per line (sorted, alphabetically) | [expect](https://github.com/check-spelling/check-spelling/wiki/Configuration#expect) +[advice.md](advice.md) | Supplement for GitHub comment when unrecognized words are found | GitHub Markdown | [advice](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples%3A-advice) + +Note: you can replace any of these files with a directory by the same name (minus the suffix) +and then include multiple files inside that directory (with that suffix) to merge multiple files together. diff --git a/.github/actions/spelling/advice.md b/.github/actions/spelling/advice.md new file mode 100644 index 000..c83423a8ef6 --- /dev/null +++ b/.github/actions/spelling/advice.md @@ -0,0 +1,25 @@ + +If the flagged items do not appear to be text + +If items relate to a ... +* well-formed pattern. + + If you can write a [pattern](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-patterns) that would match it, + try adding it to the `patterns.txt` file. + + Patterns are Perl 5 Regular Expressions - you can [test]( +https://www.regexplanet.com/advanced/perl/) yours before committing to verify it will match your lines. + + Note that patterns can't match multiline strings. + +* binary file. + + Please add a file path to the `excludes.txt` file matching the containing file. + + File paths are Perl 5 Regular Expressions - you can [test]( +https://www.regexplanet.com/advanced/perl/) yours before committing to verify it will match your files. + + `^` refers to the file's path from the root of the repository, so `^README\.md$` would exclude [README.md]( +../tree/HEAD/README.md) (on whichever branch you're using). + + diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt new file mode 100644 index 000..e69de29bb2d diff --git a/.github/actions/spelling/excludes.txt b/.github/actions/spelling/excludes.txt new file mode 100644 index 000..f15a0e0c9ca --- /dev/null +++ b/.github/actions/spelling/excludes.txt @@ -0,0 +1,57 @@ +# See https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-excludes +(?:^|/)(?i)COPYRIGHT +(?:^|/)(?i)LICEN[CS]E +(?:^|/)package(?:-lock|)\.json$ +(?:^|/)vendor/ +ignore$ +LICENSE +\.avi$ +\.avro$ +\.bz2$ +\.deflate$ +\.eot$ +\.gif$ +\.gz$ +\.ico$ +\.jar$ +\.jceks$ +\.jks$ +\.jpe?g$ +\.jpeg$ +\.jpg$ +\.keep$
[hive] branch master updated: HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage)
This is an automated email from the ASF dual-hosted git repository. dkuzmenko pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git The following commit(s) were added to refs/heads/master by this push: new 83b4a0887cc HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage) 83b4a0887cc is described below commit 83b4a0887cc1c081af64f353d3e66d3d977c861d Author: Denys Kuzmenko AuthorDate: Mon Jun 13 09:14:09 2022 +0200 HIVE-26165: Remove READ locks for ACID tables (Denys Kuzmenko, reviewed by Karen Coppage) Closes #3235 --- .../org/apache/hadoop/hive/ql/io/AcidUtils.java| 60 ++ .../hadoop/hive/ql/lockmgr/TestDbTxnManager2.java | 54 +++ 2 files changed, 60 insertions(+), 54 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 45684df17e6..5ade4dabb80 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -2846,36 +2846,32 @@ public class AcidUtils { } private static boolean needsLock(Entity entity, boolean isExternalEnabled) { -switch (entity.getType()) { -case TABLE: - return isLockableTable(entity.getTable(), isExternalEnabled); -case PARTITION: - return isLockableTable(entity.getPartition().getTable(), isExternalEnabled); -default: - return true; -} +return needsLock(entity, isExternalEnabled, false); } - private static Table getTable(WriteEntity we) { -Table t = we.getTable(); -if (t == null) { - throw new IllegalStateException("No table info for " + we); + private static boolean needsLock(Entity entity, boolean isExternalEnabled, boolean isLocklessReads) { +switch (entity.getType()) { + case TABLE: +return isLockableTable(entity.getTable(), isExternalEnabled, isLocklessReads); + case PARTITION: +return isLockableTable(entity.getPartition().getTable(), isExternalEnabled, isLocklessReads); + default: +return true; } -return t; } - private static boolean isLockableTable(Table t, boolean isExternalEnabled) { + private static boolean isLockableTable(Table t, boolean isExternalEnabled, boolean isLocklessReads) { if (t.isTemporary()) { return false; } switch (t.getTableType()) { -case MANAGED_TABLE: -case MATERIALIZED_VIEW: - return true; -case EXTERNAL_TABLE: - return isExternalEnabled; -default: - return false; + case MANAGED_TABLE: + case MATERIALIZED_VIEW: +return !(isLocklessReads && isTransactionalTable(t)); + case EXTERNAL_TABLE: +return isExternalEnabled; + default: +return false; } } @@ -2890,8 +2886,10 @@ public class AcidUtils { Context.Operation operation, HiveConf conf) { List lockComponents = new ArrayList<>(); +boolean isLocklessReadsEnabled = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_ACID_LOCKLESS_READS_ENABLED); boolean skipReadLock = !conf.getBoolVar(ConfVars.HIVE_TXN_READ_LOCKS); boolean skipNonAcidReadLock = !conf.getBoolVar(ConfVars.HIVE_TXN_NONACID_READ_LOCKS); + boolean sharedWrite = !conf.getBoolVar(HiveConf.ConfVars.TXN_WRITE_X_LOCK); boolean isExternalEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_TXN_EXT_LOCKING_ENABLED); boolean isMerge = operation == Context.Operation.MERGE; @@ -2902,7 +2900,7 @@ public class AcidUtils { .filter(input -> !input.isDummy() && input.needsLock() && !input.isUpdateOrDelete() -&& AcidUtils.needsLock(input, isExternalEnabled) +&& AcidUtils.needsLock(input, isExternalEnabled, isLocklessReadsEnabled) && !skipReadLock) .collect(Collectors.toList()); @@ -2961,9 +2959,8 @@ public class AcidUtils { // overwrite) than we need a shared. If it's update or delete then we // need a SHARED_WRITE. for (WriteEntity output : outputs) { - LOG.debug("output is null " + (output == null)); - if (output.getType() == Entity.Type.DFS_DIR || output.getType() == Entity.Type.LOCAL_DIR || !AcidUtils - .needsLock(output, isExternalEnabled)) { + if (output.getType() == Entity.Type.DFS_DIR || output.getType() == Entity.Type.LOCAL_DIR + || !AcidUtils.needsLock(output, isExternalEnabled)) { // We don't lock files or directories. We also skip locking temp tables. continue; } @@ -3015,7 +3012,8 @@ public class AcidUtils { case INSERT_OVERWRITE: assert t != null; if (AcidUtils.isTransactionalTable(t)) { - if (conf.getBoolVar(HiveConf.ConfVars.TXN_OVERWRITE_X_LOCK) && !sharedWrite) { + if (conf.getBoolVar(HiveConf.ConfVars.TXN_OVERWRITE_X_LOCK) && !sharedWrite + && !isLocklessReadsEnabled) {