[ https://issues.apache.org/jira/browse/HIVE-22977?focusedWorklogId=837804&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-837804 ]
ASF GitHub Bot logged work on HIVE-22977: ----------------------------------------- Author: ASF GitHub Bot Created on: 09/Jan/23 05:16 Start Date: 09/Jan/23 05:16 Worklog Time Spent: 10m Work Description: SourabhBadhya commented on code in PR #3801: URL: https://github.com/apache/hive/pull/3801#discussion_r1064290065 ########## ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/OrcFileMerger.java: ########## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.txn.compactor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.io.orc.CompressionKind; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.RecordReader; +import org.apache.hadoop.hive.ql.io.orc.Writer; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; + +/** + * Class to support fast merging of ORC files. 
+ */ +public class OrcFileMerger { + + private final Configuration conf; + private static final Logger LOG = LoggerFactory.getLogger(OrcFileMerger.class); + + public OrcFileMerger(Configuration conf) { + this.conf = conf; + } + + /** + * Merge orc files into a single file + * @param files list of orc file paths to be merged + * @param outPath the path of output orc file + * @throws IOException error happened during file operations + */ + public void mergeFiles(List<Reader> readers, Path outPath) throws IOException { + Writer writer = null; + try { + for (Reader reader : readers) { + if (writer == null) { + writer = setupWriter(reader, outPath); + } + VectorizedRowBatch batch = reader.getSchema().createRowBatchV2(); + RecordReader rows = reader.rows(); + while (rows.nextBatch(batch)) { + if (batch != null) { + writer.addRowBatch(batch); + } + } + rows.close(); + } + } finally { + if (writer != null) { + writer.close(); + } + } Review Comment: If we create a writer irrespective of number of readers in try with resources block then a writer will always be created and this may create an empty file (if readers are not present). ########## ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/OrcFileMerger.java: ########## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.txn.compactor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.io.orc.CompressionKind; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.RecordReader; +import org.apache.hadoop.hive.ql.io.orc.Writer; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; + +/** + * Class to support fast merging of ORC files. + */ +public class OrcFileMerger { + + private final Configuration conf; + private static final Logger LOG = LoggerFactory.getLogger(OrcFileMerger.class); + + public OrcFileMerger(Configuration conf) { + this.conf = conf; + } + + /** + * Merge orc files into a single file + * @param files list of orc file paths to be merged + * @param outPath the path of output orc file + * @throws IOException error happened during file operations + */ + public void mergeFiles(List<Reader> readers, Path outPath) throws IOException { + Writer writer = null; + try { + for (Reader reader : readers) { + if (writer == null) { + writer = setupWriter(reader, outPath); + } + VectorizedRowBatch batch = reader.getSchema().createRowBatchV2(); + RecordReader rows = reader.rows(); + while (rows.nextBatch(batch)) { + if (batch != null) { + writer.addRowBatch(batch); + } + } + rows.close(); + } + } finally { + if (writer != null) { + writer.close(); + } + } Review Comment: If we create a writer irrespective of number of readers in try with resources block then a writer will always be created and this may create an empty file (if readers are not present). 
Issue Time Tracking ------------------- Worklog Id: (was: 837804) Time Spent: 3h (was: 2h 50m) > Merge delta files instead of running a query in major/minor compaction > ---------------------------------------------------------------------- > > Key: HIVE-22977 > URL: https://issues.apache.org/jira/browse/HIVE-22977 > Project: Hive > Issue Type: Improvement > Reporter: László Pintér > Assignee: Sourabh Badhya > Priority: Major > Labels: pull-request-available > Attachments: HIVE-22977.01.patch, HIVE-22977.02.patch > > Time Spent: 3h > Remaining Estimate: 0h > > [Compaction Optimiziation] > We should analyse the possibility to move a delta file instead of running a > major/minor compaction query. > Please consider the following use cases: > - full acid table but only insert queries were run. This means that no > delete delta directories were created. Is it possible to merge the delta > directory contents without running a compaction query? > - full acid table, initiating queries through the streaming API. If there > are no abort transactions during the streaming, is it possible to merge the > delta directory contents without running a compaction query? -- This message was sent by Atlassian Jira (v8.20.10#820010)