[ https://issues.apache.org/jira/browse/HIVE-23956?focusedWorklogId=465736&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-465736 ]
ASF GitHub Bot logged work on HIVE-23956: ----------------------------------------- Author: ASF GitHub Bot Created on: 03/Aug/20 15:03 Start Date: 03/Aug/20 15:03 Worklog Time Spent: 10m Work Description: pvargacl commented on a change in pull request #1339: URL: https://github.com/apache/hive/pull/1339#discussion_r464472005 ########## File path: ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java ########## @@ -118,70 +126,217 @@ */ private long visibilityTxnId; + private List<DeltaFileMetaData> deltaFiles; + public DeltaMetaData() { - this(0,0,new ArrayList<Integer>(), 0); + this(0, 0, new ArrayList<>(), 0, new ArrayList<>()); } + /** + * @param minWriteId min writeId of the delta directory + * @param maxWriteId max writeId of the delta directory * @param stmtIds delta dir suffixes when a single txn writes > 1 delta in the same partition * @param visibilityTxnId maybe 0, if the dir name didn't have it. txnid:0 is always visible + * @param deltaFiles bucketFiles in the directory */ - DeltaMetaData(long minWriteId, long maxWriteId, List<Integer> stmtIds, long visibilityTxnId) { + public DeltaMetaData(long minWriteId, long maxWriteId, List<Integer> stmtIds, long visibilityTxnId, + List<DeltaFileMetaData> deltaFiles) { this.minWriteId = minWriteId; this.maxWriteId = maxWriteId; if (stmtIds == null) { throw new IllegalArgumentException("stmtIds == null"); } this.stmtIds = stmtIds; this.visibilityTxnId = visibilityTxnId; + this.deltaFiles = ObjectUtils.defaultIfNull(deltaFiles, new ArrayList<>()); } - long getMinWriteId() { + + public long getMinWriteId() { return minWriteId; } - long getMaxWriteId() { + + public long getMaxWriteId() { return maxWriteId; } - List<Integer> getStmtIds() { + + public List<Integer> getStmtIds() { return stmtIds; } - long getVisibilityTxnId() { + + public long getVisibilityTxnId() { return visibilityTxnId; } + + public List<DeltaFileMetaData> getDeltaFiles() { + return deltaFiles; + } + + public List<DeltaFileMetaData> 
getDeltaFilesForStmtId(final Integer stmtId) { + if (stmtIds.size() <= 1 || stmtId == null) { + // If it is not a multistatement delta, we do not store the stmtId in the file list + return deltaFiles; + } else { + return deltaFiles.stream().filter(df -> stmtId.equals(df.getStmtId())).collect(Collectors.toList()); + } + } + @Override public void write(DataOutput out) throws IOException { out.writeLong(minWriteId); out.writeLong(maxWriteId); out.writeInt(stmtIds.size()); - for(Integer id : stmtIds) { + for (Integer id : stmtIds) { out.writeInt(id); } out.writeLong(visibilityTxnId); + out.writeInt(deltaFiles.size()); + for (DeltaFileMetaData fileMeta : deltaFiles) { + fileMeta.write(out); + } } + @Override public void readFields(DataInput in) throws IOException { minWriteId = in.readLong(); maxWriteId = in.readLong(); stmtIds.clear(); int numStatements = in.readInt(); - for(int i = 0; i < numStatements; i++) { + for (int i = 0; i < numStatements; i++) { stmtIds.add(in.readInt()); } visibilityTxnId = in.readLong(); + + deltaFiles.clear(); + int numFiles = in.readInt(); + for (int i = 0; i < numFiles; i++) { + DeltaFileMetaData file = new DeltaFileMetaData(); + file.readFields(in); + deltaFiles.add(file); + } } - String getName() { + + private String getName() { assert stmtIds.isEmpty() : "use getName(int)"; - return AcidUtils.addVisibilitySuffix(AcidUtils - .deleteDeltaSubdir(minWriteId, maxWriteId), visibilityTxnId); + return AcidUtils.addVisibilitySuffix(AcidUtils.deleteDeltaSubdir(minWriteId, maxWriteId), visibilityTxnId); } - String getName(int stmtId) { + + private String getName(int stmtId) { assert !stmtIds.isEmpty() : "use getName()"; return AcidUtils.addVisibilitySuffix(AcidUtils .deleteDeltaSubdir(minWriteId, maxWriteId, stmtId), visibilityTxnId); } + + public List<Pair<Path, Integer>> getPaths(Path root) { Review comment: I think the List is much more straightforward; it will keep the stmtId order. 
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 465736) Time Spent: 4h (was: 3h 50m) > Delete delta directory file information should be pushed to execution side > -------------------------------------------------------------------------- > > Key: HIVE-23956 > URL: https://issues.apache.org/jira/browse/HIVE-23956 > Project: Hive > Issue Type: Improvement > Reporter: Peter Varga > Assignee: Peter Varga > Priority: Major > Labels: pull-request-available > Time Spent: 4h > Remaining Estimate: 0h > > Since HIVE-23840, the LLAP cache is used to retrieve the tail of the ORC bucket > files in the delete deltas, but to use the cache the fileId must be > determined, so one more FileSystem call is issued for each bucket. > This fileId is already available during compilation in the AcidState > calculation; we should serialise it to the OrcSplit and remove the > unnecessary FS calls. > Furthermore, instead of sending the SyntheticFileId directly, we should pass > the attemptId instead of the standard path hash; this way the path and the > SyntheticFileId can be calculated, and it will work even if move-free > delete operations are introduced. -- This message was sent by Atlassian Jira (v8.3.4#803005)