SourabhBadhya commented on code in PR #4520:
URL: https://github.com/apache/hive/pull/4520#discussion_r1305572798
##########
ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java:
##########
@@ -419,17 +471,90 @@ private AverageSize getAverageSize(FileSystem inpFs, Path dirPath) {
*/
private long getMergeSize(FileSystem inpFs, Path dirPath, long avgSize) {
AverageSize averageSize = getAverageSize(inpFs, dirPath);
- if (averageSize.getTotalSize() < 0) {
+ return getMergeSize(averageSize, avgSize);
+ }
+
+ private List<FileStatus> getManifestFilePaths(HiveConf conf, Path dirPath) throws IOException {
+ FileSystem manifestFs = dirPath.getFileSystem(conf);
+ List<String> filesKept;
+ List<FileStatus> pathsKept = new ArrayList<>();
+ try (FSDataInputStream inStream = manifestFs.open(new Path(dirPath, Utilities.BLOB_MANIFEST_FILE))) {
+ String paths = IOUtils.toString(inStream, Charset.defaultCharset());
+ filesKept = new ArrayList(Arrays.asList(paths.split(System.lineSeparator())));
+ }
+ // The first string contains the directory information. Not useful.
+ filesKept.remove(0);
+
+ for (String file : filesKept) {
+ pathsKept.add(manifestFs.getFileStatus(new Path(file)));
+ }
+ return pathsKept;
+ }
+
+ private long getMergeSize(List<FileStatus> fileStatuses, long avgSize) {
+ AverageSize averageSize = getAverageSize(fileStatuses);
+ return getMergeSize(averageSize, avgSize);
+ }
+
+ private long getMergeSize(AverageSize averageSize, long avgSize) {
+ if (averageSize.getTotalSize() <= 0) {
return -1;
}
if (averageSize.getNumFiles() <= 1) {
return -1;
}
- if (averageSize.getTotalSize()/averageSize.getNumFiles() < avgSize) {
+ if (averageSize.getTotalSize() / averageSize.getNumFiles() < avgSize) {
return averageSize.getTotalSize();
}
return -1;
}
+
+ private void setupWorkWhenUsingManifestFile(MapWork mapWork, List<FileStatus> fileStatuses, Path dirPath,
+ boolean isTblLevel) {
+ Map<String, Operator<? extends OperatorDesc>> aliasToWork = mapWork.getAliasToWork();
+ Map<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo();
+ Operator<? extends OperatorDesc> op = aliasToWork.get(dirPath.toString());
+ PartitionDesc partitionDesc = pathToPartitionInfo.get(dirPath);
+ Path tmpDirPath = Utilities.toTempPath(dirPath);
+ if (op != null) {
+ aliasToWork.remove(dirPath.toString());
+ aliasToWork.put(tmpDirPath.toString(), op);
+ mapWork.setAliasToWork(aliasToWork);
+ }
+ if (partitionDesc != null) {
+ pathToPartitionInfo.remove(dirPath);
+ pathToPartitionInfo.put(tmpDirPath, partitionDesc);
+ mapWork.setPathToPartitionInfo(pathToPartitionInfo);
+ }
+ mapWork.removePathToAlias(dirPath);
+ mapWork.addPathToAlias(tmpDirPath, tmpDirPath.toString());
+ List<Path> paths = new ArrayList<>();
+ if (isTblLevel) {
+ for (FileStatus fileStatus : fileStatuses) {
+ if (!fileStatus.isDirectory()) {
+ paths.add(fileStatus.getPath());
+ }
+ }
+ mapWork.setInputPaths(paths);
+ }
+ mapWork.setUseInputPathsDirectly(true);
+ }
+
+ private Map<FileStatus, List<FileStatus>> getManifestDirs(FileSystem inpFs, List<FileStatus> fileStatuses)
+ throws IOException {
+ Map<FileStatus, List<FileStatus>> manifestDirsToPaths = new HashMap<>();
+ for (FileStatus fileStatus : fileStatuses) {
+ if (!fileStatus.isDirectory()) {
+ FileStatus parentDir = inpFs.getFileStatus(fileStatus.getPath().getParent());
+ List<FileStatus> fileStatusList = new ArrayList<>(Collections.singletonList(fileStatus));
Review Comment:
Done. Replaced it with `Lists.newArrayList()`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]