[ 
https://issues.apache.org/jira/browse/DRILL-7063?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16806578#comment-16806578
 ] 

ASF GitHub Bot commented on DRILL-7063:
---------------------------------------

vdiravka commented on pull request #1723: DRILL-7063: Separate metadata cache 
file into summary, file metadata
URL: https://github.com/apache/drill/pull/1723#discussion_r270779624
 
 

 ##########
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
 ##########
 @@ -633,43 +713,120 @@ private void readBlockMeta(Path path, boolean dirsOnly, 
MetadataContext metaCont
         parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
         if (!alreadyCheckedModification && 
tableModified(parquetTableMetadataDirs.getDirectories(), path, 
metadataParentDir, metaContext, fs)) {
           parquetTableMetadataDirs =
-              
(createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
 fs, true, null)).getRight();
+              
(createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
 fs, true, null, true)).getRight();
           newMetadata = true;
         }
       } else {
-        parquetTableMetadata = mapper.readValue(is, 
ParquetTableMetadataBase.class);
+        if (isFileMetadata) {
+          parquetTableMetadata.assignFiles((mapper.readValue(is, 
FileMetadata.class)).getFiles());
+          if (new 
MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new 
MetadataVersion(4, 0)) >= 0) {
+            ((ParquetTableMetadata_v4) 
parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
+          }
+
+          if (!alreadyCheckedModification && 
tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, 
metaContext, fs)) {
+            parquetTableMetadata =
+                    
(createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
 fs, true, null, true)).getLeft();
+            newMetadata = true;
+          }
+        } else if (isSummaryFile) {
+          MetadataSummary metadataSummary = mapper.readValue(is, 
Metadata_V4.MetadataSummary.class);
+          ParquetTableMetadata_v4 parquetTableMetadata_v4 = new 
ParquetTableMetadata_v4(metadataSummary);
+          parquetTableMetadata = (ParquetTableMetadataBase) 
parquetTableMetadata_v4;
+        } else {
+          parquetTableMetadata = mapper.readValue(is, 
ParquetTableMetadataBase.class);
+          if (new 
MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new 
MetadataVersion(3, 0)) >= 0) {
+            ((Metadata_V3.ParquetTableMetadata_v3) 
parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
+          }
+          if (!alreadyCheckedModification && 
tableModified((parquetTableMetadata.getDirectories()), path, metadataParentDir, 
metaContext, fs)) {
+            parquetTableMetadata =
+                    
(createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
 fs, true, null, true)).getLeft();
+            newMetadata = true;
+          }
+        }
         if (timer != null) {
           logger.debug("Took {} ms to read metadata from cache file", 
timer.elapsed(TimeUnit.MILLISECONDS));
           timer.stop();
         }
-        if (new 
MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new 
MetadataVersion(3, 0)) >= 0) {
-          ((ParquetTableMetadata_v3) 
parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
-        }
-          if (!alreadyCheckedModification && 
tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, 
metaContext, fs)) {
-          // TODO change with current columns in existing metadata (auto 
refresh feature)
-          parquetTableMetadata =
-              
(createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
 fs, true, null)).getLeft();
-          newMetadata = true;
+        if (!isSummaryFile) {
+          // DRILL-5009: Remove the RowGroup if it is empty
+          List<? extends ParquetFileMetadata> files = 
parquetTableMetadata.getFiles();
+          if (files != null) {
+            for (ParquetFileMetadata file : files) {
+              List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
+              rowGroups.removeIf(r -> r.getRowCount() == 0);
+            }
+          }
         }
-
-        // DRILL-5009: Remove the RowGroup if it is empty
-        List<? extends ParquetFileMetadata> files = 
parquetTableMetadata.getFiles();
-        for (ParquetFileMetadata file : files) {
-          List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
-          rowGroups.removeIf(r -> r.getRowCount() == 0);
+        if (newMetadata) {
+          // if new metadata files were created, invalidate the existing 
metadata context
+          metaContext.clear();
         }
-
-      }
-      if (newMetadata) {
-        // if new metadata files were created, invalidate the existing 
metadata context
-        metaContext.clear();
       }
     } catch (IOException e) {
       logger.error("Failed to read '{}' metadata file", path, e);
       metaContext.setMetadataCacheCorrupted(true);
     }
   }
 
+  private Set<String> getInterestingColumns(FileSystem fs, Path 
metadataParentDir, boolean autoRefreshTriggered) {
+    Metadata_V4.MetadataSummary metadataSummary = getSummary(fs, 
metadataParentDir, autoRefreshTriggered, null);
+    if (metadataSummary == null) {
+      return null;
+    } else {
+      Set<String> interestingColumns = new HashSet<String>();
+      for (ColumnTypeMetadata_v4 columnTypeMetadata_v4: 
metadataSummary.columnTypeInfo.values()) {
+        if (columnTypeMetadata_v4.isInteresting) {
+          interestingColumns.add(String.join("", columnTypeMetadata_v4.name));
+        }
+      }
+      return interestingColumns;
+    }
+  }
+
+  private boolean getallColumnsInteresting(FileSystem fs, Path 
metadataParentDir, boolean autoRefreshTriggered) {
+    Metadata_V4.MetadataSummary metadataSummary = getSummary(fs, 
metadataParentDir, autoRefreshTriggered, null);
+    if (metadataSummary == null) {
+      return true;
+    }
+    return metadataSummary.isAllColumnsInteresting();
+  }
+
+  public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path 
metadataParentDir, boolean autoRefreshTriggered, ParquetReaderConfig 
readerConfig) {
+    Path summaryFile = new Path(metadataParentDir, METADATA_SUMMARY_FILENAME);
+    Path metadataDirFile = new Path(metadataParentDir, 
METADATA_DIRECTORIES_FILENAME);
+    MetadataContext metaContext = new MetadataContext();
+    try {
+      if (!fs.exists(summaryFile)) {
+        return null;
+      } else {
+        // If the autorefresh is not triggered, check if the cache file is 
stale and trigger auto-refresh
+        if (!autoRefreshTriggered) {
+          Metadata metadata = new Metadata(readerConfig);
+          ParquetTableMetadataDirs metadataDirs  = readMetadataDirs(fs, 
metadataDirFile, metaContext, readerConfig);
+          if (metadata.tableModified(metadataDirs.getDirectories(), 
summaryFile, metadataParentDir, metaContext, fs) && true) {
+            ParquetTableMetadata_v4 parquetTableMetadata = 
(metadata.createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(summaryFile.getParent()),
 fs, true, null, true)).getLeft();
+            return parquetTableMetadata.getSummary();
+          }
+        }
+        // Read the existing metadataSummary cache file to get the 
metadataSummary
+        ObjectMapper mapper = new ObjectMapper();
+        final SimpleModule serialModule = new SimpleModule();
+        serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
+        serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new 
ColumnTypeMetadata_v4.Key.DeSerializer());
+        AfterburnerModule module = new AfterburnerModule();
+        module.setUseOptimizedBeanDeserializer(true);
+        mapper.registerModule(serialModule);
+        mapper.registerModule(module);
+        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
+        InputStream is = fs.open(summaryFile);
+        Metadata_V4.MetadataSummary metadataSummary = mapper.readValue(is, 
Metadata_V4.MetadataSummary.class);
+        return metadataSummary;
+        }
+    } catch (IOException e) {
+      return null;
 
 Review comment:
   How is the case with null handled?
   It may also be necessary to print the info from `e` to the logs
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Create separate summary file for schema, totalRowCount, totalNullCount 
> (includes maintenance)
> ---------------------------------------------------------------------------------------------
>
>                 Key: DRILL-7063
>                 URL: https://issues.apache.org/jira/browse/DRILL-7063
>             Project: Apache Drill
>          Issue Type: Sub-task
>          Components: Metadata
>            Reporter: Venkata Jyothsna Donapati
>            Assignee: Venkata Jyothsna Donapati
>            Priority: Major
>             Fix For: 1.16.0
>
>   Original Estimate: 252h
>  Remaining Estimate: 252h
>




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to