jnturton commented on code in PR #2515:
URL: https://github.com/apache/drill/pull/2515#discussion_r1446231938


##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }
 
-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();
 
-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
-    }
+    fileName = file.split().getPath().getName();
 
-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
-      } else {
-        // Case for datasets of greater than 2D
-        // These are automatically flattened
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Opens an HDF5 file

Review Comment:
   I guess some of these could become private methods, but it's a minor point for me.
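   One hedged sketch of such an extraction, assuming hdfFile is not declared final (if these fields are final, the block form is what permits definite assignment, and the extraction below would not compile):

       // Hypothetical helper name; the body is taken from the "Opens an
       // HDF5 file" block shown in full in the later hunks.
       private void openHdf5File() {
         try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
           hdfFile = HdfFile.fromInputStream(in);
         } catch (IOException e) {
           throw UserException
             .dataReadError(e)
             .message("Failed to open input file: %s", file.split().getPath())
             .addContext(errorContext)
             .build(logger);
         }
       }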



##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }
 
-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();
 
-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
+    fileName = file.split().getPath().getName();
+
+    { // Opens an HDF5 file
+      try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+        /*
+         * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+         * a byte array or byte buffer. This implementation is better in that it does not require creating
+         * a temporary file which must be deleted later. However, it could result in memory issues in the
+         * event of large files.
+         */
+        hdfFile = HdfFile.fromInputStream(in);
+      } catch (IOException e) {
+        throw UserException
+          .dataReadError(e)
+          .message("Failed to open input file: %s", file.split().getPath())
+          .addContext(errorContext)
+          .build(logger);
+      }
     }
 
-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Build the schema and initialize the writer
+      ResultSetLoader loader;
+      if (readerConfig.defaultPath == null) {
+        // Get file metadata
+        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+        metadataIterator = metadata.iterator();
+
+        // Schema for Metadata query
+        SchemaBuilder builder = new SchemaBuilder()
+          .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+          .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+          .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+          .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+          .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+        negotiator.tableSchema(builder.buildSchema(), false);

Review Comment:
   It would be nice to have the GitHub comment above this one as a comment above line 214 in the code.
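   A hedged illustration of that suggestion; the referenced thread comment is not quoted in this mail, so the comment text below is only a placeholder:

       // TODO(review): inline the explanation from the review thread here
       // (why this fixed metadata schema is declared up front).
       negotiator.tableSchema(builder.buildSchema(), false);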


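   Separately, the jhdf comment inside the hunk above mentions reading from a byte array or byte buffer instead of a stream. A sketch of that variant, with two assumptions flagged: HdfFile.fromByteBuffer is taken from that comment rather than verified against the jhdf API, and commons-io's IOUtils (plus java.nio.ByteBuffer) is assumed to be available:

       try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
         // Buffers the whole file on the heap: avoids jhdf's temporary file,
         // but risks memory pressure on large inputs, as the comment notes.
         byte[] bytes = IOUtils.toByteArray(in);
         hdfFile = HdfFile.fromByteBuffer(ByteBuffer.wrap(bytes)); // assumed jhdf entry point
       } catch (IOException e) {
         throw UserException
           .dataReadError(e)
           .message("Failed to open input file: %s", file.split().getPath())
           .addContext(errorContext)
           .build(logger);
       }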

##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }
 
-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();
 
-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
+    fileName = file.split().getPath().getName();
+
+    { // Opens an HDF5 file
+      try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+        /*
+         * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+         * a byte array or byte buffer. This implementation is better in that it does not require creating
+         * a temporary file which must be deleted later. However, it could result in memory issues in the
+         * event of large files.
+         */
+        hdfFile = HdfFile.fromInputStream(in);
+      } catch (IOException e) {
+        throw UserException
+          .dataReadError(e)
+          .message("Failed to open input file: %s", file.split().getPath())
+          .addContext(errorContext)
+          .build(logger);
+      }
     }
 
-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Build the schema and initialize the writer
+      ResultSetLoader loader;
+      if (readerConfig.defaultPath == null) {
+        // Get file metadata
+        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+        metadataIterator = metadata.iterator();
+
+        // Schema for Metadata query
+        SchemaBuilder builder = new SchemaBuilder()
+          .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+          .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+          .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+          .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+          .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+        negotiator.tableSchema(builder.buildSchema(), false);
+
+        loader = negotiator.build();
+        rowWriter = loader.writer();
+
+        dimensions = null;
+        writerSpec = null;
+        pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
+        dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
+        fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
+        dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
+        linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
+        elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
+        datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
+        dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
       } else {
-        // Case for datasets of greater than 2D
-        // These are automatically flattened
-        buildSchemaFor2DimensionalDataset(dataSet);
+        // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
+        // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
+        Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
+        dimensions = dataSet.getDimensions();
+
+        loader = negotiator.build();
+        rowWriter = loader.writer();
+        writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
+        if (dimensions.length <= 1) {
+          buildSchemaFor1DimensionalDataset(dataSet);
+        } else if (dimensions.length == 2) {
+          buildSchemaFor2DimensionalDataset(dataSet);
+        } else {
+          // Case for datasets of greater than 2D
+          // These are automatically flattened
+          buildSchemaFor2DimensionalDataset(dataSet);
+        }
+        // Initialize the final fields
+        metadataIterator = null;
+        pathWriter = null;
+        dataTypeWriter = null;
+        fileNameWriter = null;
+        dataSizeWriter = null;
+        linkWriter = null;
+        elementCountWriter = null;
+        datasetTypeWriter = null;
+        dimensionsWriter = null;
       }
     }
-    if (readerConfig.defaultPath == null) {
-      pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
-      dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
-      fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
-      dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
-      linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
-      elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
-      datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
-      dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
-    }
-    return true;
-  }
-
-  /**
-   * This function is called when the default path is set and the data set is a single dimension.
-   * This function will create an array of one dataWriter of the
-   * correct datatype
-   * @param dataset The HDF5 dataset
-   */
-  private void buildSchemaFor1DimensionalDataset(Dataset dataset) {
-    MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
-
-    // Case for null or unknown data types:
-    if (currentDataType == null) {
-      logger.warn("Couldn't add {}", dataset.getJavaType().getName());
-      return;
-    }
-    dataWriters.add(buildWriter(currentDataType));
   }
 
   private HDF5DataWriter buildWriter(MinorType dataType) {
     switch (dataType) {
-      /*case GENERIC_OBJECT:
-        return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);*/
+      /* case GENERIC_OBJECT:

Review Comment:
   I think we should have an explanatory comment if we're going to keep commented-out code.
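   For example, something along these lines above the disabled case (the real reason must come from the author; this wording is a placeholder):

       /* Disabled: GENERIC_OBJECT (HDF5 enum) datasets are not currently
        * handled; kept as a starting point in case enum support is revisited.
        * case GENERIC_OBJECT:
        *   return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
        */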



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@drill.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
