[
https://issues.apache.org/jira/browse/DRILL-8188?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17806649#comment-17806649
]
ASF GitHub Bot commented on DRILL-8188:
---------------------------------------
jnturton commented on code in PR #2515:
URL: https://github.com/apache/drill/pull/2515#discussion_r1446231938
##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
}
}
- public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
- this.readerConfig = readerConfig;
- this.maxRecords = maxRecords;
+ public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+ errorContext = negotiator.parentErrorContext();
+ file = negotiator.file();
+ readerConfig = config;
dataWriters = new ArrayList<>();
- this.showMetadataPreview = readerConfig.formatConfig.showPreview();
- }
+ showMetadataPreview = readerConfig.formatConfig.showPreview();
- @Override
- public boolean open(FileSchemaNegotiator negotiator) {
- split = negotiator.split();
- errorContext = negotiator.parentErrorContext();
// Since the HDF file reader uses a stream to actually read the file, the file name from the
// module is incorrect.
- fileName = split.getPath().getName();
- try {
- openFile(negotiator);
- } catch (IOException e) {
- throw UserException
- .dataReadError(e)
- .addContext("Failed to close input file: %s", split.getPath())
- .addContext(errorContext)
- .build(logger);
- }
+ fileName = file.split().getPath().getName();
- ResultSetLoader loader;
- if (readerConfig.defaultPath == null) {
- // Get file metadata
- List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
- metadataIterator = metadata.iterator();
-
- // Schema for Metadata query
- SchemaBuilder builder = new SchemaBuilder()
- .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
- .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
- .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
- .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
- .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
- negotiator.tableSchema(builder.buildSchema(), false);
-
- loader = negotiator.build();
- dimensions = new int[0];
- rowWriter = loader.writer();
-
- } else {
- // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
- // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
- Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
- dimensions = dataSet.getDimensions();
-
- loader = negotiator.build();
- rowWriter = loader.writer();
- writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
- negotiator.parentErrorContext());
- if (dimensions.length <= 1) {
- buildSchemaFor1DimensionalDataset(dataSet);
- } else if (dimensions.length == 2) {
- buildSchemaFor2DimensionalDataset(dataSet);
- } else {
- // Case for datasets of greater than 2D
- // These are automatically flattened
- buildSchemaFor2DimensionalDataset(dataSet);
+ { // Opens an HDF5 file
Review Comment:
I guess some of these could become private methods but it's a minor point
for me.
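For illustration only, the extraction the reviewer hints at could look roughly like the sketch below. The helper name openHdfFile is hypothetical and the body simply reuses fields already present in HDF5BatchReader (file, errorContext, logger); it is not part of the PR.

  // Hypothetical helper pulled out of the anonymous constructor block above.
  private HdfFile openHdfFile() {
    try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
      // HdfFile.fromInputStream() copies the stream to a temporary file, so the
      // stream itself can be closed as soon as this call returns.
      return HdfFile.fromInputStream(in);
    } catch (IOException e) {
      throw UserException
        .dataReadError(e)
        .message("Failed to open input file: %s", file.split().getPath())
        .addContext(errorContext)
        .build(logger);
    }
  }

The block in the constructor would then shrink to a single assignment, hdfFile = openHdfFile();, which is essentially what turning these blocks into private methods would look like.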
##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
}
}
- public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
- this.readerConfig = readerConfig;
- this.maxRecords = maxRecords;
+ public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+ errorContext = negotiator.parentErrorContext();
+ file = negotiator.file();
+ readerConfig = config;
dataWriters = new ArrayList<>();
- this.showMetadataPreview = readerConfig.formatConfig.showPreview();
- }
+ showMetadataPreview = readerConfig.formatConfig.showPreview();
- @Override
- public boolean open(FileSchemaNegotiator negotiator) {
- split = negotiator.split();
- errorContext = negotiator.parentErrorContext();
// Since the HDF file reader uses a stream to actually read the file, the file name from the
// module is incorrect.
- fileName = split.getPath().getName();
- try {
- openFile(negotiator);
- } catch (IOException e) {
- throw UserException
- .dataReadError(e)
- .addContext("Failed to close input file: %s", split.getPath())
- .addContext(errorContext)
- .build(logger);
+ fileName = file.split().getPath().getName();
+
+ { // Opens an HDF5 file
+ try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+ /*
+ * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+ * a byte array or byte buffer. This implementation is better in that it does not require creating
+ * a temporary file which must be deleted later. However, it could result in memory issues in the
+ * event of large files.
+ */
+ hdfFile = HdfFile.fromInputStream(in);
+ } catch (IOException e) {
+ throw UserException
+ .dataReadError(e)
+ .message("Failed to open input file: %s", file.split().getPath())
+ .addContext(errorContext)
+ .build(logger);
+ }
}
- ResultSetLoader loader;
- if (readerConfig.defaultPath == null) {
- // Get file metadata
- List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
- metadataIterator = metadata.iterator();
-
- // Schema for Metadata query
- SchemaBuilder builder = new SchemaBuilder()
- .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
- .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
- .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
- .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
- .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
- negotiator.tableSchema(builder.buildSchema(), false);
-
- loader = negotiator.build();
- dimensions = new int[0];
- rowWriter = loader.writer();
-
- } else {
- // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
- // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
- Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
- dimensions = dataSet.getDimensions();
-
- loader = negotiator.build();
- rowWriter = loader.writer();
- writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
- negotiator.parentErrorContext());
- if (dimensions.length <= 1) {
- buildSchemaFor1DimensionalDataset(dataSet);
- } else if (dimensions.length == 2) {
- buildSchemaFor2DimensionalDataset(dataSet);
+ { // Build the schema and initial the writer
+ ResultSetLoader loader;
+ if (readerConfig.defaultPath == null) {
+ // Get file metadata
+ List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+ metadataIterator = metadata.iterator();
+
+ // Schema for Metadata query
+ SchemaBuilder builder = new SchemaBuilder()
+ .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+ .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+ .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+ .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+ .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+ negotiator.tableSchema(builder.buildSchema(), false);
Review Comment:
It would be nice to have the GitHub comment above this one as a comment
above line 214 in the code.
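As a sketch of that suggestion, the remark could be carried into the source as an ordinary comment above the tableSchema() call. The wording below is a guess, since the GitHub comment being referred to is not quoted in this thread:

      // The metadata schema is registered as not complete (second argument false),
      // so columns that are projected but absent from it can still be added later.
      negotiator.tableSchema(builder.buildSchema(), false);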
##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
}
}
- public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
- this.readerConfig = readerConfig;
- this.maxRecords = maxRecords;
+ public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+ errorContext = negotiator.parentErrorContext();
+ file = negotiator.file();
+ readerConfig = config;
dataWriters = new ArrayList<>();
- this.showMetadataPreview = readerConfig.formatConfig.showPreview();
- }
+ showMetadataPreview = readerConfig.formatConfig.showPreview();
- @Override
- public boolean open(FileSchemaNegotiator negotiator) {
- split = negotiator.split();
- errorContext = negotiator.parentErrorContext();
// Since the HDF file reader uses a stream to actually read the file, the file name from the
// module is incorrect.
- fileName = split.getPath().getName();
- try {
- openFile(negotiator);
- } catch (IOException e) {
- throw UserException
- .dataReadError(e)
- .addContext("Failed to close input file: %s", split.getPath())
- .addContext(errorContext)
- .build(logger);
+ fileName = file.split().getPath().getName();
+
+ { // Opens an HDF5 file
+ try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+ /*
+ * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+ * a byte array or byte buffer. This implementation is better in that it does not require creating
+ * a temporary file which must be deleted later. However, it could result in memory issues in the
+ * event of large files.
+ */
+ hdfFile = HdfFile.fromInputStream(in);
+ } catch (IOException e) {
+ throw UserException
+ .dataReadError(e)
+ .message("Failed to open input file: %s", file.split().getPath())
+ .addContext(errorContext)
+ .build(logger);
+ }
}
- ResultSetLoader loader;
- if (readerConfig.defaultPath == null) {
- // Get file metadata
- List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
- metadataIterator = metadata.iterator();
-
- // Schema for Metadata query
- SchemaBuilder builder = new SchemaBuilder()
- .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
- .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
- .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
- .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
- .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
- negotiator.tableSchema(builder.buildSchema(), false);
-
- loader = negotiator.build();
- dimensions = new int[0];
- rowWriter = loader.writer();
-
- } else {
- // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
- // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
- Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
- dimensions = dataSet.getDimensions();
-
- loader = negotiator.build();
- rowWriter = loader.writer();
- writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
- negotiator.parentErrorContext());
- if (dimensions.length <= 1) {
- buildSchemaFor1DimensionalDataset(dataSet);
- } else if (dimensions.length == 2) {
- buildSchemaFor2DimensionalDataset(dataSet);
+ { // Build the schema and initial the writer
+ ResultSetLoader loader;
+ if (readerConfig.defaultPath == null) {
+ // Get file metadata
+ List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+ metadataIterator = metadata.iterator();
+
+ // Schema for Metadata query
+ SchemaBuilder builder = new SchemaBuilder()
+ .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+ .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+ .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+ .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+ .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+ .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+ negotiator.tableSchema(builder.buildSchema(), false);
+
+ loader = negotiator.build();
+ rowWriter = loader.writer();
+
+ dimensions = null;
+ writerSpec = null;
+ pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
+ dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
+ fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
+ dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
+ linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
+ elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
+ datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
+ dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
} else {
- // Case for datasets of greater than 2D
- // These are automatically flattened
- buildSchemaFor2DimensionalDataset(dataSet);
+ // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
+ // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
+ Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
+ dimensions = dataSet.getDimensions();
+
+ loader = negotiator.build();
+ rowWriter = loader.writer();
+ writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
+ if (dimensions.length <= 1) {
+ buildSchemaFor1DimensionalDataset(dataSet);
+ } else if (dimensions.length == 2) {
+ buildSchemaFor2DimensionalDataset(dataSet);
+ } else {
+ // Case for datasets of greater than 2D
+ // These are automatically flattened
+ buildSchemaFor2DimensionalDataset(dataSet);
+ }
+ // Initial the final fields
+ metadataIterator = null;
+ pathWriter = null;
+ dataTypeWriter = null;
+ fileNameWriter = null;
+ dataSizeWriter = null;
+ linkWriter = null;
+ elementCountWriter = null;
+ datasetTypeWriter = null;
+ dimensionsWriter = null;
}
}
- if (readerConfig.defaultPath == null) {
- pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
- dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
- fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
- dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
- linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
- elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
- datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
- dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
- }
- return true;
- }
-
- /**
- * This function is called when the default path is set and the data set is a single dimension.
- * This function will create an array of one dataWriter of the
- * correct datatype
- * @param dataset The HDF5 dataset
- */
- private void buildSchemaFor1DimensionalDataset(Dataset dataset) {
- MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
-
- // Case for null or unknown data types:
- if (currentDataType == null) {
- logger.warn("Couldn't add {}", dataset.getJavaType().getName());
- return;
- }
- dataWriters.add(buildWriter(currentDataType));
}
private HDF5DataWriter buildWriter(MinorType dataType) {
switch (dataType) {
- /*case GENERIC_OBJECT:
- return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);*/
+ /* case GENERIC_OBJECT:
Review Comment:
I think we should have an explanatory comment if we're going to keep
commented out code.
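Purely as an illustration, such a comment could sit directly above the retained code; the explanation below is a guess at the reason for keeping it, not the PR author's wording:

      /*
       * GENERIC_OBJECT (HDF5 enum data) is not handled by the EVF2-based writers yet.
       * The old V1 writer is kept here, commented out, as a reference for restoring
       * enum support later.
       *
       * case GENERIC_OBJECT:
       *   return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
       */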
> Convert HDF5 format to EVF2
> ---------------------------
>
> Key: DRILL-8188
> URL: https://issues.apache.org/jira/browse/DRILL-8188
> Project: Apache Drill
> Issue Type: Improvement
> Affects Versions: 1.20.0
> Reporter: Cong Luo
> Assignee: Cong Luo
> Priority: Major
>
> Use EVF V2 instead of the old V1.
> Also, fixed a few bugs in the V2 framework.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)