[ https://issues.apache.org/jira/browse/DRILL-8188?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17806649#comment-17806649 ]
ASF GitHub Bot commented on DRILL-8188:
---------------------------------------

jnturton commented on code in PR #2515:
URL: https://github.com/apache/drill/pull/2515#discussion_r1446231938


##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }

-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();

-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
-    }
+    fileName = file.split().getPath().getName();

-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
-      } else {
-        // Case for datasets of greater than 2D
-        // These are automatically flattened
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Opens an HDF5 file

Review Comment:
   I guess some of these could become private methods but it's a minor point for me.
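For illustration only, a minimal sketch of the kind of extraction suggested above: the "Opens an HDF5 file" initializer block (quoted in full in the following threads) pulled out into a private method that the constructor calls. The method name openHdf5File is hypothetical; the fields and calls it uses (file, errorContext, hdfFile, openPossiblyCompressedStream(), HdfFile.fromInputStream()) come from the PR diff itself.

   // Hypothetical refactoring sketch, not code from the PR.
   private void openHdf5File() {
     // Open the (possibly compressed) file as a stream and let jhdf consume it.
     try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
       hdfFile = HdfFile.fromInputStream(in);
     } catch (IOException e) {
       throw UserException
         .dataReadError(e)
         .message("Failed to open input file: %s", file.split().getPath())
         .addContext(errorContext)
         .build(logger);
     }
   }

The constructor body would then shrink to a call to openHdf5File() in place of the initializer block.
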
##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }

-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();

-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
+    fileName = file.split().getPath().getName();
+
+    { // Opens an HDF5 file
+      try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+        /*
+         * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+         * a byte array or byte buffer. This implementation is better in that it does not require creating
+         * a temporary file which must be deleted later. However, it could result in memory issues in the
+         * event of large files.
+         */
+        hdfFile = HdfFile.fromInputStream(in);
+      } catch (IOException e) {
+        throw UserException
+          .dataReadError(e)
+          .message("Failed to open input file: %s", file.split().getPath())
+          .addContext(errorContext)
+          .build(logger);
+      }
     }

-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Build the schema and initialize the writer
+      ResultSetLoader loader;
+      if (readerConfig.defaultPath == null) {
+        // Get file metadata
+        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+        metadataIterator = metadata.iterator();
+
+        // Schema for Metadata query
+        SchemaBuilder builder = new SchemaBuilder()
+          .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+          .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+          .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+          .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+          .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+        negotiator.tableSchema(builder.buildSchema(), false);

Review Comment:
   It would be nice to have the GitHub comment above this one as a comment above line 214 in the code.


##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +164,104 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig)
     }
   }

-  public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
-    this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
+  public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    errorContext = negotiator.parentErrorContext();
+    file = negotiator.file();
+    readerConfig = config;
     dataWriters = new ArrayList<>();
-    this.showMetadataPreview = readerConfig.formatConfig.showPreview();
-  }
+    showMetadataPreview = readerConfig.formatConfig.showPreview();

-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     // Since the HDF file reader uses a stream to actually read the file, the file name from the
     // module is incorrect.
-    fileName = split.getPath().getName();
-    try {
-      openFile(negotiator);
-    } catch (IOException e) {
-      throw UserException
-        .dataReadError(e)
-        .addContext("Failed to close input file: %s", split.getPath())
-        .addContext(errorContext)
-        .build(logger);
+    fileName = file.split().getPath().getName();
+
+    { // Opens an HDF5 file
+      try (InputStream in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath())) {
+        /*
+         * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from
+         * a byte array or byte buffer. This implementation is better in that it does not require creating
+         * a temporary file which must be deleted later. However, it could result in memory issues in the
+         * event of large files.
+         */
+        hdfFile = HdfFile.fromInputStream(in);
+      } catch (IOException e) {
+        throw UserException
+          .dataReadError(e)
+          .message("Failed to open input file: %s", file.split().getPath())
+          .addContext(errorContext)
+          .build(logger);
+      }
     }

-    ResultSetLoader loader;
-    if (readerConfig.defaultPath == null) {
-      // Get file metadata
-      List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
-      metadataIterator = metadata.iterator();
-
-      // Schema for Metadata query
-      SchemaBuilder builder = new SchemaBuilder()
-        .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
-        .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
-        .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
-        .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
-        .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
-        .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
-      negotiator.tableSchema(builder.buildSchema(), false);
-
-      loader = negotiator.build();
-      dimensions = new int[0];
-      rowWriter = loader.writer();
-
-    } else {
-      // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
-      // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
-      Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
-      dimensions = dataSet.getDimensions();
-
-      loader = negotiator.build();
-      rowWriter = loader.writer();
-      writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
-          negotiator.parentErrorContext());
-      if (dimensions.length <= 1) {
-        buildSchemaFor1DimensionalDataset(dataSet);
-      } else if (dimensions.length == 2) {
-        buildSchemaFor2DimensionalDataset(dataSet);
+    { // Build the schema and initialize the writer
+      ResultSetLoader loader;
+      if (readerConfig.defaultPath == null) {
+        // Get file metadata
+        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
+        metadataIterator = metadata.iterator();
+
+        // Schema for Metadata query
+        SchemaBuilder builder = new SchemaBuilder()
+          .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
+          .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
+          .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
+          .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
+          .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
+          .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
+
+        negotiator.tableSchema(builder.buildSchema(), false);
+
+        loader = negotiator.build();
+        rowWriter = loader.writer();
+
+        dimensions = null;
+        writerSpec = null;
+        pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
+        dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
+        fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
+        dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
+        linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
+        elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
+        datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
+        dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
       } else {
-        // Case for datasets of greater than 2D
-        // These are automatically flattened
-        buildSchemaFor2DimensionalDataset(dataSet);
+        // This is the case when the default path is specified. Since the user is explicitly asking for a dataset
+        // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
+        Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
+        dimensions = dataSet.getDimensions();
+
+        loader = negotiator.build();
+        rowWriter = loader.writer();
+        writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
+        if (dimensions.length <= 1) {
+          buildSchemaFor1DimensionalDataset(dataSet);
+        } else if (dimensions.length == 2) {
+          buildSchemaFor2DimensionalDataset(dataSet);
+        } else {
+          // Case for datasets of greater than 2D
+          // These are automatically flattened
+          buildSchemaFor2DimensionalDataset(dataSet);
+        }
+        // Initialize the final fields
+        metadataIterator = null;
+        pathWriter = null;
+        dataTypeWriter = null;
+        fileNameWriter = null;
+        dataSizeWriter = null;
+        linkWriter = null;
+        elementCountWriter = null;
+        datasetTypeWriter = null;
+        dimensionsWriter = null;
+      }
+    }

-    if (readerConfig.defaultPath == null) {
-      pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
-      dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
-      fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
-      dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
-      linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
-      elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
-      datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
-      dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
-    }
-    return true;
-  }
-
-  /**
-   * This function is called when the default path is set and the data set is a single dimension.
-   * This function will create an array of one dataWriter of the
-   * correct datatype
-   * @param dataset The HDF5 dataset
-   */
-  private void buildSchemaFor1DimensionalDataset(Dataset dataset) {
-    MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
-
-    // Case for null or unknown data types:
-    if (currentDataType == null) {
-      logger.warn("Couldn't add {}", dataset.getJavaType().getName());
-      return;
-    }
-    dataWriters.add(buildWriter(currentDataType));
   }

   private HDF5DataWriter buildWriter(MinorType dataType) {
     switch (dataType) {
-      /*case GENERIC_OBJECT:
-        return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);*/
+      /* case GENERIC_OBJECT:

Review Comment:
   I think we should have an explanatory comment if we're going to keep commented out code.


> Convert HDF5 format to EVF2
> ---------------------------
>
>                 Key: DRILL-8188
>                 URL: https://issues.apache.org/jira/browse/DRILL-8188
>             Project: Apache Drill
>          Issue Type: Improvement
>    Affects Versions: 1.20.0
>            Reporter: Cong Luo
>            Assignee: Cong Luo
>            Priority: Major
>
> Use EVF V2 instead of old V1.
> Also, fixed a few bugs in the V2 framework.

--
This message was sent by Atlassian Jira
(v8.20.10#820010)
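
Regarding the last review comment above: purely as an illustration, an explanatory comment above the commented-out case in buildWriter() might look like the sketch below. The stated rationale is an assumption made for the sketch and is not confirmed anywhere in this thread; only the commented-out case itself comes from the PR diff.

      /*
       * Assumed rationale, for illustration: HDF5 enum (GENERIC_OBJECT) values are not
       * currently supported; the case below is kept commented out as a starting point
       * for a future HDF5EnumDataWriter-based implementation.
       *
       * case GENERIC_OBJECT:
       *   return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
       */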