[ https://issues.apache.org/jira/browse/DRILL-8188?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17523084#comment-17523084 ]
ASF GitHub Bot commented on DRILL-8188: --------------------------------------- luocooong commented on code in PR #2515: URL: https://github.com/apache/drill/pull/2515#discussion_r851623209 ########## contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java: ########## @@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig) } } - public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) { - this.readerConfig = readerConfig; - this.maxRecords = maxRecords; + public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) { + errorContext = negotiator.parentErrorContext(); + file = negotiator.file(); + readerConfig = config; dataWriters = new ArrayList<>(); - this.showMetadataPreview = readerConfig.formatConfig.showPreview(); - } + showMetadataPreview = readerConfig.formatConfig.showPreview(); - @Override - public boolean open(FileSchemaNegotiator negotiator) { - split = negotiator.split(); - errorContext = negotiator.parentErrorContext(); // Since the HDF file reader uses a stream to actually read the file, the file name from the // module is incorrect. 
- fileName = split.getPath().getName(); - try { - openFile(negotiator); - } catch (IOException e) { - throw UserException - .dataReadError(e) - .addContext("Failed to close input file: %s", split.getPath()) - .addContext(errorContext) - .build(logger); - } + fileName = file.split().getPath().getName(); - ResultSetLoader loader; - if (readerConfig.defaultPath == null) { - // Get file metadata - List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>()); - metadataIterator = metadata.iterator(); - - // Schema for Metadata query - SchemaBuilder builder = new SchemaBuilder() - .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT) - .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT) - .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT) - .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR) - .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR); - - negotiator.tableSchema(builder.buildSchema(), false); - - loader = negotiator.build(); - dimensions = new int[0]; - rowWriter = loader.writer(); - - } else { - // This is the case when the default path is specified. 
Since the user is explicitly asking for a dataset - // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns - Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath); - dimensions = dataSet.getDimensions(); - - loader = negotiator.build(); - rowWriter = loader.writer(); - writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), - negotiator.parentErrorContext()); - if (dimensions.length <= 1) { - buildSchemaFor1DimensionalDataset(dataSet); - } else if (dimensions.length == 2) { - buildSchemaFor2DimensionalDataset(dataSet); - } else { - // Case for datasets of greater than 2D - // These are automatically flattened - buildSchemaFor2DimensionalDataset(dataSet); + { // Opens an HDF5 file + InputStream in = null; + try { + /* + * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from + * a byte array or byte buffer. This implementation is better in that it does not require creating + * a temporary file which must be deleted later. However, it could result in memory issues in the + * event of large files. + */ + in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath()); + hdfFile = HdfFile.fromInputStream(in); + reader = new BufferedReader(new InputStreamReader(in)); Review Comment: Good catch. Done. > Convert HDF5 format to EVF2 > --------------------------- > > Key: DRILL-8188 > URL: https://issues.apache.org/jira/browse/DRILL-8188 > Project: Apache Drill > Issue Type: Improvement > Affects Versions: 1.20.0 > Reporter: Cong Luo > Assignee: Cong Luo > Priority: Major > > Use EVF V2 instead of the old V1. > Also fixes a few bugs in the V2 framework. -- This message was sent by Atlassian Jira (v8.20.1#820001)