[ https://issues.apache.org/jira/browse/DRILL-8188?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17523082#comment-17523082 ]
ASF GitHub Bot commented on DRILL-8188: --------------------------------------- luocooong commented on code in PR #2515: URL: https://github.com/apache/drill/pull/2515#discussion_r851623160 ########## contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java: ########## @@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig) } } - public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) { - this.readerConfig = readerConfig; - this.maxRecords = maxRecords; + public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan, FileSchemaNegotiator negotiator) { + errorContext = negotiator.parentErrorContext(); + file = negotiator.file(); + readerConfig = config; dataWriters = new ArrayList<>(); - this.showMetadataPreview = readerConfig.formatConfig.showPreview(); - } + showMetadataPreview = readerConfig.formatConfig.showPreview(); - @Override - public boolean open(FileSchemaNegotiator negotiator) { - split = negotiator.split(); - errorContext = negotiator.parentErrorContext(); // Since the HDF file reader uses a stream to actually read the file, the file name from the // module is incorrect. - fileName = split.getPath().getName(); - try { - openFile(negotiator); - } catch (IOException e) { - throw UserException - .dataReadError(e) - .addContext("Failed to close input file: %s", split.getPath()) - .addContext(errorContext) - .build(logger); - } + fileName = file.split().getPath().getName(); - ResultSetLoader loader; - if (readerConfig.defaultPath == null) { - // Get file metadata - List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>()); - metadataIterator = metadata.iterator(); - - // Schema for Metadata query - SchemaBuilder builder = new SchemaBuilder() - .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR) - .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT) - .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT) - .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT) - .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR) - .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR); - - negotiator.tableSchema(builder.buildSchema(), false); - - loader = negotiator.build(); - dimensions = new int[0]; - rowWriter = loader.writer(); - - } else { - // This is the case when the default path is specified. Since the user is explicitly asking for a dataset - // Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns - Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath); - dimensions = dataSet.getDimensions(); - - loader = negotiator.build(); - rowWriter = loader.writer(); - writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), - negotiator.parentErrorContext()); - if (dimensions.length <= 1) { - buildSchemaFor1DimensionalDataset(dataSet); - } else if (dimensions.length == 2) { - buildSchemaFor2DimensionalDataset(dataSet); - } else { - // Case for datasets of greater than 2D - // These are automatically flattened - buildSchemaFor2DimensionalDataset(dataSet); + { // Opens an HDF5 file + InputStream in = null; + try { + /* + * As a possible future improvement, the jhdf reader has the ability to read hdf5 files from + * a byte array or byte buffer. This implementation is better in that it does not require creating + * a temporary file which must be deleted later. However, it could result in memory issues in the + * event of large files. + */ + in = file.fileSystem().openPossiblyCompressedStream(file.split().getPath()); + hdfFile = HdfFile.fromInputStream(in); + reader = new BufferedReader(new InputStreamReader(in)); + } catch (IOException e) { + throw UserException + .dataReadError(e) + .message("Failed to open input file: %s", file.split().getPath()) + .addContext(errorContext) + .build(logger); + } finally { + AutoCloseables.closeSilently(in); } } - if (readerConfig.defaultPath == null) { - pathWriter = rowWriter.scalar(PATH_COLUMN_NAME); - dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME); - fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME); - dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME); - linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME); - elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME); - datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME); - dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME); - } - return true; - } - - /** - * This function is called when the default path is set and the data set is a single dimension. - * This function will create an array of one dataWriter of the - * correct datatype - * @param dataset The HDF5 dataset - */ - private void buildSchemaFor1DimensionalDataset(Dataset dataset) { - MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType()); - // Case for null or unknown data types: - if (currentDataType == null) { - logger.warn("Couldn't add {}", dataset.getJavaType().getName()); - return; + { // Build the schema and initial the writer + ResultSetLoader loader; + if (readerConfig.defaultPath == null) { + // Get file metadata + List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>()); + metadataIterator = metadata.iterator(); + + // Schema for Metadata query + SchemaBuilder builder = new SchemaBuilder() + .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR) + .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR) + .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR) + .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT) + .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT) + .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT) + .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR) + .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR); + + negotiator.tableSchema(builder.buildSchema(), false); + + loader = negotiator.build(); + dimensions = new int[0]; Review Comment: Good catch, thanks. > Convert HDF5 format to EVF2 > --------------------------- > > Key: DRILL-8188 > URL: https://issues.apache.org/jira/browse/DRILL-8188 > Project: Apache Drill > Issue Type: Improvement > Affects Versions: 1.20.0 > Reporter: Cong Luo > Assignee: Cong Luo > Priority: Major > > Use EVF V2 instead of old V1. > Also, fixed a few bugs in V2 framework. -- This message was sent by Atlassian Jira (v8.20.1#820001)