[
https://issues.apache.org/jira/browse/DRILL-8188?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17523084#comment-17523084
]
ASF GitHub Bot commented on DRILL-8188:
---------------------------------------
luocooong commented on code in PR #2515:
URL: https://github.com/apache/drill/pull/2515#discussion_r851623209
##########
contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java:
##########
@@ -171,107 +168,109 @@ public HDF5ReaderConfig(HDF5FormatPlugin plugin,
HDF5FormatConfig formatConfig)
}
}
- public HDF5BatchReader(HDF5ReaderConfig readerConfig, int maxRecords) {
- this.readerConfig = readerConfig;
- this.maxRecords = maxRecords;
+ public HDF5BatchReader(HDF5ReaderConfig config, EasySubScan scan,
FileSchemaNegotiator negotiator) {
+ errorContext = negotiator.parentErrorContext();
+ file = negotiator.file();
+ readerConfig = config;
dataWriters = new ArrayList<>();
- this.showMetadataPreview = readerConfig.formatConfig.showPreview();
- }
+ showMetadataPreview = readerConfig.formatConfig.showPreview();
- @Override
- public boolean open(FileSchemaNegotiator negotiator) {
- split = negotiator.split();
- errorContext = negotiator.parentErrorContext();
// Since the HDF file reader uses a stream to actually read the file, the
file name from the
// module is incorrect.
- fileName = split.getPath().getName();
- try {
- openFile(negotiator);
- } catch (IOException e) {
- throw UserException
- .dataReadError(e)
- .addContext("Failed to close input file: %s", split.getPath())
- .addContext(errorContext)
- .build(logger);
- }
+ fileName = file.split().getPath().getName();
- ResultSetLoader loader;
- if (readerConfig.defaultPath == null) {
- // Get file metadata
- List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new
ArrayList<>());
- metadataIterator = metadata.iterator();
-
- // Schema for Metadata query
- SchemaBuilder builder = new SchemaBuilder()
- .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
- .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
- .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
- .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
- .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
- .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
-
- negotiator.tableSchema(builder.buildSchema(), false);
-
- loader = negotiator.build();
- dimensions = new int[0];
- rowWriter = loader.writer();
-
- } else {
- // This is the case when the default path is specified. Since the user
is explicitly asking for a dataset
- // Drill can obtain the schema by getting the datatypes below and
ultimately mapping that schema to columns
- Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
- dimensions = dataSet.getDimensions();
-
- loader = negotiator.build();
- rowWriter = loader.writer();
- writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
- negotiator.parentErrorContext());
- if (dimensions.length <= 1) {
- buildSchemaFor1DimensionalDataset(dataSet);
- } else if (dimensions.length == 2) {
- buildSchemaFor2DimensionalDataset(dataSet);
- } else {
- // Case for datasets of greater than 2D
- // These are automatically flattened
- buildSchemaFor2DimensionalDataset(dataSet);
+ { // Opens an HDF5 file
+ InputStream in = null;
+ try {
+ /*
+ * As a possible future improvement, the jhdf reader has the ability
to read hdf5 files from
+ * a byte array or byte buffer. This implementation is better in that
it does not require creating
+ * a temporary file which must be deleted later. However, it could
result in memory issues in the
+ * event of large files.
+ */
+ in =
file.fileSystem().openPossiblyCompressedStream(file.split().getPath());
+ hdfFile = HdfFile.fromInputStream(in);
+ reader = new BufferedReader(new InputStreamReader(in));
Review Comment:
Good catch. Done.
> Convert HDF5 format to EVF2
> ---------------------------
>
> Key: DRILL-8188
> URL: https://issues.apache.org/jira/browse/DRILL-8188
> Project: Apache Drill
> Issue Type: Improvement
> Affects Versions: 1.20.0
> Reporter: Cong Luo
> Assignee: Cong Luo
> Priority: Major
>
> Use EVF V2 instead of the old V1.
> Also, fix a few bugs in the V2 framework.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)