[
https://issues.apache.org/jira/browse/DRILL-7641?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17068684#comment-17068684
]
ASF GitHub Bot commented on DRILL-7641:
---------------------------------------
cgivre commented on pull request #2024: DRILL-7641: Convert Excel Reader to use
Streaming Reader
URL: https://github.com/apache/drill/pull/2024#discussion_r399254035
##########
File path:
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -134,121 +131,131 @@ public ExcelBatchReader(ExcelReaderConfig
readerConfig) {
@Override
public boolean open(FileSchemaNegotiator negotiator) {
split = negotiator.split();
- loader = negotiator.build();
+ ResultSetLoader loader = negotiator.build();
rowWriter = loader.writer();
openFile(negotiator);
defineSchema();
return true;
}
+ /**
+ * This method opens the Excel file, initializes the Streaming Excel Reader,
and initializes the sheet variable.
+ * @param negotiator The Drill file negotiator object that represents the
file system
+ */
private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
try {
fsStream =
negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
- workbook = new XSSFWorkbook(fsStream);
+
+ // Open streaming reader
+ workbook = StreamingReader.builder()
+ .rowCacheSize(ROW_CACHE_SIZE)
+ .bufferSize(BUFFER_SIZE)
+ .open(fsStream);
} catch (Exception e) {
throw UserException
.dataReadError(e)
.message("Failed to open open input file: %s",
split.getPath().toString())
- .message(e.getMessage())
+ .addContext(e.getMessage())
.build(logger);
}
-
- // Evaluate formulae
- evaluator = workbook.getCreationHelper().createFormulaEvaluator();
-
- workbook.setMissingCellPolicy(Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
sheet = getSheet();
}
/**
* This function defines the schema from the header row.
- * @return TupleMedata of the discovered schema
*/
- private TupleMetadata defineSchema() {
+ private void defineSchema() {
SchemaBuilder builder = new SchemaBuilder();
- return getColumnHeaders(builder);
+ getColumnHeaders(builder);
}
- private TupleMetadata getColumnHeaders(SchemaBuilder builder) {
+ private void getColumnHeaders(SchemaBuilder builder) {
//Get the field names
- int columnCount = 0;
+ int columnCount;
- // Case for empty sheet.
- if (sheet.getFirstRowNum() == 0 && sheet.getLastRowNum() == 0) {
- return builder.buildSchema();
+ // Case for empty sheet
+ if (sheet.getLastRowNum() == 0) {
+ builder.buildSchema();
+ return;
}
+ rowIterator = sheet.iterator();
+
// Get the number of columns.
columnCount = getColumnCount();
- excelFieldNames = new ArrayList<>(columnCount);
- cellWriterArray = new ArrayList<>(columnCount);
- rowIterator = sheet.iterator();
+ excelFieldNames = new ArrayList<>();
+ cellWriterArray = new ArrayList<>();
//If there are no headers, create columns names of field_n
if (readerConfig.headerRow == -1) {
String missingFieldName;
- for (int i = 0; i < columnCount; i++) {
+ int i = 0;
+
+ for(Cell c : currentRow) {
Review comment:
Fixed
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> Convert Excel Reader to Use Streaming Reader
> --------------------------------------------
>
> Key: DRILL-7641
> URL: https://issues.apache.org/jira/browse/DRILL-7641
> Project: Apache Drill
> Issue Type: Improvement
> Components: Storage - Text & CSV
> Affects Versions: 1.17.0
> Reporter: Charles Givre
> Assignee: Charles Givre
> Priority: Major
> Fix For: 1.18.0
>
>
> The current implementation of the Excel reader uses the Apache POI reader,
> which uses excessive amounts of memory. As a result, attempting to read large
> Excel files causes out-of-memory errors.
> This PR converts the format plugin to use a streaming reader that is still
> based on the POI library. The documentation for the streaming reader can be
> found here [1].
> All unit tests pass and I tested the plugin with some large Excel files on my
> computer.
> [1]: [https://github.com/pjfanning/excel-streaming-reader]
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)