GerardDellemann commented on a change in pull request #234: MM-82 Detect Column Types URL: https://github.com/apache/metamodel/pull/234#discussion_r355451036
########## File path: excel/src/main/java/org/apache/metamodel/excel/DefaultSpreadsheetReaderDelegate.java ########## @@ -146,34 +157,196 @@ private MutableTable createTable(final Workbook wb, final Sheet sheet) { columnNamingSession.getNextColumnName(new ColumnNamingContextImpl(i)); } - for (int j = offset; j < row.getLastCellNum(); j++) { - final ColumnNamingContext namingContext = new ColumnNamingContextImpl(table, null, j); + for (int i = offset; i < row.getLastCellNum(); i++) { + final ColumnNamingContext namingContext = new ColumnNamingContextImpl(table, null, i); final Column column = new MutableColumn(columnNamingSession.getNextColumnName(namingContext), - ColumnType.STRING, table, j, true); + columnTypes[i], table, i, true); table.addColumn(column); } } } else { + row = iterateToColumnNameRow(rowIterator, row); + + if (row != null) { + createColumns(table, wb, row, columnTypes); + } + } + + return table; + } + + /** + * Iterate to the column name row if the configured ColumnNameLineNumber is above 1. + * @param rowIterator + * @param currentRow + * @return Returns the column name row. Returns the current row if the configured ColumnNameLineNumber is 1. + * Returns null if the columnName row is not found. + */ + private Row iterateToColumnNameRow(final Iterator<Row> rowIterator, final Row currentRow) { + Row row = currentRow; + + // iterate to the column name line number (if above 1) + for (int i = 1; i < _configuration.getColumnNameLineNumber(); i++) { + if (rowIterator.hasNext()) { + row = rowIterator.next(); + } else { + return null; + } + } + + return row; + } + + /** + * Get an array of {@link ColumnType}s. The length of the array is determined by the header row. If there's no + * configured column name line, then the first data row is used. If the {@link ColumnType} should be detected, then + * this is done by using the data rows only. If this shouldn't be detected, then the array is filled with either + * default column type when there is no column name line or legacy column type when there is a column name line. + * @param sheet + * @return + */ + private ColumnType[] getColumnTypes(final Sheet sheet) { + // To find the array length we need the header + final Iterator<Row> iterator = ExcelUtils.getRowIterator(sheet, _configuration, false); + Row row; + if (_configuration.getColumnNameLineNumber() == ExcelConfiguration.NO_COLUMN_NAME_LINE) { + row = findTheFirstNonEmptyRow(iterator); + } else { + row = iterateToColumnNameRow(iterator, iterator.next()); + } + if (row == null) { + return null; + } + + final ColumnType[] columnTypes = new ColumnType[row.getLastCellNum()]; + + if (_configuration.isDetectColumnTypes()) { + // Now we need the first data row + row = findTheFirstNonEmptyRow(iterator); + if (row != null) { + detectColumnTypes(row, iterator, columnTypes); + } + } else { + if (_configuration.getColumnNameLineNumber() == ExcelConfiguration.NO_COLUMN_NAME_LINE) { + Arrays.fill(columnTypes, DEFAULT_COLUMN_TYPE); + } else { + Arrays.fill(columnTypes, LEGACY_COLUMN_TYPE); + } + } + return columnTypes; + } - boolean hasColumns = true; + private static Row findTheFirstNonEmptyRow(final Iterator<Row> rowIterator) { + while (rowIterator.hasNext()) { + final Row row = rowIterator.next(); + if (row != null) { + return row; + } + } + return null; + } - // iterate to the column name line number (if above 1) - for (int j = 1; j < columnNameLineNumber; j++) { - if (rowIterator.hasNext()) { - row = rowIterator.next(); - } else { - hasColumns = false; - break; + private void detectColumnTypes(final Row firstRow, final Iterator<Row> dataRowIterator, final ColumnType[] columnTypes) { + detectColumnTypesFirstRow(firstRow, columnTypes); + detectColumnTypesOtherRows(dataRowIterator, columnTypes); + + // If all cells are null, then this loop sets the column type to the default + for (int i = 0; i < columnTypes.length; i++) { + if (columnTypes[i] == null) { + columnTypes[i] = DEFAULT_COLUMN_TYPE; + } + } + } + + private void detectColumnTypesFirstRow(final Row firstRow, final ColumnType[] columnTypes) { + if (firstRow != null && firstRow.getLastCellNum() > 0) { + for (int i = getColumnOffset(firstRow); i < columnTypes.length; i++) { + if (firstRow.getCell(i) != null) { + columnTypes[i] = determineColumnTypeFromCell(firstRow.getCell(i)); } } + } + } - if (hasColumns) { - createColumns(table, wb, row); + private void detectColumnTypesOtherRows(final Iterator<Row> dataRowIterator, final ColumnType[] columnTypes) { + int numberOfLinesToScan = _configuration.getNumberOfLinesToScan() - 1; + + while (dataRowIterator.hasNext() && numberOfLinesToScan-- > 0) { + final Row currentRow = dataRowIterator.next(); + if (currentRow != null && currentRow.getLastCellNum() > 0) { + for (int i = getColumnOffset(currentRow); i < columnTypes.length; i++) { + final ColumnType detectNewColumnType = detectNewColumnTypeCell(columnTypes[i], currentRow + .getCell(i)); + if (detectNewColumnType != null) { + columnTypes[i] = detectNewColumnType; + } + } } } + } - return table; + /** + * Tries to detect a new {@link ColumnType} for a cell. + * @param currentColumnType + * @param cell + * @return Returns a new {@link ColumnType} when detected. Otherwise null is returned. + */ + private static ColumnType detectNewColumnTypeCell(final ColumnType currentColumnType, final Cell cell) { + // Can't detect something new if it's already on the default. + if (currentColumnType != null && currentColumnType.equals(DEFAULT_COLUMN_TYPE)) { + return null; + } + // Skip if the cell is null. This way 1 missing cell can't influence the column type of all other cells. + if (cell == null) { + return null; + } + + final ColumnType detectedColumnType = determineColumnTypeFromCell(cell); + if (currentColumnType == null) { + return detectedColumnType; + } else if (!currentColumnType.equals(detectedColumnType)) { + // If the column type is Double and a Integer is detected, then don't set it to Integer + if (currentColumnType.equals(ColumnType.INTEGER) && detectedColumnType.equals(ColumnType.DOUBLE)) { + // If the column type is Integer and a Double is detected, then set it to Double + return detectedColumnType; + } else if (currentColumnType.equals(ColumnType.DOUBLE) && detectedColumnType.equals(ColumnType.INTEGER)) { + return null; + } else { + return DEFAULT_COLUMN_TYPE; + } + } + return null; + } + + private static ColumnType determineColumnTypeFromCell(final Cell cell) { + switch (cell.getCellType()) { + case NUMERIC: + if (DateUtil.isCellDateFormatted(cell)) { + return ColumnType.DATE; + } else { + return cell.getNumericCellValue() % 1 == 0 ? ColumnType.INTEGER : ColumnType.DOUBLE; + } + case BOOLEAN: + return ColumnType.BOOLEAN; + case FORMULA: + final FormulaEvaluator evaluator = cell + .getSheet() + .getWorkbook() + .getCreationHelper() + .createFormulaEvaluator(); Review comment: I like the ColumnTypeScanner as Inner Class. I've positioned the instantion in a different place, but I think you'll probably like this better too :). ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services