dan-s1 commented on code in PR #7194: URL: https://github.com/apache/nifi/pull/7194#discussion_r1198252637
########## nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/excel/ExcelReader.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi.excel; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.annotation.lifecycle.OnEnabled; +import org.apache.nifi.components.AllowableValue; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.context.PropertyContext; +import org.apache.nifi.controller.ConfigurationContext; +import org.apache.nifi.expression.ExpressionLanguageScope; +import org.apache.nifi.logging.ComponentLog; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.schema.access.SchemaAccessStrategy; +import org.apache.nifi.schema.access.SchemaNotFoundException; +import org.apache.nifi.schema.inference.InferSchemaAccessStrategy; +import org.apache.nifi.schema.inference.RecordSourceFactory; +import org.apache.nifi.schema.inference.SchemaInferenceEngine; +import org.apache.nifi.schema.inference.SchemaInferenceUtil; +import org.apache.nifi.schema.inference.TimeValueInference; +import org.apache.nifi.schemaregistry.services.SchemaRegistry; +import org.apache.nifi.serialization.DateTimeUtils; +import org.apache.nifi.serialization.MalformedRecordException; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.RecordReaderFactory; +import org.apache.nifi.serialization.SchemaRegistryService; +import org.apache.nifi.serialization.record.RecordSchema; +import org.apache.nifi.stream.io.NonCloseableInputStream; +import org.apache.poi.ss.usermodel.Row; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicReferenceArray; +import java.util.stream.IntStream; + +@Tags({"excel", "spreadsheet", "xlsx", "parse", "record", "row", "reader", "values", "cell"}) +@CapabilityDescription("Parses a Microsoft Excel document returning each 
row in each sheet as a separate record. " + + "This reader allows for inferring a schema either based on the first line of an Excel sheet if a 'header line' is " + + "present or from all the desired sheets, or providing an explicit schema " + + "for interpreting the values. See Controller Service's Usage for further documentation. " + + "This reader is currently only capable of processing .xlsx " + + "(XSSF 2007 OOXML file format) Excel documents and not older .xls (HSSF '97(-2007) file format) documents.)") +public class ExcelReader extends SchemaRegistryService implements RecordReaderFactory { + + private static final AllowableValue HEADER_DERIVED = new AllowableValue("excel-header-derived", "Use fields From Header", + "The first chosen row of the Excel sheet is a header row that contains the columns representative of all the rows " + + "in the desired sheets. The schema will be derived by using those columns in the header."); + public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor + .Builder().name("extract-sheets") + .displayName("Sheets to Extract") + .description("Comma separated list of Excel document sheet names whose rows should be extracted from the excel document. If this property" + + " is left blank then all the rows from all the sheets will be extracted from the Excel document. The list of names is case in-sensitive. Any sheets not" + + " specified in this value will be ignored. A bulletin will be generated if a specified sheet(s) are not found.") + .required(false) + .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build(); + + public static final PropertyDescriptor FIRST_ROW_NUM = new PropertyDescriptor + .Builder().name("excel-extract-first-row") + .displayName("Row number to start from") + .description("The row number of the first row to start processing (One based)." 
+ + " Use this to skip over rows of data at the top of a worksheet that are not part of the dataset.") + .required(true) + .defaultValue("0") + .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES) + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build(); + + private AtomicReferenceArray<String> desiredSheets; + private volatile int firstRow; + private volatile String dateFormat; + private volatile String timeFormat; + private volatile String timestampFormat; + + @OnEnabled + public void onEnabled(final ConfigurationContext context) { + this.firstRow = getFirstRow(context); + String[] rawDesiredSheets = getRawDesiredSheets(context); + this.desiredSheets = new AtomicReferenceArray<>(rawDesiredSheets.length); + IntStream.range(0, rawDesiredSheets.length) + .forEach(index -> this.desiredSheets.set(index, rawDesiredSheets[index])); + this.dateFormat = context.getProperty(DateTimeUtils.DATE_FORMAT).getValue(); + this.timeFormat = context.getProperty(DateTimeUtils.TIME_FORMAT).getValue(); + this.timestampFormat = context.getProperty(DateTimeUtils.TIMESTAMP_FORMAT).getValue(); + } + + private int getFirstRow(final PropertyContext context) { + int rawFirstRow = context.getProperty(FIRST_ROW_NUM).asInteger(); + return getZeroBasedIndex(rawFirstRow); + } + + static int getZeroBasedIndex(int rawFirstRow) { + return rawFirstRow > 0 ? rawFirstRow - 1 : 0; + } + private String[] getRawDesiredSheets(final PropertyContext context) { Review Comment: I will defer to what you want, but I personally order my methods in the order they are called, as that follows a logical progression (a tip I picked up years ago). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org