Github user bdesert commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/2478#discussion_r169749383

--- Diff: nifi-nar-bundles/nifi-hbase-bundle/nifi-hbase-processors/src/main/java/org/apache/nifi/hbase/ScanHBase.java ---
@@ -0,0 +1,562 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.hbase;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nifi.annotation.behavior.InputRequirement;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.annotation.lifecycle.OnScheduled;
+import org.apache.nifi.components.AllowableValue;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.components.ValidationContext;
+import org.apache.nifi.components.ValidationResult;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.flowfile.attributes.CoreAttributes;
+import org.apache.nifi.hbase.io.JsonFullRowSerializer;
+import org.apache.nifi.hbase.io.JsonQualifierAndValueRowSerializer;
+import org.apache.nifi.hbase.io.RowSerializer;
+import org.apache.nifi.hbase.scan.Column;
+import org.apache.nifi.hbase.scan.ResultCell;
+import org.apache.nifi.hbase.scan.ResultHandler;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.nifi.util.Tuple;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.regex.Pattern;
+
+@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
+@Tags({"hbase", "scan", "fetch", "get"})
+@CapabilityDescription("Scans and fetches rows from an HBase table. This processor may be used to fetch rows from an HBase table " +
+        "by specifying a range of rowkey values (start and/or end), by time range, by filter expression, or any combination of them.\n" +
+        "Order of records can be controlled by the property <code>Reversed</code>. " +
+        "The number of rows retrieved by the processor can be limited.")
+@WritesAttributes({
+        @WritesAttribute(attribute = "hbase.table", description = "The name of the HBase table that the row was fetched from"),
+        @WritesAttribute(attribute = "hbase.resultset", description = "A JSON document representing the row(s). This attribute is only written when a Destination of flowfile-attributes is selected."),
+        @WritesAttribute(attribute = "mime.type", description = "Set to application/json when using a Destination of flowfile-content, not set or modified otherwise"),
+        @WritesAttribute(attribute = "hbase.rows.count", description = "Number of rows in the content of the given flow file"),
+        @WritesAttribute(attribute = "scanhbase.results.found", description = "Indicates whether at least one row has been found in the given HBase table with the provided conditions. <br/>" +
+                "Could be null (not present) if transferred to FAILURE")
+})
+public class ScanHBase extends AbstractProcessor {
+    //enhanced regex for columns to allow "-" in column qualifier names
+    static final Pattern COLUMNS_PATTERN = Pattern.compile("\\w+(:(\\w|-)+)?(?:,\\w+(:(\\w|-)+)?)*");
+    static final byte[] nl = System.lineSeparator().getBytes();
+
+    static final PropertyDescriptor HBASE_CLIENT_SERVICE = new PropertyDescriptor.Builder()
+            .displayName("HBase Client Service")
+            .name("scanhbase-client-service")
+            .description("Specifies the Controller Service to use for accessing HBase.")
+            .required(true)
+            .identifiesControllerService(HBaseClientService.class)
+            .build();
+
+    static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
+            .displayName("Table Name")
+            .name("scanhbase-table-name")
+            .description("The name of the HBase Table to fetch from.")
+            .required(true)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor START_ROW = new PropertyDescriptor.Builder()
+            .displayName("Start rowkey")
+            .name("scanhbase-start-rowkey")
+            .description("The row key to start the scan from.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor END_ROW = new PropertyDescriptor.Builder()
+            .displayName("End rowkey")
+            .name("scanhbase-end-rowkey")
+            .description("The row key to end the scan at.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor TIME_RANGE_MIN = new PropertyDescriptor.Builder()
+            .displayName("Time range min")
+            .name("scanhbase-time-range-min")
+            .description("Time range min value. Both min and max values for time range should be either blank or provided.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.LONG_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor TIME_RANGE_MAX = new PropertyDescriptor.Builder()
+            .displayName("Time range max")
+            .name("scanhbase-time-range-max")
+            .description("Time range max value. Both min and max values for time range should be either blank or provided.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.LONG_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor LIMIT_ROWS = new PropertyDescriptor.Builder()
+            .displayName("Limit rows")
+            .name("scanhbase-limit")
+            .description("Limits the number of rows retrieved by the scan.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.INTEGER_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor BULK_SIZE = new PropertyDescriptor.Builder()
+            .displayName("Max rows per flow file")
+            .name("scanhbase-bulk-size")
+            .description("Limits the number of rows in a single flow file's content. Set to 0 to avoid multiple flow files.")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .defaultValue("0")
+            .addValidator(StandardValidators.INTEGER_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor REVERSED_SCAN = new PropertyDescriptor.Builder()
+            .displayName("Reversed order")
+            .name("scanhbase-reversed-order")
+            .description("Set whether this scan is a reversed one. This is false by default, which means a forward (normal) scan.")
+            .expressionLanguageSupported(false)
+            .allowableValues("true", "false")
+            .required(false)
+            .defaultValue("false")
+            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor FILTER_EXPRESSION = new PropertyDescriptor.Builder()
+            .displayName("Filter expression")
+            .name("scanhbase-filter-expression")
+            .description("An HBase filter expression that will be applied to the scan. This property cannot be used when also using the Columns property.<br/>" +
+                    "Example: \"ValueFilter( =, 'binaryprefix:commit' )\"")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor COLUMNS = new PropertyDescriptor.Builder()
+            .displayName("Columns")
+            .name("scanhbase-columns")
+            .description("An optional comma-separated list of \"<colFamily>:<colQualifier>\" pairs to fetch. To return all columns " +
+                    "for a given family, leave off the qualifier such as \"<colFamily1>,<colFamily2>\".")
+            .required(false)
+            .expressionLanguageSupported(true)
+            .addValidator(StandardValidators.createRegexMatchingValidator(COLUMNS_PATTERN))
+            .build();
+
+    static final AllowableValue JSON_FORMAT_FULL_ROW = new AllowableValue("full-row", "full-row",
+            "Creates a JSON document with the format: {\"row\":<row-id>, \"cells\":[{\"fam\":<col-fam>, \"qual\":<col-val>, \"val\":<value>, \"ts\":<timestamp>}]}.");
+    static final AllowableValue JSON_FORMAT_QUALIFIER_AND_VALUE = new AllowableValue("col-qual-and-val", "col-qual-and-val",
+            "Creates a JSON document with the format: {\"<col-qual>\":\"<value>\", \"<col-qual>\":\"<value>\"}.");
+
+    static final PropertyDescriptor JSON_FORMAT = new PropertyDescriptor.Builder()
+            .displayName("JSON Format")
+            .name("scanhbase-json-format")
+            .description("Specifies how to represent the HBase row as a JSON document.")
+            .required(true)
+            .allowableValues(JSON_FORMAT_FULL_ROW, JSON_FORMAT_QUALIFIER_AND_VALUE)
+            .defaultValue(JSON_FORMAT_FULL_ROW.getValue())
+            .build();
+
+    static final PropertyDescriptor DECODE_CHARSET = new PropertyDescriptor.Builder()
+            .displayName("Decode Character Set")
+            .name("scanhbase-decode-charset")
+            .description("The character set used to decode data from HBase.")
+            .required(true)
+            .defaultValue("UTF-8")
+            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor ENCODE_CHARSET = new PropertyDescriptor.Builder()
+            .displayName("Encode Character Set")
+            .name("scanhbase-encode-charset")
+            .description("The character set used to encode the JSON representation of the row.")
+            .required(true)
+            .defaultValue("UTF-8")
+            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
+            .build();
+
+    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
+            .name("original")
+            .description("The original input file will be routed to this destination, even if no rows are retrieved based on the provided conditions.")
+            .build();
+
+    static final Relationship REL_SUCCESS = new Relationship.Builder()
+            .name("success")
+            .description("All successful fetches are routed to this relationship.")
+            .build();
+    static final Relationship REL_FAILURE = new Relationship.Builder()
+            .name("failure")
+            .description("All failed fetches are routed to this relationship.")
+            .build();
+
+    static final String HBASE_TABLE_ATTR = "hbase.table";
+    static final String HBASE_ROWS_COUNT_ATTR = "hbase.rows.count";
+
+    static final List<PropertyDescriptor> properties;
+    static {
+        List<PropertyDescriptor> props = new ArrayList<>();
+        props.add(HBASE_CLIENT_SERVICE);
+        props.add(TABLE_NAME);
+        props.add(START_ROW);
+        props.add(END_ROW);
+        props.add(TIME_RANGE_MIN);
+        props.add(TIME_RANGE_MAX);
+        props.add(LIMIT_ROWS);
+        props.add(REVERSED_SCAN);
+        props.add(BULK_SIZE);
+        props.add(FILTER_EXPRESSION);
+        props.add(COLUMNS);
+        props.add(JSON_FORMAT);
+        props.add(ENCODE_CHARSET);
+        props.add(DECODE_CHARSET);
+        properties = Collections.unmodifiableList(props);
+    }
+
+    static final Set<Relationship> relationships;
+    static {
+        Set<Relationship> rels = new HashSet<>();
+        rels.add(REL_SUCCESS);
+        rels.add(REL_ORIGINAL);
+        rels.add(REL_FAILURE);
+        relationships = Collections.unmodifiableSet(rels);
+    }
+
+    private volatile Charset decodeCharset;
+    private volatile Charset encodeCharset;
+    private RowSerializer serializer = null;
+
+    @OnScheduled
+    public void onScheduled(ProcessContext context) {
+        this.decodeCharset = Charset.forName(context.getProperty(DECODE_CHARSET).getValue());
+        this.encodeCharset = Charset.forName(context.getProperty(ENCODE_CHARSET).getValue());
+
+        final String jsonFormat = context.getProperty(JSON_FORMAT).getValue();
+        if (jsonFormat.equals(JSON_FORMAT_FULL_ROW.getValue())) {
+            this.serializer = new JsonFullRowSerializer(decodeCharset, encodeCharset);
+        } else {
+            this.serializer = new JsonQualifierAndValueRowSerializer(decodeCharset, encodeCharset);
+        }
+    }
+
+    @Override
+    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
+        return properties;
+    }
+
+    @Override
+    public Set<Relationship> getRelationships() {
+        return relationships;
+    }
+
+    @Override
+    protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
+
+        final List<ValidationResult> problems = new ArrayList<>();
+
+        final String columns = validationContext.getProperty(COLUMNS).getValue();
+        final String filter = validationContext.getProperty(FILTER_EXPRESSION).getValue();
+
+        if (!StringUtils.isBlank(columns) && !StringUtils.isBlank(filter)) {
+            problems.add(new ValidationResult.Builder()
+                    .subject(FILTER_EXPRESSION.getDisplayName())
+                    .input(filter).valid(false)
+                    .explanation("A filter expression cannot be used in conjunction with the Columns property")
+                    .build());
+        }
+
+        return problems;
+    }
+
+    @Override
+    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
+        FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        try {
+            final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
+            if (StringUtils.isBlank(tableName)) {
+                getLogger().error("Table Name is blank or null for {}, transferring to failure", new Object[] {flowFile});
+                session.transfer(session.penalize(flowFile), REL_FAILURE);
+                return;
+            }
+
+            final String startRow = context.getProperty(START_ROW).evaluateAttributeExpressions(flowFile).getValue();
+            final String endRow = context.getProperty(END_ROW).evaluateAttributeExpressions(flowFile).getValue();
+
+            final String filterExpression = context.getProperty(FILTER_EXPRESSION).evaluateAttributeExpressions(flowFile).getValue();
+
+            //evaluate and validate time range min and max values. They both should be either empty or provided.
+            Long timerangeMin = null;
+            Long timerangeMax = null;
+
+            try {
+                timerangeMin = context.getProperty(TIME_RANGE_MIN).evaluateAttributeExpressions(flowFile).asLong();
+            } catch (Exception e) {
+                getLogger().error("Time range min value is not a number ({}) for {}, transferring to failure",
+                        new Object[] {context.getProperty(TIME_RANGE_MIN).evaluateAttributeExpressions(flowFile).getValue(), flowFile});
+                session.transfer(session.penalize(flowFile), REL_FAILURE);
+                return;
+            }
+            try {
+                timerangeMax = context.getProperty(TIME_RANGE_MAX).evaluateAttributeExpressions(flowFile).asLong();
+            } catch (Exception e) {
+                getLogger().error("Time range max value is not a number ({}) for {}, transferring to failure",
+                        new Object[] {context.getProperty(TIME_RANGE_MAX).evaluateAttributeExpressions(flowFile).getValue(), flowFile});
+                session.transfer(session.penalize(flowFile), REL_FAILURE);
+                return;
+            }
+            if (timerangeMin == null && timerangeMax != null) {
+                getLogger().error("Time range min value cannot be blank when max value provided for {}, transferring to failure", new Object[] {flowFile});
+                session.transfer(session.penalize(flowFile), REL_FAILURE);
+                return;
+            } else if (timerangeMin != null && timerangeMax == null) {
+                getLogger().error("Time range max value cannot be blank when min value provided for {}, transferring to failure", new Object[] {flowFile});
+                session.transfer(session.penalize(flowFile), REL_FAILURE);
+                return;
+            }
+
+            final Integer limitRows = context.getProperty(LIMIT_ROWS).evaluateAttributeExpressions(flowFile).asInteger();
+
+            final Boolean isReversed = context.getProperty(REVERSED_SCAN).asBoolean();
+
+            final Integer bulkSize = context.getProperty(BULK_SIZE).evaluateAttributeExpressions(flowFile).asInteger();
+
+            final List<Column> columns = getColumns(context.getProperty(COLUMNS).evaluateAttributeExpressions(flowFile).getValue());
+            final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
+
+            final AtomicReference<Long> rowsPulledHolder = new AtomicReference<>(0L);
+            final AtomicReference<Long> ffCountHolder = new AtomicReference<>(0L);
+            ScanHBaseResultHandler handler = new ScanHBaseResultHandler(context, session, flowFile, rowsPulledHolder, ffCountHolder, hBaseClientService, tableName, bulkSize);
+
+            try {
+                hBaseClientService.scan(tableName,
+                        startRow, endRow,
+                        filterExpression,
+                        timerangeMin, timerangeMax,
+                        limitRows,
+                        isReversed,
+                        columns,
+                        handler);
+            } catch (IOException e) {
+                getLogger().error("Unable to fetch rows from HBase table {} due to {}", new Object[] {tableName, e});
+                flowFile = session.putAttribute(flowFile, "scanhbase.results.found", Boolean.toString(handler.isHandledAny()));
+                session.transfer(flowFile, REL_FAILURE);
+                return;
+            }
+
+            LinkedList<Tuple<byte[], ResultCell[]>> hangingRows = handler.getHangingRows();
+            if (!handler.isHandledAny() || // no rows found in hbase
+                    (handler.isHandledAny() && (hangingRows == null || hangingRows.isEmpty())) // all the rows are flushed to FF inside handlers
+                    ) {
+                flowFile = session.putAttribute(flowFile, "scanhbase.results.found", Boolean.toString(handler.isHandledAny()));
+                session.transfer(flowFile, REL_ORIGINAL);
+                session.commit();
+                return;
+            }
+
+            if (hangingRows != null && !hangingRows.isEmpty()) {
+                FlowFile lastFF = session.create(flowFile);
+                final Map<String, String> attributes = new HashMap<>();
+                attributes.put(HBASE_TABLE_ATTR, tableName);
+                attributes.put(HBASE_ROWS_COUNT_ATTR, Long.toString(rowsPulledHolder.get()));
+                attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
+                attributes.put(HBASE_ROWS_COUNT_ATTR, Long.toString(hangingRows.size()));
+                lastFF = session.putAllAttributes(lastFF, attributes);
+
+                final AtomicReference<IOException> ioe = new AtomicReference<>(null);
+                lastFF = session.write(lastFF, (out) -> {
+                    for (Iterator<Tuple<byte[], ResultCell[]>> iter = hangingRows.iterator(); iter.hasNext();) {
+                        Tuple<byte[], ResultCell[]> r = iter.next();
+                        serializer.serialize(r.getKey(), r.getValue(), out);
+                        if (iter.hasNext()) {
+                            out.write(nl);
+                        }
+                    }
+                });
+
+                Relationship rel = REL_SUCCESS;
+                IOException error = ioe.get();
+                if (error != null) {
+                    lastFF = session.putAttribute(lastFF, "scanhbase.error", error.toString());
+                    rel = REL_FAILURE;
+                }
+                session.transfer(lastFF, rel);
+                flowFile = session.putAttribute(flowFile, "scanhbase.results.found", Boolean.toString(handler.isHandledAny()));
+                session.transfer(flowFile, REL_ORIGINAL);
+            }
+            session.commit();
+
+        } catch (final Exception e) {
+            getLogger().error("Failed to receive data from HBase due to {}", e);
+            session.rollback();
+            // if we failed, we want to yield so that we don't hammer hbase.
+            context.yield();
+        }
+    }
+
+    /**
+     * @param columnsValue a String in the form colFam:colQual,colFam:colQual
+     * @return a list of Columns based on parsing the given String
+     */
+    private List<Column> getColumns(final String columnsValue) {
+        final String[] columns = (columnsValue == null || columnsValue.isEmpty() ? new String[0] : columnsValue.split(","));
+
+        List<Column> columnsList = new ArrayList<>(columns.length);
+
+        for (final String column : columns) {
+            if (column.contains(":")) {
+                final String[] parts = column.split(":");
+                final byte[] cf = parts[0].getBytes(StandardCharsets.UTF_8);
--- End diff --

The charset for data presentation is configurable; it is defined in onScheduled when the serializer is created. The charset for the HBase row key and for column family/qualifier names is intentionally UTF-8, to be consistent with GetHBase and FetchHBase. I would prefer to open a new JIRA to handle the charset used for row keys and column families/qualifiers throughout the nifi-hbase package.
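As a sketch of what that follow-up could look like (the property name "scanhbase-key-charset" and the KEY_CHARSET field are hypothetical, mirroring the DECODE_CHARSET/ENCODE_CHARSET descriptors above; not part of this PR):

```java
// Hypothetical sketch for the proposed follow-up JIRA -- names are assumptions,
// not part of this PR. It reuses the same builder/validator pattern as the
// existing charset properties in ScanHBase.
static final PropertyDescriptor KEY_CHARSET = new PropertyDescriptor.Builder()
        .displayName("Key Character Set")
        .name("scanhbase-key-charset")
        .description("The character set used to encode row keys and column family/qualifier names sent to HBase.")
        .required(true)
        .defaultValue("UTF-8") // default preserves current behavior, consistent with GetHBase and FetchHBase
        .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
        .build();

// getColumns(...) would then resolve the configured charset once and use it in
// place of the hard-coded StandardCharsets.UTF_8, e.g.:
//     final byte[] cf = parts[0].getBytes(keyCharset);
```

Defaulting to UTF-8 would keep such a property backward compatible with today's hard-coded behavior.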
---