[jira] [Commented] (NIFI-5231) Record stats processor

ASF GitHub Bot (JIRA) Sun, 03 Jun 2018 20:16:17 -0700


    [ 
https://issues.apache.org/jira/browse/NIFI-5231?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16499696#comment-16499696
 ]


ASF GitHub Bot commented on NIFI-5231:
--------------------------------------

Github user ijokarumawak commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/2737#discussion_r192618442
  
    --- Diff: 
nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/RecordStats.java
 ---
    @@ -0,0 +1,176 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.nifi.processors.standard;
    +
    +import org.apache.nifi.annotation.behavior.InputRequirement;
    +import org.apache.nifi.annotation.behavior.WritesAttribute;
    +import org.apache.nifi.annotation.behavior.WritesAttributes;
    +import org.apache.nifi.annotation.documentation.CapabilityDescription;
    +import org.apache.nifi.annotation.documentation.Tags;
    +import org.apache.nifi.annotation.lifecycle.OnScheduled;
    +import org.apache.nifi.components.PropertyDescriptor;
    +import org.apache.nifi.components.Validator;
    +import org.apache.nifi.flowfile.FlowFile;
    +import org.apache.nifi.processor.AbstractProcessor;
    +import org.apache.nifi.processor.ProcessContext;
    +import org.apache.nifi.processor.ProcessSession;
    +import org.apache.nifi.processor.Relationship;
    +import org.apache.nifi.processor.exception.ProcessException;
    +import org.apache.nifi.processor.util.StandardValidators;
    +import org.apache.nifi.record.path.FieldValue;
    +import org.apache.nifi.record.path.RecordPath;
    +import org.apache.nifi.record.path.RecordPathResult;
    +import org.apache.nifi.record.path.util.RecordPathCache;
    +import org.apache.nifi.serialization.RecordReader;
    +import org.apache.nifi.serialization.RecordReaderFactory;
    +import org.apache.nifi.serialization.record.Record;
    +
    +import java.io.InputStream;
    +import java.util.HashMap;
    +import java.util.HashSet;
    +import java.util.Map;
    +import java.util.Optional;
    +import java.util.Set;
    +import java.util.stream.Collectors;
    +
    +@Tags({ "record", "stats", "metrics" })
    +@CapabilityDescription("A processor that can count the number of items in 
a record set, as well as provide counts based on " +
    +        "user-defined criteria on subsets of the record set.")
    +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
    +@WritesAttributes({
    +    @WritesAttribute(attribute = RecordStats.RECORD_COUNT_ATTR, 
description = "A count of the records in the record set in the flowfile.")
    +})
    +public class RecordStats extends AbstractProcessor {
    +    static final String RECORD_COUNT_ATTR = "record_count";
    +
    +    static final PropertyDescriptor RECORD_READER = new 
PropertyDescriptor.Builder()
    +        .name("record-stats-reader")
    +        .displayName("Record Reader")
    +        .description("A record reader to use for reading the records.")
    +        .addValidator(Validator.VALID)
    +        .identifiesControllerService(RecordReaderFactory.class)
    +        .build();
    +
    +    static final Relationship REL_SUCCESS = new Relationship.Builder()
    +        .name("success")
    +        .description("If a flowfile is successfully processed, it goes 
here.")
    +        .build();
    +    static final Relationship REL_FAILURE = new Relationship.Builder()
    +        .name("failure")
    +        .description("If a flowfile fails to be processed, it goes here.")
    +        .build();
    +
    +    protected PropertyDescriptor 
getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
    +        return new PropertyDescriptor.Builder()
    +            .name(propertyDescriptorName)
    +            .displayName(propertyDescriptorName)
    +            .dynamic(true)
    +            .addValidator(StandardValidators.NON_BLANK_VALIDATOR)
    +            .build();
    +    }
    +
    +    private RecordPathCache cache;
    +
    +    @OnScheduled
    +    public void onEnabled(ProcessContext context) {
    +        cache = new RecordPathCache(25);
    +    }
    +
    +    @Override
    +    public Set<Relationship> getRelationships() {
    +        return new HashSet<Relationship>() {{
    +            add(REL_SUCCESS);
    +            add(REL_FAILURE);
    +        }};
    +    }
    +
    +    @Override
    +    public void onTrigger(ProcessContext context, ProcessSession session) 
throws ProcessException {
    +        FlowFile input = session.get();
    +        if (input == null) {
    +            return;
    +        }
    +
    +        try {
    +            Map<String, RecordPath> paths = getRecordPaths(context);
    +            Map<String, String> stats = getStats(input, paths, context, 
session);
    +
    +            input = session.putAllAttributes(input, stats);
    +
    +            session.transfer(input, REL_SUCCESS);
    +
    +        } catch (Exception ex) {
    +            getLogger().error("Error processing stats.", ex);
    +            session.transfer(input, REL_FAILURE);
    +        }
    +
    +    }
    +
    +    protected Map<String, RecordPath> getRecordPaths(ProcessContext 
context) {
    +        return context.getProperties().keySet()
    +            .stream().filter(p -> p.isDynamic() && 
!p.getName().contains(RECORD_READER.getName()))
    +            .collect(Collectors.toMap(
    +                e -> e.getName(),
    +                e ->  {
    +                    String val = context.getProperty(e).getValue();
    +                    return cache.getCompiled(val);
    +                })
    +            );
    +    }
    +
    +    protected Map<String, String> getStats(FlowFile flowFile, Map<String, 
RecordPath> paths, ProcessContext context, ProcessSession session) {
    +        try (InputStream is = session.read(flowFile)) {
    +            RecordReaderFactory factory = 
context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class);
    +            RecordReader reader = factory.createRecordReader(flowFile, is, 
getLogger());
    +
    +            Map<String, Integer> retVal = new HashMap<>();
    +            Record record;
    +
    +            int recordCount = 0;
    +            while ((record = reader.nextRecord()) != null) {
    +                for (Map.Entry<String, RecordPath> entry : 
paths.entrySet()) {
    +                    RecordPathResult result = 
entry.getValue().evaluate(record);
    +                    Optional<FieldValue> value = 
result.getSelectedFields().findFirst();
    +                    if (value.isPresent() && value.get().getValue() != 
null) {
    +                        String approxValue = 
value.get().getValue().toString();
    --- End diff --
    
    Idea for improvement. If the RecordPath result is number, providing the 
counts based on value will be useful? Rather, I'd like to see in addition to 
counts (current `baseStat`), also min, max, sum and optionally sum of squared x.


> Record stats processor
> ----------------------
>
>                 Key: NIFI-5231
>                 URL: https://issues.apache.org/jira/browse/NIFI-5231
>             Project: Apache NiFi
>          Issue Type: New Feature
>            Reporter: Mike Thomsen
>            Assignee: Mike Thomsen
>            Priority: Major
>
> Should the following:
>  
>  # Take a record reader.
>  # Count the # of records and add a record_count attribute to the flowfile.
>  # Allow user-defined properties that do the following:
>  ## Map attribute name -> record path.
>  ## Provide aggregate value counts for each record path statement.
>  ## Provide total count for record path operation.
>  ## Put those values on the flowfile as attributes.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (NIFI-5231) Record stats processor

Reply via email to