[ 
https://issues.apache.org/jira/browse/NIFI-1156?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15094250#comment-15094250
 ] 

ASF GitHub Bot commented on NIFI-1156:
--------------------------------------

Github user markap14 commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/124#discussion_r49480077
  
    --- Diff: 
nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
 ---
    @@ -0,0 +1,243 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nifi;
    +
    +import org.apache.commons.lang3.StringUtils;
    +import org.apache.nifi.components.PropertyDescriptor;
    +import org.apache.nifi.flowfile.FlowFile;
    +import org.apache.nifi.processor.ProcessContext;
    +import org.apache.nifi.processor.ProcessSession;
    +import org.apache.nifi.processor.Relationship;
    +import org.apache.nifi.processor.ProcessorInitializationContext;
    +import org.apache.nifi.annotation.behavior.WritesAttribute;
    +import org.apache.nifi.annotation.behavior.WritesAttributes;
    +import org.apache.nifi.annotation.documentation.CapabilityDescription;
    +import org.apache.nifi.annotation.documentation.SeeAlso;
    +import org.apache.nifi.annotation.documentation.Tags;
    +import org.apache.nifi.processor.exception.ProcessException;
    +import org.apache.nifi.processor.io.StreamCallback;
    +import org.apache.nifi.processor.util.StandardValidators;
    +import org.jsoup.nodes.Document;
    +import org.jsoup.nodes.Element;
    +import org.jsoup.select.Elements;
    +
    +import java.io.IOException;
    +import java.io.InputStream;
    +import java.io.OutputStream;
    +import java.util.ArrayList;
    +import java.util.List;
    +import java.util.Set;
    +import java.util.HashSet;
    +import java.util.Collections;
    +
    +@Tags({"get", "html", "dom", "css", "element"})
    +@CapabilityDescription("Parses HTML input using CSS selector syntax and 
creates a new flowfile containing the extracted" +
    +        " element content for each matching CSS selector.")
    +@SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class})
    +@WritesAttributes({@WritesAttribute(attribute="HTMLElement", 
description="Flowfile attribute where the element result" +
    +        " parsed from the HTML using the CSS selector syntax are placed if 
the destination is a flowfile attribute.")})
    +public class GetHTMLElement
    +        extends AbstractHTMLProcessor {
    +
    +    public static final String HTML_ELEMENT_ATTRIBUTE_NAME = "HTMLElement";
    +    public static final String DESTINATION_ATTRIBUTE = 
"flowfile-attribute";
    +    public static final String DESTINATION_CONTENT = "flowfile-content";
    +
    +    public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new 
PropertyDescriptor
    +            .Builder().name("Prepend Element value")
    +            .description("Prepends the specified value to the resulting 
Element")
    +            .required(false)
    +            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
    +            .expressionLanguageSupported(true)
    +            .build();
    +
    +    public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new 
PropertyDescriptor
    +            .Builder().name("Append Element value")
    +            .description("Appends the specified value to the resulting 
Element")
    +            .required(false)
    +            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
    +            .expressionLanguageSupported(true)
    +            .build();
    +
    +    public static final PropertyDescriptor ATTRIBUTE_KEY = new 
PropertyDescriptor
    +            .Builder().name("Attribute Name")
    +            .description(("When getting the value of an element attribute 
this value is used as the key to determine" +
    +                    " which attribute on the selected element should be 
retrieved."))
    +            .required(false)
    +            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
    +            .expressionLanguageSupported(true)
    +            .build();
    +
    +
    +    public static final PropertyDescriptor OUTPUT_TYPE = new 
PropertyDescriptor.Builder()
    +            .name("Output Type")
    +            .description("Controls the type of value that is retrieved 
from the element. " +
    +                    ELEMENT_HTML + "," + ELEMENT_TEXT + ", " + 
ELEMENT_ATTRIBUTE + " or " + ELEMENT_DATA)
    +            .required(true)
    +            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
    +            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, 
ELEMENT_ATTRIBUTE, ELEMENT_DATA)
    +            .defaultValue(ELEMENT_HTML)
    +            .build();
    +
    +    public static final PropertyDescriptor DESTINATION = new 
PropertyDescriptor.Builder()
    +            .name("Destination")
    +            .description("Control if element extracted is written as a 
flowfile attribute or " +
    +                    "as flowfile content.")
    +            .required(true)
    +            .allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
    +            .defaultValue(DESTINATION_ATTRIBUTE)
    +            .build();
    +
    +    private List<PropertyDescriptor> descriptors;
    +
    +    private Set<Relationship> relationships;
    +
    +    @Override
    +    protected void init(final ProcessorInitializationContext context) {
    +        final List<PropertyDescriptor> descriptors = new ArrayList<>();
    +        descriptors.add(URL);
    +        descriptors.add(CSS_SELECTOR);
    +        descriptors.add(HTML_CHARSET);
    +        descriptors.add(OUTPUT_TYPE);
    +        descriptors.add(DESTINATION);
    +        descriptors.add(PREPEND_ELEMENT_VALUE);
    +        descriptors.add(APPEND_ELEMENT_VALUE);
    +        descriptors.add(ATTRIBUTE_KEY);
    +        this.descriptors = Collections.unmodifiableList(descriptors);
    +
    +        final Set<Relationship> relationships = new HashSet<>();
    +        relationships.add(REL_ORIGINAL);
    +        relationships.add(REL_SUCCESS);
    +        relationships.add(REL_FAILURE);
    +        relationships.add(REL_NOT_FOUND);
    +        this.relationships = Collections.unmodifiableSet(relationships);
    +    }
    +
    +    @Override
    +    public Set<Relationship> getRelationships() {
    +        return this.relationships;
    +    }
    +
    +    @Override
    +    public final List<PropertyDescriptor> 
getSupportedPropertyDescriptors() {
    +        return descriptors;
    +    }
    +
    +    @Override
    +    public void onTrigger(final ProcessContext context, final 
ProcessSession session) throws ProcessException {
    +        final FlowFile flowFile = session.get();
    +        if ( flowFile == null ) {
    +            return;
    +        }
    +
    +        try {
    +
    +            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, 
context, session);
    +            final Elements eles = 
doc.select(context.getProperty(CSS_SELECTOR)
    +                    .evaluateAttributeExpressions().getValue());
    +            final String prependValue = 
context.getProperty(PREPEND_ELEMENT_VALUE)
    +                    .evaluateAttributeExpressions(flowFile).getValue();
    +            final String appendValue = 
context.getProperty(APPEND_ELEMENT_VALUE)
    +                    .evaluateAttributeExpressions(flowFile).getValue();
    +
    +            if (eles == null || eles.size() == 0) {
    +                //No element found
    +                session.transfer(flowFile, REL_NOT_FOUND);
    +            } else {
    +                for (final Element ele : eles) {
    +                    final FlowFile ff = session.create();
    +
    +                    switch (context.getProperty(DESTINATION).getValue()) {
    +                        case DESTINATION_ATTRIBUTE:
    +                            final FlowFile atFlowfile = 
session.putAttribute(ff, HTML_ELEMENT_ATTRIBUTE_NAME,
    +                                    extractElementValue(
    +                                            prependValue,
    +                                            
context.getProperty(OUTPUT_TYPE).getValue(),
    +                                            appendValue,
    +                                            ele,
    +                                            
context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions()
    +                                                    .getValue()));
    +                            
session.getProvenanceReporter().create(atFlowfile);
    +                            session.transfer(atFlowfile, REL_SUCCESS);
    +                            break;
    +                        case DESTINATION_CONTENT:
    +                            final FlowFile conFlowfile = session.write(ff, 
new StreamCallback() {
    +                                @Override
    +                                public void process(InputStream 
inputStream, OutputStream outputStream) throws IOException {
    +                                    try {
    +                                        
outputStream.write(extractElementValue(
    +                                                prependValue,
    +                                                
context.getProperty(OUTPUT_TYPE).getValue(),
    +                                                appendValue,
    +                                                ele,
    +                                                
context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions()
    +                                                        
.getValue()).getBytes());
    +                                    } catch (Exception ex) {
    +                                        session.transfer(ff, REL_FAILURE);
    +                                    }
    +                                }
    +                            });
    +
    +                            
session.getProvenanceReporter().create(conFlowfile);
    --- End diff --
    
    Same point as above: the provenance event here should be a FORK, not a 
CREATE, but the framework will take care of it.


> HTML Parsing Processors Bundle
> ------------------------------
>
>                 Key: NIFI-1156
>                 URL: https://issues.apache.org/jira/browse/NIFI-1156
>             Project: Apache NiFi
>          Issue Type: New Feature
>          Components: Core Framework
>            Reporter: Jeremy Dyer
>            Priority: Minor
>
> NiFi provides the ability to ingest HTML but lacks the convenience to easily 
> interact with that HTML once it has entered the flow. There should be an HTML 
> Processing Bundle that provides mechanisms for manipulating and interacting 
> with HTML data once it has entered the flow. Jsoup http://jsoup.org/ seems 
> like a logical tool to use since it is mature and has an MIT license, which 
> would allow it to be incorporated into NiFi.
> “GetHTMLElement” should use the CSS selector-syntax 
> (http://www.w3schools.com/cssref/css_selectors.asp) built into Jsoup to 
> extract 0-N HTML elements from the original HTML input. This processor should 
> support a delimited string of selectors allowing the user to build compound 
> HTML element output. Each HTML element (or compound element result) extracted 
> will create a new Flowfile where the element will be in either the Flowfile 
> content or an attribute depending on the user configuration.
> “ModifyHTMLElement” should provide the ability to modify the original input 
> HTML and overwrite any existing element values. The HTML element that will be 
> modified can be selected by using the CSS selector-syntax.
> “PutHTMLElement” should provide the ability to put a new HTML element 
> anywhere in the original input HTML using CSS selector-syntax to indicate the 
> position that the new HTML element should be placed.
> There seems to be potential for adding more processors, but this seems like 
> a good start. Since there is a dependency on Jsoup and a potential for more 
> processors to come, I think it makes sense to add this logic as its own NAR 
> bundle, but I could be wrong.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to